diff --git a/Cargo.lock b/Cargo.lock index 20963b7f..5636dd56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,34 +2,17 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" - [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "aes" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - [[package]] name = "ahash" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "5a824f2aa7e75a0c98c5a504fceb80649e9c35265d44525b5f94de4771a395cd" dependencies = [ "getrandom", "once_cell", @@ -38,9 +21,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.5" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -68,9 +51,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anyhow" @@ -111,12 +94,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - [[package]] name = "bgzip" version = "0.2.2" @@ -127,6 +104,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bindgen" version = "0.65.1" @@ -156,9 +139,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "bitvec" @@ -172,20 +155,11 @@ dependencies = [ "wyz", ] -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "bstr" -version = "1.6.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", "regex-automata", @@ -204,9 +178,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytecheck" @@ -232,21 +206,27 @@ dependencies = [ [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" [[package]] name = "bytemuck" -version = "1.13.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea" +checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "bzip2" @@ -269,6 +249,15 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + [[package]] name = "cc" version = "1.0.83" @@ -296,34 +285,23 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.28" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ed24df0632f708f5f6d8082675bef2596f7084dee3dd55f632290bf35bfe0f" +checksum = "41daef31d7a747c5c847246f36de49ced6f7403b4cdabc807a97b5cc184cda7a" dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", - "time 0.1.45", "wasm-bindgen", - "windows-targets 0.48.5", -] - -[[package]] -name = "cipher" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", + "windows-targets 0.52.0", ] [[package]] name = "clang-sys" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" +checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" dependencies = [ "glob", "libc", @@ -339,17 +317,11 @@ dependencies = [ "csv", ] -[[package]] -name = "constant_time_eq" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" - [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "counter" @@ -360,15 +332,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "cpufeatures" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" -dependencies = [ - "libc", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -380,46 +343,28 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "csv" @@ -442,29 +387,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "deranged" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2696e8a945f658fd14dc3b87242e6b80cd0f36ff04ea560fa39082368847946" - [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - [[package]] name = "doc-comment" version = "0.3.3" @@ -477,6 +405,18 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -502,9 +442,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fixedbitset" @@ -514,9 +454,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", @@ -531,57 +471,22 @@ dependencies = [ "num-traits", ] -[[package]] -name = "flume" -version = "0.10.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" -dependencies = [ - "futures-core", - "futures-sink", - "nanorand", - "pin-project", - "spin", -] - [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" -[[package]] -name = "futures-core" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" - -[[package]] -name = "futures-sink" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] @@ -620,23 +525,17 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" [[package]] name = "histogram" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" - -[[package]] -name = "hmac" -version = "0.12.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +checksum = "e5ee9487899388cf1a1155759c39e3c156c5d198b6da1734053954a6e40e6d4d" dependencies = [ - "digest", + "thiserror", ] [[package]] @@ -647,16 +546,16 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "iana-time-zone" -version = "0.1.57" +version = "0.1.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows-core", ] [[package]] @@ -675,45 +574,54 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" [[package]] -name = "inout" -version = "0.1.3" +name = "inplace-vec-builder" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" dependencies = [ - "generic-array", + "smallvec", ] [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", "rustix", - "windows-sys 0.48.0", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", ] [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" dependencies = [ "wasm-bindgen", ] @@ -732,18 +640,18 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.151" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "libloading" -version = "0.7.4" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" dependencies = [ "cfg-if", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -764,9 +672,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.12" +version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +checksum = "295c17e837573c8c821dbaeb3cceb3d745ad082f7572191409e69cbc1b3fd050" dependencies = [ "cc", "pkg-config", @@ -775,15 +683,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -824,15 +732,15 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.6.2" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5486aed0026218e61b8a01d5fbd5a0a134649abb71a0e53b7bc088529dced86e" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" dependencies = [ "libc", ] @@ -867,15 +775,6 @@ version = "0.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2983372caf4480544083767bf2d27defafe32af49ab4df3a0b7fc90793a3664" -[[package]] -name = "nanorand" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" -dependencies = [ - "getrandom", -] - [[package]] name = "needletail" version = "0.5.1" @@ -902,7 +801,7 @@ dependencies = [ "flate2", "thiserror", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -950,49 +849,42 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "ouroboros" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1358bd1558bd2a083fed428ffeda486fbfb323e698cdda7794259d592ca72db" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", + "static_assertions", ] [[package]] name = "ouroboros_macro" -version = "0.15.6" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7d21ccd03305a674437ee1248f3ab5d4b1db095cf1caf49f1713ddf61956b7" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ - "Inflector", - "proc-macro-error", + "heck", + "itertools", "proc-macro2", + "proc-macro2-diagnostics", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] @@ -1007,86 +899,44 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.3.5", + "redox_syscall", "smallvec", "windows-targets 0.48.5", ] -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - -[[package]] -name = "pbkdf2" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] - [[package]] name = "peeking_take_while" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pin-project" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.48", -] - [[package]] name = "piz" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58c75d1c00e6d407e283cc66d9d4fd0985ef1703c761520845b93c4f981bfb65" +checksum = "898b071c1938a2c92b95c18708cbf38f2566a01f0ab9dd7bdf4329987e5c2e17" dependencies = [ + "camino", "chrono", "codepage-437", "crc32fast", "flate2", "log", + "memchr", "thiserror", - "twoway", ] [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" [[package]] name = "ppv-lite86" @@ -1126,9 +976,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.12" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.48", @@ -1169,13 +1019,26 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "version_check", + "yansi", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -1323,15 +1186,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -1343,9 +1197,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.4" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1355,9 +1209,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.7" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49530408a136e16e5b486e883fbb6ba058e8e4e8ae6621a77b048b314336e629" +checksum = "3b7fa1134405e2ec9353fd416b17f8dacd46c473d7d3fd1cf202706a14eb792a" dependencies = [ "aho-corasick", "memchr", @@ -1366,33 +1220,34 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.5" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rend" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +checksum = "a2571463863a6bd50c32f94402933f03457a3fbaf697a707c5be741e459f08fd" dependencies = [ "bytecheck", ] [[package]] name = "retain_mut" -version = "0.1.9" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4389f1d5789befaf6029ebd9f7dac4af7f7e3d61b69d4f30e2ac02b57e7712b0" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" [[package]] name = "rkyv" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0200c8230b013893c0b2d6213d6ec64ed2b9be2e0e016682b7224ff82cff5c58" +checksum = "527a97cdfef66f65998b5f3b637c26f5a5ec09cc52a3f9932313ac645f4190f5" dependencies = [ "bitvec", "bytecheck", + "bytes", "hashbrown", "ptr_meta", "rend", @@ -1404,9 +1259,9 @@ dependencies = [ [[package]] name = "rkyv_derive" -version = "0.7.42" +version = "0.7.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2e06b915b5c230a17d7a736d1e2e63ee753c256a8614ef3f5147b13a4f5541d" +checksum = "b5c462a1328c8e67e4d6dbad1eb0355dd43e8ab432c6e227a43657f16ade5033" dependencies = [ "proc-macro2", "quote", @@ -1415,9 +1270,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.9.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd539cab4e32019956fe7e0cf160bb6d4802f4be2b52c4253d76d3bb0f85a5f7" +checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" dependencies = [ "bytemuck", "byteorder", @@ -1442,11 +1297,11 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", @@ -1455,9 +1310,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "safemem" @@ -1508,33 +1363,11 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simdutf8" @@ -1548,23 +1381,11 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8542b68b8800c3cda649d2c72d688b6907b30f1580043135d61669d4aad1c175" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "sorted-iter" @@ -1575,15 +1396,18 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.12.0" -source = "git+https://github.com/sourmash-bio/sourmash?rev=ff1092f8f366339caa59d7203f623813228f4356#ff1092f8f366339caa59d7203f623813228f4356" +source = "git+https://github.com/sourmash-bio/sourmash?rev=409aeb415ba8b04b9c09f203817d67791afa96da#409aeb415ba8b04b9c09f203817d67791afa96da" dependencies = [ "az", - "bytecount", "byteorder", + "camino", "cfg-if", + "chrono", "counter", + "csv", + "enum_dispatch", "fixedbitset", - "flume", + "getrandom", "getset", "histogram", "log", @@ -1593,7 +1417,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1604,7 +1427,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "thiserror", "twox-hash", "typed-builder", @@ -1620,6 +1442,7 @@ dependencies = [ "anyhow", "assert_cmd", "assert_matches", + "camino", "csv", "env_logger", "log", @@ -1636,27 +1459,12 @@ dependencies = [ "zip", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" - [[package]] name = "syn" version = "1.0.109" @@ -1687,9 +1495,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.12.11" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d0e916b1148c8e263850e1ebcbd046f333e0683c724876bb0da63ea4373dc8a" +checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" [[package]] name = "tempfile" @@ -1699,16 +1507,16 @@ checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.4.1", + "redox_syscall", "rustix", "windows-sys 0.52.0", ] [[package]] name = "termcolor" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" dependencies = [ "winapi-util", ] @@ -1721,52 +1529,24 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.47" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.47" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", "syn 2.0.48", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" -dependencies = [ - "deranged", - "serde", - "time-core", -] - -[[package]] -name = "time-core" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" - [[package]] name = "tinyvec" version = "1.6.0" @@ -1782,16 +1562,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "twoway" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c57ffb460d7c24cd6eda43694110189030a3d1dfe418416d9468fd1c1d290b47" -dependencies = [ - "memchr", - "unchecked-index", -] - [[package]] name = "twox-hash" version = "1.6.3" @@ -1805,32 +1575,29 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.10.0" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" +checksum = "444d8748011b93cb168770e8092458cb0f8854f931ff82fdf6ddfbd72a9c933e" dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", + "typed-builder-macro", ] [[package]] -name = "typenum" -version = "1.16.0" +name = "typed-builder-macro" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - -[[package]] -name = "unchecked-index" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeba86d422ce181a719445e51872fa30f1f7413b62becb52e95ec91aa262d85c" +checksum = "563b3b88238ec95680aef36bdece66896eaa7ce3c0f1b4f39d38fb2435261352" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unindent" @@ -1840,9 +1607,9 @@ checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "uuid" -version = "1.4.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79daa5ed5740825c40b389c5e50312b9c86df53fccd33f281df655642b43869d" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" [[package]] name = "vcpkg" @@ -1852,10 +1619,13 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1877,12 +1647,6 @@ dependencies = [ "libc", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1891,9 +1655,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" dependencies = [ "cfg-if", "serde", @@ -1903,9 +1667,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" dependencies = [ "bumpalo", "log", @@ -1918,9 +1682,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1928,9 +1692,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", @@ -1941,15 +1705,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies = [ "js-sys", "wasm-bindgen", @@ -1973,9 +1737,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] @@ -1987,12 +1751,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows" -version = "0.48.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -2145,33 +1909,22 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + [[package]] name = "zip" version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" dependencies = [ - "aes", "byteorder", - "bzip2", - "constant_time_eq", "crc32fast", "crossbeam-utils", "flate2", - "hmac", - "pbkdf2", - "sha1", - "time 0.3.28", - "zstd 0.11.2+zstd.1.5.2", -] - -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", ] [[package]] @@ -2180,17 +1933,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" dependencies = [ - "zstd-safe 6.0.6", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] @@ -2205,11 +1948,10 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index ca57454b..60f43354 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,17 +12,19 @@ crate-type = ["cdylib"] pyo3 = { version = "0.20.2", features = ["extension-module", "anyhow"] } rayon = "1.8.1" serde = { version = "1.0.196", features = ["derive"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash", "rev" = "ff1092f8f366339caa59d7203f623813228f4356" } +sourmash = { git = "https://github.com/sourmash-bio/sourmash", rev= "409aeb415ba8b04b9c09f203817d67791afa96da", features = ["branchwater"] } +#sourmash = { version = "0.12.1", features = ["branchwater"] } serde_json = "1.0.113" niffler = "2.4.0" log = "0.4.14" env_logger = "0.10.2" simple-error = "0.3.0" anyhow = "1.0.79" -zip = "0.6" +zip = { version = "0.6", default-features = false, features = ["deflate"] } tempfile = "3.9" needletail = "0.5.1" csv = "1.3.0" +camino = "1.1.6" [dev-dependencies] assert_cmd = "2.0.13" diff --git a/src/check.rs b/src/check.rs index 3b6484ee..2995284b 100644 --- a/src/check.rs +++ b/src/check.rs @@ -1,19 +1,14 @@ -use std::path::Path; - use crate::utils::is_revindex_database; -use sourmash::index::revindex::RevIndex; +use sourmash::index::revindex::{RevIndex, RevIndexOps}; -pub fn check>(index: P, quick: bool) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { - bail!( - "'{}' is not a valid RevIndex database", - index.as_ref().display() - ); +pub fn check(index: camino::Utf8PathBuf, quick: bool) -> Result<(), Box> { + if !is_revindex_database(&index) { + bail!("'{}' is not a valid RevIndex database", index); } println!("Opening DB"); - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index, true)?; println!("Starting check"); db.check(quick); diff --git a/src/fastgather.rs b/src/fastgather.rs index 963a6232..349ed974 100644 --- a/src/fastgather.rs +++ b/src/fastgather.rs @@ -1,49 +1,52 @@ /// fastgather: Run gather with a query against a list of files. use anyhow::Result; - -use sourmash::signature::Signature; -use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::prelude::Select; +use sourmash::selection::Selection; use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, load_sketches_above_threshold, - prepare_query, write_prefetch, ReportType, + consume_query_by_gather, load_collection, load_sketches_above_threshold, write_prefetch, + ReportType, }; -pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( - query_filename: P, - matchlist_filename: P, +pub fn fastgather( + query_filepath: String, + against_filepath: String, threshold_bp: usize, - ksize: u8, scaled: usize, - template: Sketch, - gather_output: Option

, - prefetch_output: Option

, + selection: &Selection, + gather_output: Option, + prefetch_output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { - let location = query_filename.to_string(); - eprintln!("Loading query from '{}'", location); - let query = { - let sigs = Signature::from_path(query_filename)?; + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; - prepare_query(&sigs, &template, &location) - }; - // did we find anything matching the desired template? - let query = match query { - Some(query) => query, - None => bail!("No sketch found with scaled={}, k={}", scaled, ksize), + if query_collection.len() != 1 { + bail!( + "Fastgather requires a single query sketch. Check input: '{:?}'", + &query_filepath + ) + } + // get single query sig and minhash + let query_sig = query_collection.sig_for_dataset(0)?; // need this for original md5sum + let query_sig_ds = query_sig.clone().select(selection)?; // downsample + let query_mh = match query_sig_ds.minhash() { + Some(query_mh) => query_mh, + None => { + bail!("No query sketch matching selection parameters."); + } }; - - // build the list of paths to match against. - eprintln!( - "Loading matchlist from '{}'", - matchlist_filename.as_ref().display() - ); - - let matchlist_filename = matchlist_filename.as_ref().to_string_lossy().to_string(); - let (matchlist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(matchlist_filename, &template, ReportType::Against)?; - - eprintln!("Loaded {} sig paths in matchlist", matchlist_paths.len()); + // load collection to match against. + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // calculate the minimum number of hashes based on desired threshold let threshold_hashes: u64 = { @@ -62,16 +65,10 @@ pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( ); // load a set of sketches, filtering for those with overlaps > threshold - let result = load_sketches_above_threshold( - matchlist_paths, - &template, - &query.minhash, - threshold_hashes, - )?; + let result = load_sketches_above_threshold(against_collection, query_mh, threshold_hashes)?; let matchlist = result.0; let skipped_paths = result.1; let failed_paths = result.2; - if skipped_paths > 0 { eprintln!( "WARNING: skipped {} search paths - no compatible signatures.", @@ -91,10 +88,10 @@ pub fn fastgather + std::fmt::Debug + std::fmt::Display + Clone>( } if prefetch_output.is_some() { - write_prefetch(&query, prefetch_output, &matchlist).ok(); + write_prefetch(&query_sig, prefetch_output, &matchlist).ok(); } // run the gather! - consume_query_by_gather(query, matchlist, threshold_hashes, gather_output).ok(); + consume_query_by_gather(query_sig, matchlist, threshold_hashes, gather_output).ok(); Ok(()) } diff --git a/src/fastmultigather.rs b/src/fastmultigather.rs index 915b6370..1ed14f10 100644 --- a/src/fastmultigather.rs +++ b/src/fastmultigather.rs @@ -2,32 +2,35 @@ use anyhow::Result; use rayon::prelude::*; -use sourmash::signature::Signature; -use sourmash::sketch::Sketch; -use std::path::Path; +use sourmash::selection::Selection; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use std::collections::BinaryHeap; +use camino::Utf8Path as PathBuf; + use crate::utils::{ - consume_query_by_gather, load_sigpaths_from_zip_or_pathlist, - load_sketches_from_zip_or_pathlist, prepare_query, write_prefetch, PrefetchResult, ReportType, + consume_query_by_gather, load_collection, load_sketches, write_prefetch, PrefetchResult, + ReportType, }; -pub fn fastmultigather + std::fmt::Debug + Clone>( - query_filenames: P, - matchlist_filename: P, +pub fn fastmultigather( + query_filepath: String, + against_filepath: String, threshold_bp: usize, scaled: usize, - template: Sketch, + selection: &Selection, + allow_failed_sigpaths: bool, ) -> Result<()> { - // load the list of query paths - let queryfile_name = query_filenames.as_ref().to_string_lossy().to_string(); - let (querylist_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&query_filenames, &template, ReportType::Query)?; - println!("Loaded {} sig paths in querylist", querylist_paths.len()); + // load query collection + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; let threshold_hashes: u64 = { let x = threshold_bp / scaled; @@ -42,80 +45,82 @@ pub fn fastmultigather + std::fmt::Debug + Clone>( println!("threshold overlap: {} {}", threshold_hashes, threshold_bp); - // Load all the against sketches - let sketchlist = - load_sketches_from_zip_or_pathlist(&matchlist_filename, &template, ReportType::Against)?; + // load against collection + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; + // load against sketches into memory, downsampling on the way + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // Iterate over all queries => do prefetch and gather! let processed_queries = AtomicUsize::new(0); let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - querylist_paths.par_iter().for_each(|q| { - // increment counter of # of queries + query_collection.par_iter().for_each(|(_idx, record)| { + // increment counter of # of queries. q: could we instead use the _idx from par_iter(), or will it vary based on thread? let _i = processed_queries.fetch_add(1, atomic::Ordering::SeqCst); - - // set query_label to the last path element. - let location = q.clone().into_os_string().into_string().unwrap(); - let location = location.split('/').last().unwrap().to_string(); - - let query = match Signature::from_path(dbg!(q)) { - Ok(sigs) => { - let mm = prepare_query(&sigs, &template, &location); - - if mm.is_none() { - if !queryfile_name.ends_with(".zip") { - eprintln!("WARNING: no compatible sketches in path '{}'", q.display()); + // Load query sig (downsampling happens here) + match query_collection.sig_from_record(record) { + Ok(query_sig) => { + let prefix = query_sig.name(); + let location = PathBuf::new(&prefix).file_name().unwrap(); + if let Some(query_mh) = query_sig.minhash() { + let matchlist: BinaryHeap = against + .iter() + .filter_map(|against| { + let mut mm: Option = None; + if let Ok(overlap) = against.minhash.count_common(query_mh, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against.name.clone(), + md5sum: against.md5sum.clone(), + minhash: against.minhash.clone(), + overlap, + }; + mm = Some(result); + } + } + mm + }) + .collect(); + if !matchlist.is_empty() { + let prefetch_output = format!("{}.prefetch.csv", location); + let gather_output = format!("{}.gather.csv", location); + + // Save initial list of matches to prefetch output + write_prefetch(&query_sig, Some(prefetch_output), &matchlist).ok(); + + // Now, do the gather! + consume_query_by_gather( + query_sig.clone(), + matchlist, + threshold_hashes, + Some(gather_output), + ) + .ok(); + } else { + println!("No matches to '{}'", location); } + } else { + // different warning here? Could not load sig from record?? + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm } - Err(err) => { - eprintln!("Sketch loading error: {}", err); + Err(_) => { + // different warning here? Could not load sig from record?? eprintln!( - "WARNING: could not load sketches from path '{}'", - q.display() + "WARNING: no compatible sketches in path '{}'", + record.internal_location() ); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } - }; - - if let Some(query) = query { - // filter first set of matches out of sketchlist - let matchlist: BinaryHeap = sketchlist - .par_iter() - .filter_map(|sm| { - let mut mm = None; - - if let Ok(overlap) = sm.minhash.count_common(&query.minhash, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name.clone(), - md5sum: sm.md5sum.clone(), - minhash: sm.minhash.clone(), - overlap, - }; - mm = Some(result); - } - } - mm - }) - .collect(); - - if !matchlist.is_empty() { - let prefetch_output = format!("{location}.prefetch.csv"); - let gather_output = format!("{location}.gather.csv"); - - // save initial list of matches to prefetch output - write_prefetch(&query, Some(prefetch_output), &matchlist).ok(); - - // now, do the gather! - consume_query_by_gather(query, matchlist, threshold_hashes, Some(gather_output)) - .ok(); - } else { - println!("No matches to '{}'", location); + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } }); diff --git a/src/index.rs b/src/index.rs index bee725cd..3747e6f5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,31 +1,30 @@ use sourmash::index::revindex::RevIndex; -use sourmash::sketch::Sketch; +use sourmash::prelude::*; use std::path::Path; -use crate::utils::{load_sigpaths_from_zip_or_pathlist, ReportType}; +use crate::utils::{load_collection, ReportType}; pub fn index>( - siglist: P, - template: Sketch, + siglist: String, + selection: &Selection, output: P, - save_paths: bool, colors: bool, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { println!("Loading siglist"); - let (index_sigs, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&siglist, &template, ReportType::Index)?; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; - // if index_sigs pathlist is empty, bail - if index_sigs.is_empty() { - bail!("No signatures to index loaded, exiting."); - } - - // Create or open the RevIndex database with the provided output path and colors flag - let db = RevIndex::create(output.as_ref(), colors); - - // Index the signatures using the loaded template, threshold, and save_paths option - db.index(index_sigs, &template, 0.0, save_paths); + RevIndex::create( + output.as_ref(), + collection.select(selection)?.try_into()?, + colors, + )?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 8a47137b..16df3ae4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; extern crate simple_error; mod utils; -use crate::utils::build_template; +use crate::utils::build_selection; use crate::utils::is_revindex_database; mod check; mod fastgather; @@ -18,6 +18,8 @@ mod mastiff_manysearch; mod multisearch; mod pairwise; +use camino::Utf8PathBuf as PathBuf; + #[pyfunction] fn do_manysearch( querylist_path: String, @@ -28,15 +30,20 @@ fn do_manysearch( moltype: String, output_path: Option, ) -> anyhow::Result { + let againstfile_path: PathBuf = siglist_path.clone().into(); + let selection = build_selection(ksize, scaled, &moltype); + eprintln!("selection scaled: {:?}", selection.scaled()); + let allow_failed_sigpaths = true; + // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch - let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { + if is_revindex_database(&againstfile_path) { match mastiff_manysearch::mastiff_manysearch( querylist_path, - siglist_path, - template, + againstfile_path, + &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -48,9 +55,10 @@ fn do_manysearch( match manysearch::manysearch( querylist_path, siglist_path, - template, + &selection, threshold, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -72,16 +80,18 @@ fn do_fastgather( output_path_prefetch: Option, output_path_gather: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); + let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + match fastgather::fastgather( query_filename, siglist_path, threshold_bp, - ksize, scaled, - template, + &selection, output_path_prefetch, output_path_gather, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -101,15 +111,19 @@ fn do_fastmultigather( moltype: String, output_path: Option, ) -> anyhow::Result { + let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into(); + let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + // if a siglist path is a revindex, run mastiff_manygather. If not, run multigather - let template = build_template(ksize, scaled, &moltype); - if is_revindex_database(siglist_path.as_ref()) { + if is_revindex_database(&againstfile_path) { match mastiff_manygather::mastiff_manygather( query_filenames, - siglist_path, - template, + againstfile_path, + &selection, threshold_bp, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -123,7 +137,8 @@ fn do_fastmultigather( siglist_path, threshold_bp, scaled, - template, + &selection, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -158,12 +173,11 @@ fn do_index( scaled: usize, moltype: String, output: String, - save_paths: bool, colors: bool, ) -> anyhow::Result { - // build template from ksize, scaled - let template = build_template(ksize, scaled, &moltype); - match index::index(siglist, template, output, save_paths, colors) { + let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = false; + match index::index(siglist, &selection, output, colors, allow_failed_sigpaths) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -174,7 +188,8 @@ fn do_index( #[pyfunction] fn do_check(index: String, quick: bool) -> anyhow::Result { - match check::check(index, quick) { + let idx: PathBuf = index.into(); + match check::check(idx, quick) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); @@ -193,13 +208,16 @@ fn do_multisearch( moltype: String, output_path: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); + let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + match multisearch::multisearch( querylist_path, siglist_path, threshold, - template, + &selection, output_path, + allow_failed_sigpaths, ) { Ok(_) => Ok(0), Err(e) => { @@ -218,8 +236,15 @@ fn do_pairwise( moltype: String, output_path: Option, ) -> anyhow::Result { - let template = build_template(ksize, scaled, &moltype); - match pairwise::pairwise(siglist_path, threshold, template, output_path) { + let selection = build_selection(ksize, scaled, &moltype); + let allow_failed_sigpaths = true; + match pairwise::pairwise( + siglist_path, + threshold, + &selection, + output_path, + allow_failed_sigpaths, + ) { Ok(_) => Ok(0), Err(e) => { eprintln!("Error: {e}"); diff --git a/src/manysearch.rs b/src/manysearch.rs index a95f8d69..d7ff7808 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -5,51 +5,44 @@ /// database once. use anyhow::Result; use rayon::prelude::*; - -use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::Sketch; -use std::path::Path; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{ - csvwriter_thread, load_sigpaths_from_zip_or_pathlist, load_sketches_from_zip_or_pathlist, - prepare_query, ReportType, SearchResult, -}; +use crate::utils::{csvwriter_thread, load_collection, load_sketches, ReportType, SearchResult}; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; -pub fn manysearch>( - querylist: P, - siglist: P, - template: Sketch, +pub fn manysearch( + query_filepath: String, + against_filepath: String, + selection: &Selection, threshold: f64, - output: Option

, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<()> { - // Read in list of query paths. - eprintln!( - "Reading list of queries from: '{}'", - querylist.as_ref().display() - ); - - // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(querylist, &template, ReportType::Query)?; - - // Load all _paths_, not signatures, into memory. - let siglist_name = siglist.as_ref().to_string_lossy().to_string(); - let (search_sigs_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(siglist, &template, ReportType::Against)?; - - if search_sigs_paths.is_empty() { - bail!("No signatures to search loaded, exiting."); - } - - eprintln!("Loaded {} sig paths to search.", search_sigs_paths.len()); + // Load query collection + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; + // load all query sketches into memory, downsampling on the way + let query_sketchlist = load_sketches(query_collection, selection, ReportType::Query).unwrap(); + + // Against: Load all _paths_, not signatures, into memory. + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -61,9 +54,9 @@ pub fn manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = search_sigs_paths + let send = against_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); @@ -71,18 +64,15 @@ pub fn manysearch>( let mut results = vec![]; - // load search signature from path: - match Signature::from_path(filename) { - Ok(search_sigs) => { - let location = filename.display().to_string(); - if let Some(search_sm) = prepare_query(&search_sigs, &template, &location) { - // search for matches & save containment. - for q in queries.iter() { + // against downsampling happens here + match against_collection.sig_from_record(record) { + Ok(against_sig) => { + if let Some(against_mh) = against_sig.minhash() { + for query in query_sketchlist.iter() { let overlap = - q.minhash.count_common(&search_sm.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = search_sm.minhash.size() as f64; - + query.minhash.count_common(against_mh, false).unwrap() as f64; + let query_size = query.minhash.size() as f64; + let target_size = against_mh.size() as f64; let containment_query_in_target = overlap / query_size; let containment_in_target = overlap / target_size; let max_containment = @@ -91,40 +81,36 @@ pub fn manysearch>( if containment_query_in_target > threshold { results.push(SearchResult { - query_name: q.name.clone(), - query_md5: q.md5sum.clone(), - match_name: search_sm.name.clone(), + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against_sig.name(), containment: containment_query_in_target, intersect_hashes: overlap as usize, - match_md5: Some(search_sm.md5sum.clone()), + match_md5: Some(against_sig.md5sum()), jaccard: Some(jaccard), max_containment: Some(max_containment), }); } } } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). - if !siglist_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } + eprintln!( + "WARNING: no compatible sketches in path '{}'", + record.internal_location() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - Some(results) } Err(err) => { - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); eprintln!("Sketch loading error: {}", err); eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() + "WARNING: no compatible sketches in path '{}'", + record.internal_location() ); - None + let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } } + + Some(results) }) .flatten() .try_for_each_with(send, |s, m| s.send(m)); diff --git a/src/manysketch.rs b/src/manysketch.rs index 67ff25ae..a4eefc7a 100644 --- a/src/manysketch.rs +++ b/src/manysketch.rs @@ -3,10 +3,10 @@ use anyhow::{anyhow, Result}; use rayon::prelude::*; use crate::utils::{load_fasta_fromfile, sigwriter, Params, ZipMessage}; +use camino::Utf8Path as Path; use needletail::parse_fastx_file; use sourmash::cmd::ComputeParameters; use sourmash::signature::Signature; -use std::path::Path; use std::sync::atomic; use std::sync::atomic::AtomicUsize; @@ -117,7 +117,7 @@ fn build_siginfo( let sig = Signature::builder() .hash_function("0.murmur64") .name(Some(name.to_string())) - .filename(Some(filename.to_string_lossy().into_owned())) + .filename(Some(filename.to_string())) .signatures(template) .build(); sigs.push(sig); @@ -128,12 +128,12 @@ fn build_siginfo( (sigs, params_vec) } -pub fn manysketch + Sync>( - filelist: P, +pub fn manysketch( + filelist: String, param_str: String, output: String, ) -> Result<(), Box> { - let fileinfo = match load_fasta_fromfile(&filelist) { + let fileinfo = match load_fasta_fromfile(filelist) { Ok(result) => result, Err(e) => bail!("Could not load fromfile csv. Underlying error: {}", e), }; @@ -158,7 +158,7 @@ pub fn manysketch + Sync>( let send = std::sync::Arc::new(send); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = sigwriter::<&str>(recv, output); + let thrd = sigwriter(recv, output); // parse param string into params_vec, print error if fail let param_result = parse_params_str(param_str); @@ -206,7 +206,7 @@ pub fn manysketch + Sync>( let mut reader = match parse_fastx_file(filename) { Ok(r) => r, Err(err) => { - eprintln!("Error opening file {}: {:?}", filename.display(), err); + eprintln!("Error opening file {}: {:?}", filename, err); let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); return None; } diff --git a/src/mastiff_manygather.rs b/src/mastiff_manygather.rs index 48a23053..cb794735 100644 --- a/src/mastiff_manygather.rs +++ b/src/mastiff_manygather.rs @@ -1,69 +1,44 @@ /// mastiff_manygather: mastiff-indexed version of fastmultigather. use anyhow::Result; +use camino::Utf8PathBuf as PathBuf; use rayon::prelude::*; - -use sourmash::signature::Signature; -use sourmash::sketch::Sketch; -use std::path::Path; - -use sourmash::index::revindex::RevIndex; - +use sourmash::index::revindex::{RevIndex, RevIndexOps}; +use sourmash::prelude::*; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::fs::File; -use std::io::{BufWriter, Write}; - use crate::utils::{ - is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, ReportType, + csvwriter_thread, is_revindex_database, load_collection, BranchwaterGatherResult, ReportType, }; -pub fn mastiff_manygather>( - queries_file: P, - index: P, - template: Sketch, +pub fn mastiff_manygather( + queries_file: String, + index: PathBuf, + selection: &Selection, threshold_bp: usize, - output: Option

, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { - bail!( - "'{}' is not a valid RevIndex database", - index.as_ref().display() - ); + if !is_revindex_database(&index) { + bail!("'{}' is not a valid RevIndex database", index); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index, true)?; println!("Loaded DB"); - // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; + let query_collection = load_collection( + &queries_file, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!( - &mut writer, - "query_name,query_md5,match_name,match_md5,f_match_query,intersect_bp" - ) - .unwrap(); - for (query, query_md5, m, m_md5, f_match_query, intersect_bp) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{}", - query, query_md5, m, m_md5, f_match_query, intersect_bp - ) - .ok(); - } - }); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -75,61 +50,51 @@ pub fn mastiff_manygather>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send = query_paths + let send = query_collection .par_iter() - .filter_map(|filename| { - let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); - if i % 1000 == 0 { - eprintln!("Processed {} search sigs", i); - } + .filter_map(|(_idx, record)| { + let threshold = threshold_bp / selection.scaled()? as usize; - let mut results = vec![]; - - // load query signature from path: - match Signature::from_path(filename) { + // query downsampling happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { - let location = filename.display().to_string(); - if let Some(query) = prepare_query(&query_sig, &template, &location) { - // let query_size = query.minhash.size() as f64; - let threshold = threshold_bp / query.minhash.scaled() as usize; - - // mastiff gather code + let mut results = vec![]; + if let Some(query_mh) = query_sig.minhash() { + // Gather! let (counter, query_colors, hash_to_color) = - db.prepare_gather_counters(&query.minhash); + db.prepare_gather_counters(query_mh); let matches = db.gather( counter, query_colors, hash_to_color, threshold, - &query.minhash, - &template, + query_mh, + Some(selection.clone()), ); - - // extract matches from Result + // extract results TODO: ADD REST OF GATHER COLUMNS if let Ok(matches) = matches { for match_ in &matches { - results.push(( - query.name.clone(), - query.md5sum.clone(), - match_.name().clone(), - match_.md5().clone(), - match_.f_match(), // f_match_query - match_.intersect_bp(), - )); // intersect_bp + results.push(BranchwaterGatherResult { + query_name: query_sig.name().clone(), + query_md5: query_sig.md5sum().clone(), + match_name: match_.name().clone(), + match_md5: match_.md5().clone(), + f_match_query: match_.f_match(), + intersect_bp: match_.intersect_bp(), + }); } } else { eprintln!("Error gathering matches: {:?}", matches.err()); } } else { - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + if results.is_empty() { None } else { @@ -137,12 +102,8 @@ pub fn mastiff_manygather>( } } Err(err) => { + eprintln!("Error loading sketch: {}", err); let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - eprintln!("Sketch loading error: {}", err); - eprintln!( - "WARNING: could not load sketches from path '{}'", - filename.display() - ); None } } @@ -174,7 +135,7 @@ pub fn mastiff_manygather>( } if failed_paths > 0 { eprintln!( - "WARNING: {} signature paths failed to load. See error messages above.", + "WARNING: {} query paths failed to load. See error messages above.", failed_paths ); } diff --git a/src/mastiff_manysearch.rs b/src/mastiff_manysearch.rs index 4681a8ef..0b7c163d 100644 --- a/src/mastiff_manysearch.rs +++ b/src/mastiff_manysearch.rs @@ -1,53 +1,45 @@ /// mastiff_manysearch: mastiff-indexed version of manysearch. use anyhow::Result; +use camino::Utf8PathBuf as PathBuf; use rayon::prelude::*; - -use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::Sketch; -use std::path::Path; - -use sourmash::index::revindex::RevIndex; - +use sourmash::index::revindex::{RevIndex, RevIndexOps}; +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; use std::sync::atomic; use std::sync::atomic::AtomicUsize; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_sigpaths_from_zip_or_pathlist, prepare_query, - ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, }; -pub fn mastiff_manysearch>( - queries_file: P, - index: P, - template: Sketch, +pub fn mastiff_manysearch( + queries_path: String, + index: PathBuf, + selection: &Selection, minimum_containment: f64, - output: Option

, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { - if !is_revindex_database(index.as_ref()) { - bail!( - "'{}' is not a valid RevIndex database", - index.as_ref().display() - ); + if !is_revindex_database(&index) { + bail!("'{}' is not a valid RevIndex database", index); } // Open database once - let db = RevIndex::open(index.as_ref(), true); + let db = RevIndex::open(index, true)?; println!("Loaded DB"); // Load query paths - let queryfile_name = queries_file.as_ref().to_string_lossy().to_string(); - let (query_paths, _temp_dir) = - load_sigpaths_from_zip_or_pathlist(&queries_file, &template, ReportType::Query)?; - - // if query_paths is empty, exit with error - if query_paths.is_empty() { - bail!("No query signatures loaded, exiting."); - } + let query_collection = load_collection( + &queries_path, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; // set up a multi-producer, single-consumer channel. let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output - let thrd = csvwriter_thread(recv, output.as_ref()); + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -59,34 +51,31 @@ pub fn mastiff_manysearch>( let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let send_result = query_paths + let send_result = query_collection .par_iter() - .filter_map(|filename| { + .filter_map(|(_idx, record)| { let i = processed_sigs.fetch_add(1, atomic::Ordering::SeqCst); if i % 1000 == 0 { eprintln!("Processed {} search sigs", i); } let mut results = vec![]; - - // load query signature from path: - match Signature::from_path(filename) { + // query downsample happens here + match query_collection.sig_from_record(record) { Ok(query_sig) => { - let location = filename.display().to_string(); - if let Some(query) = prepare_query(&query_sig, &template, &location) { - let query_size = query.minhash.size() as f64; - // search mastiff db - let counter = db.counter_for_query(&query.minhash); + if let Some(query_mh) = query_sig.minhash() { + let query_size = query_mh.size(); + let counter = db.counter_for_query(query_mh); let matches = db.matches_from_counter(counter, minimum_containment as usize); // filter the matches for containment for (path, overlap) in matches { - let containment = overlap as f64 / query_size; + let containment = overlap as f64 / query_size as f64; if containment >= minimum_containment { results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), + query_name: query_sig.name(), + query_md5: query_sig.md5sum(), match_name: path.clone(), containment, intersect_hashes: overlap, @@ -97,14 +86,10 @@ pub fn mastiff_manysearch>( } } } else { - // for reading zips, this is likely not a useful warning and - // would show up too often (every sig is stored as individual file). - if !queryfile_name.ends_with(".zip") { - eprintln!( - "WARNING: no compatible sketches in path '{}'", - filename.display() - ); - } + eprintln!( + "WARNING: no compatible sketches in path '{}'", + query_sig.filename() + ); let _ = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } if results.is_empty() { @@ -118,7 +103,7 @@ pub fn mastiff_manysearch>( eprintln!("Sketch loading error: {}", err); eprintln!( "WARNING: could not load sketches from path '{}'", - filename.display() + record.internal_location() ); None } @@ -163,7 +148,5 @@ pub fn mastiff_manysearch>( ); } - // _temp_dir goes out of scope => is deleted. - Ok(()) } diff --git a/src/multisearch.rs b/src/multisearch.rs index 73fe9437..c4f33843 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -1,57 +1,53 @@ -use anyhow::Result; /// multisearch: massively parallel in-memory sketch search. +use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::path::Path; - +use sourmash::selection::Selection; +use sourmash::signature::SigsTrait; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; - -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; +use crate::utils::{ + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, +}; /// Search many queries against a list of signatures. /// /// Note: this function loads all _queries_ into memory, and iterates over /// database once. -pub fn multisearch>( - querylist: P, - againstlist: P, +pub fn multisearch( + query_filepath: String, + against_filepath: String, threshold: f64, - template: Sketch, - output: Option

, + selection: &Selection, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all queries into memory at once. - let queries = load_sketches_from_zip_or_pathlist(&querylist, &template, ReportType::Query)?; + + let query_collection = load_collection( + &query_filepath, + selection, + ReportType::Query, + allow_failed_sigpaths, + )?; + let queries = load_sketches(query_collection, selection, ReportType::Query).unwrap(); // Load all against sketches into memory at once. - let against = load_sketches_from_zip_or_pathlist(&againstlist, &template, ReportType::Against)?; + let against_collection = load_collection( + &against_filepath, + selection, + ReportType::Against, + allow_failed_sigpaths, + )?; + let against = load_sketches(against_collection, selection, ReportType::Against).unwrap(); // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all search signature paths, @@ -63,19 +59,19 @@ pub fn multisearch>( let send = against .par_iter() - .filter_map(|target| { + .filter_map(|against| { let mut results = vec![]; - // search for matches & save containment. - for q in queries.iter() { + for query in queries.iter() { let i = processed_cmp.fetch_add(1, atomic::Ordering::SeqCst); if i % 100000 == 0 { eprintln!("Processed {} comparisons", i); } - let overlap = q.minhash.count_common(&target.minhash, false).unwrap() as f64; - let query_size = q.minhash.size() as f64; - let target_size = target.minhash.size() as f64; + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; + // use downsampled sizes + let query_size = query.minhash.size() as f64; + let target_size = against.minhash.size() as f64; let containment_query_in_target = overlap / query_size; let containment_in_target = overlap / target_size; @@ -83,16 +79,16 @@ pub fn multisearch>( let jaccard = overlap / (target_size + query_size - overlap); if containment_query_in_target > threshold { - results.push(( - q.name.clone(), - q.md5sum.clone(), - target.name.clone(), - target.md5sum.clone(), - containment_query_in_target, + results.push(MultiSearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), + containment: containment_query_in_target, max_containment, jaccard, - overlap, - )) + intersect_hashes: overlap, + }) } } if results.is_empty() { diff --git a/src/pairwise.rs b/src/pairwise.rs index 6e7fe7c4..aca9f797 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -1,52 +1,48 @@ -use anyhow::Result; /// pairwise: massively parallel in-memory pairwise comparisons. +use anyhow::Result; use rayon::prelude::*; - -use std::fs::File; -use std::io::{BufWriter, Write}; -use std::path::Path; - use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use crate::utils::{ + csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType, +}; +use sourmash::selection::Selection; use sourmash::signature::SigsTrait; -use sourmash::sketch::Sketch; - -use crate::utils::{load_sketches_from_zip_or_pathlist, ReportType}; /// Perform pairwise comparisons of all signatures in a list. /// /// Note: this function loads all _signatures_ into memory. -pub fn pairwise>( - siglist: P, +pub fn pairwise( + siglist: String, threshold: f64, - template: Sketch, - output: Option

, + selection: &Selection, + output: Option, + allow_failed_sigpaths: bool, ) -> Result<(), Box> { // Load all sigs into memory at once. - let sigs = load_sketches_from_zip_or_pathlist(&siglist, &template, ReportType::Query)?; + let collection = load_collection( + &siglist, + selection, + ReportType::General, + allow_failed_sigpaths, + )?; + + if collection.len() <= 1 { + bail!( + "Pairwise requires two or more sketches. Check input: '{:?}'", + &siglist + ) + } + let sketches = load_sketches(collection, selection, ReportType::General).unwrap(); // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel(rayon::current_num_threads()); - - // & spawn a thread that is dedicated to printing to a buffered output - let out: Box = match output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let thrd = std::thread::spawn(move || { - let mut writer = BufWriter::new(out); - writeln!(&mut writer, "query_name,query_md5,match_name,match_md5,containment,max_containment,jaccard,intersect_hashes").unwrap(); - for (query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap) in recv.into_iter() { - writeln!( - &mut writer, - "\"{}\",{},\"{}\",{},{},{},{},{}", - query, query_md5, m, m_md5, cont, max_cont, jaccard, overlap - ) - .ok(); - } - }); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + + // // & spawn a thread that is dedicated to printing to a buffered output + let thrd = csvwriter_thread(recv, output); // // Main loop: iterate (in parallel) over all signature, @@ -54,11 +50,11 @@ pub fn pairwise>( let processed_cmp = AtomicUsize::new(0); - sigs.par_iter().enumerate().for_each(|(i, q1)| { - for q2 in &sigs[(i + 1)..] { - let overlap = q1.minhash.count_common(&q2.minhash, false).unwrap() as f64; - let query1_size = q1.minhash.size() as f64; - let query2_size = q2.minhash.size() as f64; + sketches.par_iter().enumerate().for_each(|(idx, query)| { + for against in sketches.iter().skip(idx + 1) { + let overlap = query.minhash.count_common(&against.minhash, false).unwrap() as f64; + let query1_size = query.minhash.size() as f64; + let query2_size = against.minhash.size() as f64; let containment_q1_in_q2 = overlap / query1_size; let containment_q2_in_q1 = overlap / query2_size; @@ -66,16 +62,16 @@ pub fn pairwise>( let jaccard = overlap / (query1_size + query2_size - overlap); if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { - send.send(( - q1.name.clone(), - q1.md5sum.clone(), - q2.name.clone(), - q2.md5sum.clone(), - containment_q1_in_q2, + send.send(MultiSearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against.name.clone(), + match_md5: against.md5sum.clone(), + containment: containment_q1_in_q2, max_containment, jaccard, - overlap, - )) + intersect_hashes: overlap, + }) .unwrap(); } diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 6aff91b3..def6fec7 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -189,8 +189,6 @@ def __init__(self, p): help='scaled factor at which to do comparisons') p.add_argument('-m', '--moltype', default='DNA', choices = ["DNA", "protein", "dayhoff", "hp"], help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)') - p.add_argument('--save-paths', action='store_true', - help='save paths to signatures into index. Default: save full sig into index') p.add_argument('-c', '--cores', default=0, type=int, help='number of cores to use (default is all available)') @@ -208,7 +206,6 @@ def main(self, args): args.scaled, args.moltype, args.output, - args.save_paths, False) # colors - currently must be false? if status == 0: notify(f"...index is done! results in '{args.output}'") diff --git a/src/python/tests/test_gather.py b/src/python/tests/test_gather.py index 2b59ea2b..2e975613 100644 --- a/src/python/tests/test_gather.py +++ b/src/python/tests/test_gather.py @@ -120,7 +120,8 @@ def test_missing_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err + @pytest.mark.parametrize('zip_against', [False, True]) def test_bad_query(runtmp, capfd, zip_against): @@ -132,9 +133,9 @@ def test_bad_query(runtmp, capfd, zip_against): sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') - # since 'query' needs to be a sig, this breaks it. - make_file_list(query, [sig2]) - + # query doesn't need to be a sig anymore - sig, zip, or pathlist welcome + # as long as there's only one sketch that matches params + make_file_list(query, [sig2,sig47]) # [sig2] make_file_list(against_list, [sig2, sig47, sig63]) if zip_against: @@ -151,7 +152,7 @@ def test_bad_query(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: expected value at line 1' in captured.err + assert 'Error: Fastgather requires a single query sketch. Check input:' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -179,11 +180,11 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. +def test_sig_against(runtmp, capfd): + # sig file is ok as against file now query = get_test_data('SRR606249.sig.gz') sig2 = get_test_data('2.fa.sig.gz') @@ -191,18 +192,23 @@ def test_bad_against(runtmp, capfd): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastgather', query, sig2, + runtmp.sourmash('scripts', 'fastgather', query, sig2, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + assert os.path.exists(g_output) + + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} -def test_bad_against_2(runtmp, capfd): + +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a bad filename. query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -225,7 +231,7 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_bad_against_3(runtmp, capfd): +def test_bad_against_2(runtmp, capfd): # test bad 'against' file - in this case, one containing an empty file query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') @@ -253,7 +259,7 @@ def test_bad_against_3(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_bad_against_4(runtmp, capfd): +def test_bad_against_3(runtmp, capfd): # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') @@ -275,7 +281,7 @@ def test_bad_against_4(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -298,19 +304,44 @@ def test_against_multisigfile(runtmp, zip_against): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') + runtmp.sourmash('scripts', 'fastgather', query, combined, + '-o', g_output, '--output-prefetch', p_output, + '-s', '100000') + df = pandas.read_csv(g_output) + assert len(df) == 3 + print(df) + + +def test_against_multisigfile_in_pathlist(runtmp): + # test against a sigfile that contains multiple sketches + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') + + sig2 = get_test_data('2.fa.sig.gz') + sig47 = get_test_data('47.fa.sig.gz') + sig63 = get_test_data('63.fa.sig.gz') + + combined = runtmp.output('combined.sig.gz') + runtmp.sourmash('sig', 'cat', sig2, sig47, sig63, '-o', combined) + make_file_list(against_list, [combined]) + + g_output = runtmp.output('gather.csv') + p_output = runtmp.output('prefetch.csv') + runtmp.sourmash('scripts', 'fastgather', query, against_list, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') df = pandas.read_csv(g_output) - if zip_against: - assert len(df) == 3 - else: - assert len(df) == 1 + print(df) + assert len(df) == 3 # @CTB this is a bug :(. It should load multiple sketches properly! + # @NTP: see pathlist loading in load_collection. When we build + # records from a signature, all records from the same signature + # are read in, but end up having the same name/md5sum @pytest.mark.parametrize('zip_against', [False, True]) -def test_query_multisigfile(runtmp, zip_against): +def test_query_multisigfile(runtmp, capfd, zip_against): # test with a sigfile that contains multiple sketches against_list = runtmp.output('against.txt') @@ -329,12 +360,14 @@ def test_query_multisigfile(runtmp, zip_against): g_output = runtmp.output('gather.csv') p_output = runtmp.output('prefetch.csv') - runtmp.sourmash('scripts', 'fastgather', combined, against_list, + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'fastgather', combined, against_list, '-o', g_output, '--output-prefetch', p_output, '-s', '100000') - # @CTB this should fail, not succeed :(. - df = pandas.read_csv(g_output) - assert len(df) == 1 + # this fails now :) + captured = capfd.readouterr() + print(captured.err) + assert "Error: Fastgather requires a single query sketch. Check input:" in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -549,7 +582,7 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -576,7 +609,7 @@ def test_simple_dayhoff(runtmp): # test basic protein execution sigs = get_test_data('dayhoff.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) @@ -603,7 +636,7 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - query = runtmp.output('query.sig') + query = runtmp.output('query.zip') against = runtmp.output('against.zip') # extract query from zip file runtmp.sourmash('sig', 'extract', sigs, '--name', 'GCA_001593935', '-o', query) diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index 2968b356..432d7630 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -89,22 +89,22 @@ def test_index_missing_siglist(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_index_bad_siglist(runtmp, capfd): - # test index with a bad siglist (.sig.gz file instead of pathlist) +def test_index_sig(runtmp, capfd): + # test index with a .sig.gz file instead of pathlist + # (should work now) sig2 = get_test_data('2.fa.sig.gz') output = runtmp.output('out.db') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', sig2, + runtmp.sourmash('scripts', 'index', sig2, '-o', output) captured = capfd.readouterr() print(captured.err) - assert "Error: invalid line in fromfile" in captured.err print(runtmp.last_result.err) + assert 'index is done' in runtmp.last_result.err def test_index_bad_siglist_2(runtmp, capfd): @@ -124,7 +124,7 @@ def test_index_bad_siglist_2(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error processing "no-exist"' in captured.err + assert "WARNING: could not load sketches from path 'no-exist'" in captured.err def test_index_empty_siglist(runtmp, capfd): @@ -138,11 +138,16 @@ def test_index_empty_siglist(runtmp, capfd): '-o', output) captured = capfd.readouterr() + assert not os.path.exists(output) # do we want an empty file, or no file? + print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "No signatures to index loaded, exiting." in captured.err + assert "Error: Signatures failed to load. Exiting." in captured.err def test_index_nomatch_sig_in_siglist(runtmp, capfd): + ## TODO: index:: do not write output if no signatures to write? + # test index with a siglist file that has (only) a non-matching ksize sig siglist = runtmp.output('against.txt') db = runtmp.output('db.rdb') @@ -151,13 +156,16 @@ def test_index_nomatch_sig_in_siglist(runtmp, capfd): sig1 = get_test_data('1.fa.k21.sig.gz') make_file_list(siglist, [sig2, sig1]) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'index', siglist, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'index', siglist, '-o', db) captured = capfd.readouterr() + assert os.path.exists(db) # do we want an empty file, or no file? + print(runtmp.last_result.out) + print(runtmp.last_result.err) print(captured.err) - assert "Couldn't find a compatible MinHash" in captured.err + # assert "Couldn't find a compatible MinHash" in captured.err def test_index_zipfile(runtmp, capfd): @@ -184,7 +192,7 @@ def test_index_zipfile(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err def test_index_zipfile_repeated_md5sums(runtmp, capfd): @@ -212,7 +220,7 @@ def test_index_zipfile_repeated_md5sums(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Found 3 filepaths' in captured.err + # assert 'Found 3 filepaths' in captured.err assert 'index is done' in runtmp.last_result.err @@ -243,8 +251,8 @@ def test_index_zipfile_multiparam(runtmp, capfd): assert 'index is done' in runtmp.last_result.err captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 5 index paths - no compatible signatures.' in captured.err - assert 'Found 4 filepaths' in captured.err + # assert 'WARNING: skipped 5 index paths - no compatible signatures.' in captured.err + # assert 'Found 4 filepaths' in captured.err def test_index_zipfile_bad(runtmp, capfd): @@ -266,7 +274,8 @@ def test_index_zipfile_bad(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert "Couldn't find End Of Central Directory Record" in captured.err + # assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err def test_index_check(runtmp): diff --git a/src/python/tests/test_multigather.py b/src/python/tests/test_multigather.py index a00d2b62..1f96eed1 100644 --- a/src/python/tests/test_multigather.py +++ b/src/python/tests/test_multigather.py @@ -67,8 +67,8 @@ def test_simple(runtmp, zip_against): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) # check prefetch output (only non-indexed gather) @@ -79,6 +79,7 @@ def test_simple(runtmp, zip_against): assert os.path.exists(g_output) df = pandas.read_csv(g_output) + print(df) assert len(df) == 3 keys = set(df.keys()) assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @@ -109,9 +110,8 @@ def test_simple_zip_query(runtmp): print(os.listdir(runtmp.output(''))) - # outputs are based on md5sum, e.g. "{md5}.sig.gz.gather.csv" - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -183,12 +183,15 @@ def test_missing_querylist(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + # assert 'Error: failed to load query' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test bad querylist (a sig file) +def test_sig_query(runtmp, capfd, indexed): + # sig file is now fine as a query + query = get_test_data('SRR606249.sig.gz') + against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -199,19 +202,37 @@ def test_bad_query(runtmp, capfd, indexed): if indexed: against_list = index_siglist(runtmp, against_list, runtmp.output('db')) + g_output = runtmp.output('out.csv') + else: + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', sig2, against_list, - '-s', '100000') + runtmp.sourmash('scripts', 'fastmultigather', query, against_list, + '-s', '100000', '-o', g_output) captured = capfd.readouterr() print(captured.err) + if not indexed: + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 3 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} - assert 'Error: invalid line in fromfile ' in captured.err + # check gather output (both) + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 3 + keys = set(df.keys()) + if indexed: + assert keys == {'query_name', 'query_md5', 'match_name', 'match_md5', 'f_match_query', 'intersect_bp'} + else: + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} @pytest.mark.parametrize('indexed', [False, True]) -def test_bad_query_2(runtmp, capfd, indexed): +def test_bad_query(runtmp, capfd, indexed): # test with a bad query (a .sig.gz file renamed as zip file) against_list = runtmp.output('against.txt') @@ -239,12 +260,12 @@ def test_bad_query_2(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert "InvalidArchive" in captured.err @pytest.mark.parametrize('indexed', [False, True]) def test_missing_query(runtmp, capfd, indexed): - # test missingfile in querylist + # test missing query query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -263,7 +284,6 @@ def test_missing_query(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert "WARNING: could not load sketches from path 'no-exist'" in captured.err assert "WARNING: 1 query paths failed to load. See error messages above." @@ -294,10 +314,7 @@ def test_nomatch_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - if zip_query: - assert "WARNING: no compatible sketches in path " not in captured.err - else: - assert "WARNING: no compatible sketches in path " in captured.err + # assert "WARNING: no compatible sketches in path " in captured.err assert "WARNING: skipped 1 query paths - no compatible signatures." in captured.err @@ -324,27 +341,40 @@ def test_missing_against(runtmp, capfd, zip_against): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test bad 'against' file - in this case, use a .sig.gz file. +def test_sig_against(runtmp, capfd): + # against file can be a sig now query = get_test_data('SRR606249.sig.gz') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query, sig2, + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') + runtmp.sourmash('scripts', 'fastmultigather', query, sig2, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + # check prefetch output (only non-indexed gather) + assert os.path.exists(p_output) + df = pandas.read_csv(p_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'intersect_bp'} + + # check gather output + assert os.path.exists(g_output) + df = pandas.read_csv(g_output) + assert len(df) == 1 + keys = set(df.keys()) + assert keys == {'query_filename', 'query_name', 'query_md5', 'match_name', 'match_md5', 'rank', 'intersect_bp'} -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test bad 'against' file - in this case, one containing a nonexistent file query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -366,8 +396,8 @@ def test_bad_against_2(runtmp, capfd): @pytest.mark.parametrize('zip_query', [False, True]) -def test_bad_against_3(runtmp, capfd, zip_query): - # test with a bad query (a .sig.gz file renamed as zip file) +def test_bad_against_2(runtmp, capfd, zip_query): + # test with a bad against (a .sig.gz file renamed as zip file) query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') make_file_list(query_list, [query]) @@ -385,15 +415,16 @@ def test_bad_against_3(runtmp, capfd, zip_query): with pytest.raises(utils.SourmashCommandFailed): runtmp.sourmash('scripts', 'fastmultigather', query_list, against_zip, - '-o', output) + '-s', '100000', '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err def test_empty_against(runtmp, capfd): + # like fastgather - exit gracefully. # test bad 'against' file - in this case, an empty one query = get_test_data('SRR606249.sig.gz') query_list = runtmp.output('query.txt') @@ -402,15 +433,14 @@ def test_empty_against(runtmp, capfd): against_list = runtmp.output('against.txt') make_file_list(against_list, []) - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, + runtmp.sourmash('scripts', 'fastmultigather', query_list, against_list, '-s', '100000') captured = capfd.readouterr() print(captured.err) - assert "Loaded 0 search signature(s)" in captured.err - assert "Error: No search signatures loaded, exiting." in captured.err + assert "Sketch loading error: No such file or directory" in captured.err + assert "No search signatures loaded, exiting." in captured.err @pytest.mark.parametrize('zip_against', [False, True]) @@ -465,11 +495,8 @@ def test_md5(runtmp, zip_query): print(os.listdir(runtmp.output(''))) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') # check prefetch output (only non-indexed gather) assert os.path.exists(p_output) @@ -560,11 +587,8 @@ def test_csv_columns_vs_sourmash_prefetch(runtmp, zip_query, zip_against): finally: os.chdir(cwd) - g_output = runtmp.output('SRR606249.sig.gz.gather.csv') - p_output = runtmp.output('SRR606249.sig.gz.prefetch.csv') - if zip_query: - g_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.gather.csv') - p_output = runtmp.output('dec29ca72e68db0f15de0b1b46f82fc5.sig.gz.prefetch.csv') + g_output = runtmp.output('SRR606249.gather.csv') + p_output = runtmp.output('SRR606249.prefetch.csv') assert os.path.exists(p_output) assert os.path.exists(g_output) @@ -627,14 +651,14 @@ def test_simple_protein(runtmp): # test basic protein execution sigs = get_test_data('protein.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'protein', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -652,14 +676,14 @@ def test_simple_dayhoff(runtmp): # test basic protein execution sigs = get_test_data('dayhoff.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'dayhoff', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) @@ -677,14 +701,14 @@ def test_simple_hp(runtmp): # test basic protein execution sigs = get_test_data('hp.zip') - sig_names = ["GCA_001593935.1_ASM159393v1_protein.faa.gz", "GCA_001593925.1_ASM159392v1_protein.faa.gz"] + sig_names = ["GCA_001593935", "GCA_001593925"] runtmp.sourmash('scripts', 'fastmultigather', sigs, sigs, '-s', '100', '--moltype', 'hp', '-k', '19') for qsig in sig_names: - g_output = runtmp.output(os.path.join(qsig + '.sig.gather.csv')) - p_output = runtmp.output(os.path.join(qsig + '.sig.prefetch.csv')) + g_output = runtmp.output(os.path.join(qsig + '.gather.csv')) + p_output = runtmp.output(os.path.join(qsig + '.prefetch.csv')) print(g_output) assert os.path.exists(g_output) assert os.path.exists(p_output) diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index ef2ea222..a7b09931 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -148,11 +148,11 @@ def test_missing_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd): + # sig is ok as query now against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -163,17 +163,17 @@ def test_bad_query(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', sig2, against_list, + runtmp.sourmash('scripts', 'multisearch', sig2, against_list, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -221,7 +221,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -250,11 +250,11 @@ def test_missing_against(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_sig_against(runtmp, capfd): + # against can be sig now query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -267,17 +267,17 @@ def test_bad_against(runtmp, capfd): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, sig2, + runtmp.sourmash('scripts', 'multisearch', query_list, sig2, '-o', output) captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err - + assert os.path.exists(output) + df = pandas.read_csv(output) + assert len(df) == 1 -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -300,8 +300,8 @@ def test_bad_against_2(runtmp, capfd): assert "WARNING: 1 search paths failed to load. See error messages above." in captured.err -def test_empty_query(runtmp): - # test with an empty query list +def test_empty_query(runtmp, capfd): + # test with an empty query list - fail gracefully query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -314,11 +314,13 @@ def test_empty_query(runtmp): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'multisearch', query_list, against_list, + runtmp.sourmash('scripts', 'multisearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + print(captured.err) + assert "No query signatures loaded, exiting." in captured.err # @CTB @@ -380,7 +382,7 @@ def test_load_only_one_bug(runtmp, capfd, zip_db): print(captured.err) assert not 'WARNING: skipped 1 paths - no compatible signatures.' in captured.err - assert not 'WARNING: no compatible sketches in path ' in captured.err + assert not 'WARNING: no compatible sketches in path' in captured.err @pytest.mark.parametrize("zip_query", [False, True]) diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 84bb2365..0dd67c05 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -115,15 +115,9 @@ def test_simple_threshold(runtmp, zip_query): -def test_bad_query(runtmp, capfd): - # test with a bad query (a .sig.gz file) - against_list = runtmp.output('against.txt') - +def test_sig_query(runtmp, capfd): + # sig query is ok now, but fails bc only one sig sig2 = get_test_data('2.fa.sig.gz') - sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - - make_file_list(against_list, [sig2, sig47, sig63]) output = runtmp.output('out.csv') @@ -133,18 +127,16 @@ def test_bad_query(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - - assert 'Error: invalid line in fromfile ' in captured.err + assert "Error: Pairwise requires two or more sketches. Check input" in captured.err -def test_bad_query_2(runtmp, capfd): +def test_bad_query(runtmp, capfd): # test with a bad query list (a missing file) query_list = runtmp.output('query.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') - sig63 = get_test_data('63.fa.sig.gz') - make_file_list(query_list, [sig2, "no-exist"]) + make_file_list(query_list, [sig2, sig47, "no-exist"]) output = runtmp.output('out.csv') @@ -155,12 +147,10 @@ def test_bad_query_2(runtmp, capfd): print(captured.err) assert "WARNING: could not load sketches from path 'no-exist'" in captured.err - assert "WARNING: 1 query paths failed to load. See error messages above." in captured.err + assert "WARNING: 1 signature paths failed to load. See error messages above." in captured.err - - -def test_bad_query_3(runtmp, capfd): +def test_bad_query_2(runtmp, capfd): # test with a bad query (a .sig.gz file renamed as zip file) sig2 = get_test_data('2.fa.sig.gz') @@ -182,7 +172,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) @@ -203,7 +193,7 @@ def test_missing_query(runtmp, capfd, zip_db): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err @@ -251,7 +241,7 @@ def test_nomatch_query(runtmp, capfd, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'WARNING: skipped 1 query paths - no compatible signatures' in captured.err + assert 'WARNING: skipped 1 signature paths - no compatible signatures' in captured.err @pytest.mark.parametrize("zip_db", [False, True]) diff --git a/src/python/tests/test_search.py b/src/python/tests/test_search.py index 5427d303..c6c49c95 100644 --- a/src/python/tests/test_search.py +++ b/src/python/tests/test_search.py @@ -247,12 +247,12 @@ def test_missing_query(runtmp, capfd, indexed, zip_query): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err @pytest.mark.parametrize("indexed", [False, True]) -def test_bad_query(runtmp, capfd, indexed): - # test with a bad query (a .sig.gz file) +def test_sig_query(runtmp, capfd, indexed): + # test with a single sig query (a .sig.gz file) against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') @@ -266,14 +266,14 @@ def test_bad_query(runtmp, capfd, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', sig2, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', sig2, against_list, '-o', output) - captured = capfd.readouterr() - print(captured.err) + # captured = capfd.readouterr() + # print(captured.err) - assert 'Error: invalid line in fromfile ' in captured.err + # assert 'Error: invalid line in fromfile' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -327,7 +327,7 @@ def test_bad_query_3(runtmp, capfd): captured = capfd.readouterr() print(captured.err) - assert 'Error: invalid Zip archive: Could not find central directory end' in captured.err + assert 'InvalidArchive' in captured.err @pytest.mark.parametrize("indexed", [False, True]) @@ -352,34 +352,34 @@ def test_missing_against(runtmp, capfd, indexed): captured = capfd.readouterr() print(captured.err) - assert 'Error: No such file or directory ' in captured.err + assert 'Error: No such file or directory' in captured.err -def test_bad_against(runtmp, capfd): - # test with a bad against list (a .sig file in this case) +def test_nomatch_against(runtmp, capfd): + # nonmatching against file (num sig) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') sig2 = get_test_data('2.fa.sig.gz') sig47 = get_test_data('47.fa.sig.gz') sig63 = get_test_data('63.fa.sig.gz') + # nomatch_sketch = get_test_data('genome-s11.fa.gz.sig') + nomatch_sketch = get_test_data('SRR606249.sig.gz') make_file_list(query_list, [sig2, sig47, sig63]) - #make_file_list(against_list, [sig2, sig47, sig63]) + make_file_list(against_list, [nomatch_sketch]) output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, sig2, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) captured = capfd.readouterr() - print(captured.err) + assert "No search signatures loaded, exiting." in captured.err - assert 'Error: invalid line in fromfile ' in captured.err - -def test_bad_against_2(runtmp, capfd): +def test_bad_against(runtmp, capfd): # test with a bad against list (a missing file) query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -403,7 +403,7 @@ def test_bad_against_2(runtmp, capfd): @pytest.mark.parametrize("indexed", [False, True]) -def test_empty_query(runtmp, indexed): +def test_empty_query(runtmp, indexed, capfd): # test with an empty query list query_list = runtmp.output('query.txt') against_list = runtmp.output('against.txt') @@ -420,11 +420,14 @@ def test_empty_query(runtmp, indexed): output = runtmp.output('out.csv') - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'manysearch', query_list, against_list, + # with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash('scripts', 'manysearch', query_list, against_list, '-o', output) print(runtmp.last_result.err) + captured = capfd.readouterr() + print(captured.err) + assert "No query signatures loaded, exiting." in captured.err @pytest.mark.parametrize("indexed", [False, True]) diff --git a/src/utils.rs b/src/utils.rs index 6a3f78c9..53bf39b2 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,31 +1,30 @@ /// Utility functions for sourmash_plugin_branchwater. use rayon::prelude::*; use sourmash::encodings::HashFunctions; +use sourmash::manifest::Manifest; +use sourmash::selection::Select; -use std::fs::File; -use std::io::Read; +use anyhow::{anyhow, Result}; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; +use csv::Writer; +use serde::ser::Serializer; +use serde::Serialize; +use std::cmp::{Ordering, PartialOrd}; +use std::collections::BinaryHeap; +use std::fs::{create_dir_all, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; -use std::path::{Path, PathBuf}; - -use tempfile::tempdir; -use zip::read::ZipArchive; - +use std::panic; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use std::collections::BinaryHeap; - -use anyhow::{anyhow, Result}; - -use std::cmp::{Ordering, PartialOrd}; - -use sourmash::prelude::FracMinHashOps; -use sourmash::prelude::MinHashOps; +use sourmash::collection::Collection; +use sourmash::manifest::Record; +use sourmash::selection::Selection; use sourmash::signature::{Signature, SigsTrait}; -use sourmash::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; -use sourmash::sketch::Sketch; +use sourmash::sketch::minhash::KmerMinHash; +use sourmash::storage::{FSStorage, InnerStorage, SigStore}; -// use tempfile::tempdir; /// Track a name/minhash. pub struct SmallSignature { @@ -34,7 +33,6 @@ pub struct SmallSignature { pub md5sum: String, pub minhash: KmerMinHash, } - /// Structure to hold overlap information from comparisons. pub struct PrefetchResult { @@ -64,86 +62,6 @@ impl PartialEq for PrefetchResult { impl Eq for PrefetchResult {} -/// check to see if two KmerMinHash are compatible. -/// -/// CTB note: despite the name, downsampling is not performed? -/// Although it checks if they are compatible in one direction... - -pub fn check_compatible_downsample( - me: &KmerMinHash, - other: &KmerMinHash, -) -> Result<(), sourmash::Error> { - /* // ignore num minhashes. - if self.num != other.num { - return Err(Error::MismatchNum { - n1: self.num, - n2: other.num, - } - .into()); - } - */ - use sourmash::Error; - - if me.ksize() != other.ksize() { - return Err(Error::MismatchKSizes); - } - if me.hash_function() != other.hash_function() { - // TODO: fix this error - return Err(Error::MismatchDNAProt); - } - if me.max_hash() < other.max_hash() { - return Err(Error::MismatchScaled); - } - if me.seed() != other.seed() { - return Err(Error::MismatchSeed); - } - Ok(()) -} - -/// Given a vec of search Signatures, each containing one or more sketches, -/// and a template Sketch, return a compatible (& now downsampled) -/// Sketch from the search Signatures.. -/// -/// CTB note: this will return the first acceptable match, I think, ignoring -/// all others. - -pub fn prepare_query( - search_sigs: &[Signature], - template: &Sketch, - location: &str, -) -> Option { - for search_sig in search_sigs.iter() { - // find exact match for template? - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: mh.md5sum(), - minhash: mh.clone(), - }); - } else { - // no - try to find one that can be downsampled - if let Sketch::MinHash(template_mh) = template { - for sketch in search_sig.sketches() { - if let Sketch::MinHash(ref_mh) = sketch { - if check_compatible_downsample(&ref_mh, template_mh).is_ok() { - let max_hash = max_hash_for_scaled(template_mh.scaled()); - let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); - return Some(SmallSignature { - location: location.to_string().clone(), - name: search_sig.name(), - md5sum: ref_mh.md5sum(), // original - minhash: mh, // downsampled - }); - } - } - } - } - } - } - None -} - /// Find sketches in 'sketchlist' that overlap with 'query' above /// specified threshold. @@ -157,7 +75,8 @@ pub fn prefetch( .filter_map(|result| { let mut mm = None; let searchsig = &result.minhash; - let overlap = searchsig.count_common(query_mh, false); + // TODO: fix Select so we can go back to downsample: false here + let overlap = searchsig.count_common(query_mh, true); if let Ok(overlap) = overlap { if overlap >= threshold_hashes { let result = PrefetchResult { overlap, ..result }; @@ -170,17 +89,27 @@ pub fn prefetch( } /// Write list of prefetch matches. -pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clone>( - query: &SmallSignature, - prefetch_output: Option

, +pub fn write_prefetch( + query: &SigStore, + prefetch_output: Option, matchlist: &BinaryHeap, -) -> Result<()> { - // Set up a writer for prefetch output - let prefetch_out: Box = match prefetch_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(prefetch_out); +) -> Result<(), Box> { + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &prefetch_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } + writeln!( &mut writer, "query_filename,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -191,7 +120,12 @@ pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clo writeln!( &mut writer, "{},\"{}\",{},\"{}\",{},{}", - query.location, query.name, query.md5sum, m.name, m.md5sum, m.overlap + query.filename(), + query.name(), + query.md5sum(), + m.name, + m.md5sum, + m.overlap ) .ok(); } @@ -199,131 +133,7 @@ pub fn write_prefetch + std::fmt::Debug + std::fmt::Display + Clo Ok(()) } -/// Load a list of filenames from a file. Exits on bad lines. -pub fn load_sketchlist_filenames>(sketchlist_filename: &P) -> Result> { - let sketchlist_file = BufReader::new(File::open(sketchlist_filename)?); - - let mut sketchlist_filenames: Vec = Vec::new(); - for line in sketchlist_file.lines() { - let line = match line { - Ok(v) => v, - Err(_) => { - return { - let filename = sketchlist_filename.as_ref().display(); - let msg = format!("invalid line in fromfile '{}'", filename); - Err(anyhow!(msg)) - } - } - }; - - if !line.is_empty() { - let mut path = PathBuf::new(); - path.push(line); - sketchlist_filenames.push(path); - } - } - Ok(sketchlist_filenames) -} - -/// Loads signature file paths from a ZIP archive. -/// -/// This function extracts the contents of a ZIP archive containing -/// signature files (with extensions ".sig" or ".sig.gz") to a temporary directory. -/// It returns the paths of these extracted signature files. -/// -/// # Arguments -/// -/// * `zip_path` - The path to the ZIP archive. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the paths to the extracted signature files. -/// * The `TempDir` representing the temporary directory where the files were extracted. -/// Since tempfile::TempDir creates a temporary directory that is automatically -/// deleted once the TempDir value goes out of scope, we return it here to move it -/// to the main function scope. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to create a temporary directory. -/// * Unable to open or read the ZIP archive. -/// * Any other IO or file related error. -pub fn load_sigpaths_from_zip>( - zip_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, tempfile::TempDir)> { - let mut signature_paths = Vec::new(); - let temp_dir = tempdir()?; - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - - let mut skipped_paths = 0; - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - // make string copy to avoid file borrowing issues - let file_name_str = file.name().to_string(); - let file_name = Path::new(&file_name_str) - .file_name() - .unwrap() - .to_str() - .unwrap(); - // use contains to account for sig.gz_0 bug in sourmash - if file_name.contains(".sig") || file_name.contains(".sig.gz") { - // read file - let mut contents = Vec::new(); - file.read_to_end(&mut contents)?; - // get sig from file - let sigs = Signature::from_reader(&contents[..])?; - if sigs.len() > 1 { - return Err(anyhow::anyhow!( - "File '{}' has more than one signature.", - file_name - )); - } - let sig = &sigs[0]; // Directly take the first (only) signature - // check for compatible sketch - let is_compatible = if let Some(Sketch::MinHash(_)) = sig.select_sketch(template) { - true - } else if let Sketch::MinHash(template_mh) = template { - sig.sketches().iter().any(|sketch| { - matches!(sketch, Sketch::MinHash(ref_mh) if check_compatible_downsample(&ref_mh, template_mh).is_ok()) - }) - } else { - false - }; - - if is_compatible { - let path = temp_dir.path().join(file_name); - // write contents to new file - let mut new_file = File::create(&path)?; - new_file.write_all(&contents)?; - // add filepath to signature paths - signature_paths.push(path); - } else { - skipped_paths += 1; - } - } - } - if skipped_paths > 0 { - eprintln!( - "WARNING: skipped {} {} paths - no compatible signatures.", - skipped_paths, report_type - ); - } - eprintln!( - "loaded paths for {} signature files from zipfile {}", - signature_paths.len(), - zip_path.as_ref().display() - ); - Ok((signature_paths, temp_dir)) -} - -pub fn load_fasta_fromfile>( - sketchlist_filename: &P, -) -> Result> { +pub fn load_fasta_fromfile(sketchlist_filename: String) -> Result> { let mut rdr = csv::Reader::from_path(sketchlist_filename)?; // Check for right header @@ -395,96 +205,94 @@ pub fn load_fasta_fromfile>( Ok(results) } -/// Load a collection of sketches from a file in parallel. +// Load all compatible minhashes from a collection into memory +// also store sig name and md5 alongside, as we usually need those pub fn load_sketches( - sketchlist_paths: Vec, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let skipped_paths = AtomicUsize::new(0); - let failed_paths = AtomicUsize::new(0); - - let sketchlist: Vec = sketchlist_paths - .par_iter() - .filter_map(|m| { - let filename = m.display().to_string(); - - match Signature::from_path(m) { - Ok(sigs) => { - let sm = prepare_query(&sigs, template, &filename); - if sm.is_none() { - // track number of paths that have no matching sigs - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); - } - sm - } - Err(err) => { - // failed to load from this path - print error & track. - eprintln!("Sketch loading error: {}", err); - eprintln!("WARNING: could not load sketches from path '{}'", filename); - let _i = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); - None - } + collection: Collection, + selection: &Selection, + report_type: ReportType, +) -> Result> { + let mut sketchinfo: Vec = Vec::new(); + for (_idx, record) in collection.iter() { + if let Ok(sig) = collection.sig_from_record(record) { + if let Some(minhash) = sig.clone().select(selection)?.minhash().cloned() { + sketchinfo.push(SmallSignature { + location: record.internal_location().to_string(), + name: sig.name(), + md5sum: sig.md5sum(), + minhash, + }) } - }) - .collect(); - - let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); - let failed_paths = failed_paths.load(atomic::Ordering::SeqCst); - Ok((sketchlist, skipped_paths, failed_paths)) + } else { + bail!( + "Error: Failed to load {} record: {}", + report_type, + record.name() + ); + } + } + Ok(sketchinfo) } /// Load a collection of sketches from a file, filtering to keep only /// those with a minimum overlap. pub fn load_sketches_above_threshold( - sketchlist_paths: Vec, - template: &Sketch, + against_collection: Collection, query: &KmerMinHash, threshold_hashes: u64, ) -> Result<(BinaryHeap, usize, usize)> { let skipped_paths = AtomicUsize::new(0); let failed_paths = AtomicUsize::new(0); - let matchlist: BinaryHeap = sketchlist_paths + let matchlist: BinaryHeap = against_collection .par_iter() - .filter_map(|m| { - let sigs = Signature::from_path(m); - let location = m.display().to_string(); - - match sigs { - Ok(sigs) => { - let mut mm = None; - - if let Some(sm) = prepare_query(&sigs, template, &location) { - let mh = sm.minhash; - if let Ok(overlap) = mh.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: sm.name, - md5sum: sm.md5sum, - minhash: mh, - overlap, - }; - mm = Some(result); - } + .filter_map(|(_idx, against_record)| { + let mut results = Vec::new(); + // Load against into memory + if let Ok(against_sig) = against_collection.sig_from_record(against_record) { + if let Some(against_mh) = against_sig.minhash() { + eprintln!( + "against_sig info: name: {}, md5:{},", + against_sig.name(), + against_sig.md5sum() + ); + eprintln!("against_mh info: md5:{},", against_mh.md5sum()); + // currently downsampling here to avoid changing md5sum + if let Ok(overlap) = against_mh.count_common(query, true) { + //downsample via count_common + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_mh.md5sum(), + minhash: against_mh.clone(), + overlap, + }; + results.push(result); } - } else { - eprintln!("WARNING: no compatible sketches in path '{}'", m.display()); - let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } - mm - } - Err(err) => { - eprintln!("Sketch loading error: {}", err); - let _ = failed_paths.fetch_add(1, atomic::Ordering::SeqCst); + } else { eprintln!( - "WARNING: could not load sketches from path '{}'", - m.display() + "WARNING: no compatible sketches in path '{}'", + against_sig.filename() ); - None + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } + } else { + // this shouldn't happen here anymore -- likely would happen at load_collection + eprintln!( + "WARNING: could not load sketches for record '{}'", + against_record.internal_location() + ); + let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); + } + if results.is_empty() { + None + } else { + Some(results) } }) + .flatten() .collect(); let skipped_paths = skipped_paths.load(atomic::Ordering::SeqCst); @@ -493,117 +301,10 @@ pub fn load_sketches_above_threshold( Ok((matchlist, skipped_paths, failed_paths)) } -/// Loads all compatible sketches from a ZIP archive at the given path into memory. -/// Currently not parallelized; use a different zip crate to enable parallelization. -/// -/// # Arguments -/// -/// * `zip_path` - Path to the ZIP archive. -/// * `template` - Reference to the Sketch template. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `SmallSignature`s. -/// * Number of paths that were skipped because they did not match the sketch parameters. -/// * Number of paths that failed to load. -/// -/// # Errors -/// -/// Returns an error if: -/// * Unable to open the ZIP file. -/// * ZIP archive is malformed. -pub fn load_sketches_from_zip>( - zip_path: P, - template: &Sketch, -) -> Result<(Vec, usize, usize)> { - let mut sketchlist = Vec::new(); - let zip_file = File::open(&zip_path)?; - let mut zip_archive = ZipArchive::new(zip_file)?; - let mut skipped_paths = 0; - let mut failed_paths = 0; - - // loop through, loading signatures - for i in 0..zip_archive.len() { - let mut file = zip_archive.by_index(i)?; - let file_name = Path::new(file.name()) - .file_name() - .unwrap() - .to_str() - .unwrap() - .to_owned(); - - if !file_name.contains(".sig") && !file_name.contains(".sig.gz") { - continue; - } - if let Ok(sigs) = Signature::from_reader(&mut file) { - if let Some(sm) = - prepare_query(&sigs, template, &zip_path.as_ref().display().to_string()) - { - sketchlist.push(sm); - } else { - // track number of paths that have no matching sigs - skipped_paths += 1; - } - } else { - // failed to load from this path - print error & track. - eprintln!("WARNING: could not load sketches from path '{}'", file_name); - failed_paths += 1; - } - } - drop(zip_archive); - println!("loaded {} signatures", sketchlist.len()); - Ok((sketchlist, skipped_paths, failed_paths)) -} - -/// Control function to read signature FILE PATHS from an input file. -/// If a ZIP archive is provided (detected via extension), -/// use `load_sigpaths_from_zip`. Otherwise, assume the -/// user provided a `fromfile` sketchlist and use -/// `load_sketchlist_filenames`. -/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// -/// # Returns -/// -/// Returns a tuple containing: -/// * A vector of `PathBuf` representing the signature file paths. -/// * If extracting from a zipfile, signature files will be extracted to a -/// `TempDir` temporary directory where they can be used individually. -pub fn load_sigpaths_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, - report_type: ReportType, -) -> Result<(Vec, Option)> { - eprintln!( - "Reading list of filepaths from: '{}'", - sketchlist_path.as_ref().display() - ); - - let result = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - let (paths, tempdir) = load_sigpaths_from_zip(&sketchlist_path, template, report_type)?; - (paths, Some(tempdir)) - } else { - let paths = load_sketchlist_filenames(&sketchlist_path)?; - (paths, None) - }; - - eprintln!("Found {} filepaths", result.0.len()); - // should we bail here if empty? - Ok(result) -} - pub enum ReportType { Query, Against, - Index, + General, } impl std::fmt::Display for ReportType { @@ -611,56 +312,97 @@ impl std::fmt::Display for ReportType { let description = match self { ReportType::Query => "query", ReportType::Against => "search", - ReportType::Index => "index", + ReportType::General => "signature", }; write!(f, "{}", description) } } -/// Control function to load compatible signatures from an input file. -/// If a ZIP archive is provided (detected via extension), -/// calls `load_sketches_from_zip`. Otherwise, assumes the -/// user provided a `fromfile` sketchlist and calls -/// `load_sketchlist_filenames`. -/// -/// # Arguments -/// -/// * `sketchlist_path` - Path to either a ZIP archive or a list of signature file paths. -/// * `template` - Reference to the Sketch template (used to load only compatible signatures). -/// * `report_type` - ReportType Enum. Are these 'query' or 'search' signatures? -/// -/// # Returns -/// -/// Returns a vector of `SmallSignature`s. -pub fn load_sketches_from_zip_or_pathlist>( - sketchlist_path: P, - template: &Sketch, +pub fn load_collection( + siglist: &String, + selection: &Selection, report_type: ReportType, -) -> Result> { - eprintln!( - "Reading list of {} paths from: '{}'", - report_type, - sketchlist_path.as_ref().display() - ); + allow_failed: bool, +) -> Result { + let sigpath = PathBuf::from(siglist); - let (sketchlist, skipped_paths, failed_paths) = if sketchlist_path - .as_ref() - .extension() - .map(|ext| ext == "zip") - .unwrap_or(false) - { - load_sketches_from_zip(sketchlist_path, template)? + if !sigpath.exists() { + bail!("No such file or directory: '{}'", &sigpath); + } + eprintln!("Reading {}(s) from: '{}'", report_type, &siglist); + + let mut n_failed = 0; + let collection = if sigpath.extension().map_or(false, |ext| ext == "zip") { + match Collection::from_zipfile(&sigpath) { + Ok(collection) => collection, + Err(_) => bail!("failed to load {} zipfile: '{}'", report_type, sigpath), + } } else { - let sketch_paths = load_sketchlist_filenames(&sketchlist_path)?; - load_sketches(sketch_paths, template)? + // if pathlist is just a signature path, load it into a collection + match Signature::from_path(&sigpath) { + Ok(signatures) => { + // Load the collection from the signature + match Collection::from_sigs(signatures) { + Ok(collection) => collection, + Err(_) => bail!( + "loaded {} signatures but failed to load as collection: '{}'", + report_type, + sigpath + ), + } + } + // if not, try to load file as list of sig paths + Err(_) => { + // using core fn doesn't allow us to ignore failed paths; I reimplement loading here to allow + let sketchlist_file = BufReader::new(File::open(sigpath)?); + let records: Vec = sketchlist_file + .lines() + .filter_map(|line| { + let path = line.ok()?; + match Signature::from_path(&path) { + Ok(signatures) => { + // TODO: Handling for multisig files: Split into separate sigs so records are unique? + // Currently, we end up with a single record + let recs: Vec = signatures + .into_iter() + .flat_map(|v| Record::from_sig(&v, &path)) + .collect(); + Some(recs) + } + Err(err) => { + eprintln!("Sketch loading error: {}", err); + eprintln!("WARNING: could not load sketches from path '{}'", path); + n_failed += 1; + None + } + } + }) + .flatten() + .collect(); + + let manifest: Manifest = records.into(); + eprintln!("len manifest: {}", manifest.len()); + Collection::new( + manifest, + InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + ) + } + } }; - report_on_sketch_loading(&sketchlist, skipped_paths, failed_paths, report_type)?; - - Ok(sketchlist) + let n_total = collection.len(); + let selected = collection.select(selection)?; + let n_skipped = n_total - selected.len(); + report_on_collection_loading(&selected, n_skipped, n_failed, report_type, allow_failed)?; + Ok(selected) } -/// Uses the output of sketch loading functions to report the +/// Uses the output of collection loading function to report the /// total number of sketches loaded, as well as the number of files, /// if any, that failed to load or contained no compatible sketches. /// If no sketches were loaded, bail. @@ -682,17 +424,21 @@ pub fn load_sketches_from_zip_or_pathlist>( /// /// Returns an error if: /// * No signatures were successfully loaded. -pub fn report_on_sketch_loading( - sketchlist: &[SmallSignature], +pub fn report_on_collection_loading( + collection: &Collection, skipped_paths: usize, failed_paths: usize, report_type: ReportType, + allow_failed: bool, ) -> Result<()> { if failed_paths > 0 { eprintln!( "WARNING: {} {} paths failed to load. See error messages above.", failed_paths, report_type ); + if !allow_failed { + bail! {"Signatures failed to load. Exiting."} + } } if skipped_paths > 0 { eprintln!( @@ -702,28 +448,38 @@ pub fn report_on_sketch_loading( } // Validate sketches - eprintln!("Loaded {} {} signature(s)", sketchlist.len(), report_type); - if sketchlist.is_empty() { - bail!("No {} signatures loaded, exiting.", report_type); + if collection.is_empty() { + eprintln!("No {} signatures loaded, exiting.", report_type); + return Ok(()); } + eprintln!("Loaded {} {} signature(s)", collection.len(), report_type); Ok(()) } /// Execute the gather algorithm, greedy min-set-cov, by iteratively /// removing matches in 'matchlist' from 'query'. -pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Display + Clone>( - query: SmallSignature, +pub fn consume_query_by_gather( + query: SigStore, matchlist: BinaryHeap, threshold_hashes: u64, - gather_output: Option

, + gather_output: Option, ) -> Result<()> { - // Set up a writer for gather output - let gather_out: Box = match gather_output { - Some(path) => Box::new(BufWriter::new(File::create(path).unwrap())), - None => Box::new(std::io::stdout()), - }; - let mut writer = BufWriter::new(gather_out); + // Define the writer to stdout by default + let mut writer: Box = Box::new(std::io::stdout()); + + if let Some(output_path) = &gather_output { + // Account for potential missing dir in output path + let directory_path = Path::new(output_path).parent(); + + // If a directory path exists in the filename, create it if it doesn't already exist + if let Some(dir) = directory_path { + create_dir_all(dir)?; + } + + let file = File::create(output_path)?; + writer = Box::new(BufWriter::new(file)); + } writeln!( &mut writer, "query_filename,rank,query_name,query_md5,match_name,match_md5,intersect_bp" @@ -733,17 +489,20 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp let mut matching_sketches = matchlist; let mut rank = 0; - let mut last_hashes = query.minhash.size(); let mut last_matches = matching_sketches.len(); - let location = query.location; - let mut query_mh = query.minhash; + // let location = query.location; + let location = query.filename(); // this is different (original fasta filename) than query.location was (sig name)!! + + let orig_query_mh = query.minhash().unwrap(); + let mut query_mh = orig_query_mh.clone(); + let mut last_hashes = orig_query_mh.size(); eprintln!( "{} iter {}: start: query hashes={} matches={}", location, rank, - query_mh.size(), + orig_query_mh.size(), matching_sketches.len() ); @@ -758,8 +517,8 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp "{},{},\"{}\",{},\"{}\",{},{}", location, rank, - query.name, - query.md5sum, + query.name(), + query.md5sum(), best_element.name, best_element.md5sum, best_element.overlap @@ -790,27 +549,26 @@ pub fn consume_query_by_gather + std::fmt::Debug + std::fmt::Disp Ok(()) } -pub fn build_template(ksize: u8, scaled: usize, moltype: &str) -> Sketch { +pub fn build_selection(ksize: u8, scaled: usize, moltype: &str) -> Selection { let hash_function = match moltype { - "dna" => HashFunctions::murmur64_DNA, - "protein" => HashFunctions::murmur64_protein, - "dayhoff" => HashFunctions::murmur64_dayhoff, - "hp" => HashFunctions::murmur64_hp, + "dna" => HashFunctions::Murmur64Dna, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, _ => panic!("Unknown molecule type: {}", moltype), }; - //adjust ksize if not dna - let adjusted_ksize = if moltype == "dna" { ksize } else { ksize * 3 }; - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(adjusted_ksize as u32) - .max_hash(max_hash) - .hash_function(hash_function) - .build(); - Sketch::MinHash(template_mh) + // let hash_function = HashFunctions::try_from(moltype) + // .map_err(|_| panic!("Unknown molecule type: {}", moltype)) + // .unwrap(); + + Selection::builder() + .ksize(ksize.into()) + .scaled(scaled as u32) + .moltype(hash_function) + .build() } -pub fn is_revindex_database(path: &Path) -> bool { +pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { // quick file check for Revindex database: // is path a directory that contains a file named 'CURRENT'? if path.is_dir() { @@ -821,6 +579,7 @@ pub fn is_revindex_database(path: &Path) -> bool { } } +#[derive(Serialize)] pub struct SearchResult { pub query_name: String, pub query_md5: String, @@ -832,43 +591,29 @@ pub struct SearchResult { pub max_containment: Option, } -impl ResultType for SearchResult { - fn header_fields() -> Vec<&'static str> { - vec![ - "query_name", - "query_md5", - "match_name", - "containment", - "intersect_hashes", - "match_md5", - "jaccard", - "max_containment", - ] - } +#[derive(Serialize)] +pub struct BranchwaterGatherResult { + pub query_name: String, + pub query_md5: String, + pub match_name: String, + pub match_md5: String, + pub f_match_query: f64, + pub intersect_bp: usize, +} - fn format_fields(&self) -> Vec { - vec![ - format!("\"{}\"", self.query_name), // Wrap query_name with quotes - self.query_md5.clone(), - format!("\"{}\"", self.match_name), // Wrap match_name with quotes - self.containment.to_string(), - self.intersect_hashes.to_string(), - match &self.match_md5 { - Some(md5) => md5.clone(), - None => "".to_string(), - }, - match &self.jaccard { - Some(jaccard) => jaccard.to_string(), - None => "".to_string(), - }, - match &self.max_containment { - Some(max_containment) => max_containment.to_string(), - None => "".to_string(), - }, - ] - } +#[derive(Serialize)] +pub struct MultiSearchResult { + pub query_name: String, + pub query_md5: String, + pub match_name: String, + pub match_md5: String, + pub containment: f64, + pub max_containment: f64, + pub jaccard: f64, + pub intersect_hashes: f64, } +#[derive(Serialize)] pub struct ManifestRow { pub md5: String, pub md5short: String, @@ -877,50 +622,24 @@ pub struct ManifestRow { pub num: u32, pub scaled: u64, pub n_hashes: usize, - pub with_abundance: bool, + pub with_abundance: BoolPython, pub name: String, pub filename: String, pub internal_location: String, } -pub fn bool_to_python_string(b: bool) -> String { - match b { - true => "True".to_string(), - false => "False".to_string(), - } -} - -impl ResultType for ManifestRow { - fn header_fields() -> Vec<&'static str> { - vec![ - "internal_location", - "md5", - "md5short", - "ksize", - "moltype", - "num", - "scaled", - "n_hashes", - "with_abundance", - "name", - "filename", - ] - } +// A wrapper type for booleans to customize serialization +pub struct BoolPython(bool); - fn format_fields(&self) -> Vec { - vec![ - self.internal_location.clone(), - self.md5.clone(), - self.md5short.clone(), - self.ksize.to_string(), - self.moltype.clone(), - self.num.to_string(), - self.scaled.to_string(), - self.n_hashes.to_string(), - bool_to_python_string(self.with_abundance), - format!("\"{}\"", self.name), // Wrap name with quotes - self.filename.clone(), - ] +impl Serialize for BoolPython { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self.0 { + true => serializer.serialize_str("True"), + false => serializer.serialize_str("False"), + } } } @@ -954,23 +673,23 @@ pub fn make_manifest_row( num, scaled, n_hashes: sketch.size(), - with_abundance: abund, + with_abundance: BoolPython(abund), name: sig.name().to_string(), - // filename: filename.display().to_string(), - filename: filename.to_str().unwrap().to_string(), + filename: filename.to_string(), } } -pub fn open_stdout_or_file>(output: Option

) -> Box { +pub fn open_stdout_or_file(output: Option) -> Box { // if output is a file, use open_output_file if let Some(path) = output { - Box::new(open_output_file(&path)) + let outpath: PathBuf = path.into(); + Box::new(open_output_file(&outpath)) } else { Box::new(std::io::stdout()) } } -pub fn open_output_file>(output: &P) -> BufWriter { +pub fn open_output_file(output: &PathBuf) -> BufWriter { let file = File::create(output).unwrap_or_else(|e| { eprintln!("Error creating output file: {:?}", e); std::process::exit(1); @@ -1008,12 +727,15 @@ pub enum ZipMessage { WriteManifest, } -pub fn sigwriter + Send + 'static>( +pub fn sigwriter( recv: std::sync::mpsc::Receiver, output: String, ) -> std::thread::JoinHandle> { std::thread::spawn(move || -> Result<()> { - let file_writer = open_output_file(&output); + // cast output as pathbuf + let outpath: PathBuf = output.into(); + + let file_writer = open_output_file(&outpath); let options = zip::write::FileOptions::default() .compression_method(zip::CompressionMethod::Stored) @@ -1056,24 +778,27 @@ pub fn sigwriter + Send + 'static>( println!("Writing manifest"); // Start the CSV file inside the zip zip.start_file("SOURMASH-MANIFEST.csv", options).unwrap(); - // write manifest version line writeln!(&mut zip, "# SOURMASH-MANIFEST-VERSION: 1.0").unwrap(); - // Write the header - let header = ManifestRow::header_fields(); - if let Err(e) = writeln!(&mut zip, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } + // scoped block for csv writing + { + let mut csv_writer = Writer::from_writer(&mut zip); - // Write each manifest row - for row in &manifest_rows { - let formatted_fields = row.format_fields(); // Assuming you have a format_fields method on ManifestRow - if let Err(e) = writeln!(&mut zip, "{}", formatted_fields.join(",")) { - eprintln!("Error writing item: {:?}", e); + for row in &manifest_rows { + if let Err(e) = csv_writer.serialize(row) { + eprintln!("Error writing item: {:?}", e); + } } + // CSV writer must be manually flushed to ensure all data is written + if let Err(e) = csv_writer.flush() { + eprintln!("Error flushing CSV writer: {:?}", e); + } + } // drop csv writer here + + // Properly finish writing to the ZIP file + if let Err(e) = zip.finish() { + eprintln!("Error finalizing ZIP file: {:?}", e); } - // finalize the zip file writing. - zip.finish().unwrap(); } } } @@ -1081,38 +806,22 @@ pub fn sigwriter + Send + 'static>( }) } -pub trait ResultType { - fn header_fields() -> Vec<&'static str>; - fn format_fields(&self) -> Vec; -} - -pub fn csvwriter_thread( +pub fn csvwriter_thread( recv: std::sync::mpsc::Receiver, - output: Option

, -) -> std::thread::JoinHandle<()> -where - T: ResultType, - P: Clone + std::convert::AsRef, -{ + output: Option, +) -> std::thread::JoinHandle<()> { // create output file - let out = open_stdout_or_file(output.as_ref()); + let out = open_stdout_or_file(output); // spawn a thread that is dedicated to printing to a buffered output std::thread::spawn(move || { - let mut writer = out; - - let header = T::header_fields(); - if let Err(e) = writeln!(&mut writer, "{}", header.join(",")) { - eprintln!("Error writing header: {:?}", e); - } - writer.flush().unwrap(); + let mut writer = Writer::from_writer(out); - for item in recv.iter() { - let formatted_fields = item.format_fields(); - if let Err(e) = writeln!(&mut writer, "{}", formatted_fields.join(",")) { + for res in recv.iter() { + if let Err(e) = writer.serialize(res) { eprintln!("Error writing item: {:?}", e); } - writer.flush().unwrap(); } + writer.flush().expect("Failed to flush writer."); }) }