From 31e01f0eca6477b920bf5afcffb874c7eed5d681 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 8 Jul 2022 18:39:26 -0700 Subject: [PATCH 01/19] rkyv support in signature try to skip md5sum derive typed builder for GatherResult expose md5 start moving mastiff remove unused bigsi and sbt indices Expose iterator methods for Signature remove unused function preparing for MinHashOps Bump version, there will be many breaking changes... more splits default to large minhash, expose frozen use enum for MinHash thru FFI bug fixes and cleanup add c++ stdlib package for mamba fix finch feature try out roaring Add back check command initial update impl add semver check for rust bump once_cell fix rust ci start bringing #1943 more selection more picklist use dashmap in mem_revindex Revert "use dashmap in mem_revindex" This reverts commit 22727b7091dee2dbafd10ce707578c5085aa6cfc. bump rocksdb to 0.19 bump rocksdb to 0.20, bump MSRV to 1.60 update deps add cargo deny config use cibuildwheel configs in pyproject.toml flake cleanup fix cargo.lock tox updates don't worry with in-mem sigs for now rename mastiff to branchwater add revindex test wip collection clippy fixes start using camino for paths cargo fmt Implement MemStorage and from_paths and from_sigs cleanup Replace DatasetID with Idx Use 32-bits Idx, instead of 64 use enum_dispatch to avoid repeating code implement selection more widely working with protein in tests... 
add semver-checks save/load collection from rocksdb --- .github/workflows/dev_envs.yml | 2 +- .github/workflows/rust.yml | 6 + .readthedocs.yml | 4 + Cargo.lock | 437 +++++++++++-- Makefile | 3 +- deny.toml | 1 + doc/developer.md | 2 +- flake.lock | 18 +- flake.nix | 4 + include/sourmash.h | 1 + pyproject.toml | 16 +- src/core/Cargo.toml | 34 +- src/core/build.rs | 6 +- src/core/cbindgen.toml | 2 +- src/core/src/collection.rs | 135 ++++ src/core/src/encodings.rs | 27 +- src/core/src/errors.rs | 15 + src/core/src/ffi/index/mod.rs | 2 + src/core/src/ffi/index/revindex.rs | 16 +- src/core/src/ffi/storage.rs | 9 +- src/core/src/from.rs | 13 +- src/core/src/index/linear.rs | 508 ++++++++++----- src/core/src/index/mod.rs | 405 +++++------- src/core/src/index/revindex/disk_revindex.rs | 546 ++++++++++++++++ .../{revindex.rs => revindex/mem_revindex.rs} | 598 +++++++++--------- src/core/src/index/revindex/mod.rs | 538 ++++++++++++++++ src/core/src/lib.rs | 6 +- src/core/src/manifest.rs | 256 ++++++++ src/core/src/picklist.rs | 29 + src/core/src/signature.rs | 121 ++++ src/core/src/sketch/hyperloglog/mod.rs | 16 +- src/core/src/sketch/minhash.rs | 12 + src/core/src/sketch/mod.rs | 4 + src/core/src/sketch/nodegraph.rs | 11 +- src/core/src/storage.rs | 447 +++++++++++-- src/core/tests/storage.rs | 35 + src/sourmash/sbt_storage.py | 4 +- tox.ini | 7 +- 38 files changed, 3421 insertions(+), 875 deletions(-) create mode 100644 src/core/src/collection.rs create mode 100644 src/core/src/index/revindex/disk_revindex.rs rename src/core/src/index/{revindex.rs => revindex/mem_revindex.rs} (61%) create mode 100644 src/core/src/index/revindex/mod.rs create mode 100644 src/core/src/manifest.rs create mode 100644 src/core/src/picklist.rs diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml index 0c66aaa970..a5f3b2f839 100644 --- a/.github/workflows/dev_envs.yml +++ b/.github/workflows/dev_envs.yml @@ -57,7 +57,7 @@ jobs: - name: install dependencies shell: bash -l 
{0} - run: mamba install 'tox>=3.27,<4' tox-conda rust git compilers pandoc + run: mamba install 'tox>=3.27,<4' tox-conda rust git compilers pandoc libstdcxx-ng - name: run tests for 3.9 shell: bash -l {0} diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 349a050894..3b2535f033 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -234,6 +234,12 @@ jobs: toolchain: stable override: true + - name: Check semver + uses: obi1kenobi/cargo-semver-checks-action@v2 + with: + crate-name: sourmash + version-tag-prefix: r + - name: Make sure we can publish the sourmash crate uses: actions-rs/cargo@v1 with: diff --git a/.readthedocs.yml b/.readthedocs.yml index 5b33921869..5479606af7 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -9,6 +9,10 @@ build: tools: python: "3.10" rust: "1.64" + apt_packages: + - llvm-dev + - libclang-dev + - clang # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/Cargo.lock b/Cargo.lock index a2d9e0b6d7..122923d2e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,17 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aliasable" version = "0.1.3" @@ -59,6 +70,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" +[[package]] +name = "binary-merge" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" + [[package]] name = "bincode" version = "1.3.3" @@ -68,6 
+85,27 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.65.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" +dependencies = [ + "bitflags 1.3.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.23", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -80,18 +118,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "buffer-redux" version = "1.0.0" @@ -108,12 +134,39 @@ version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +[[package]] +name = "bytecheck" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a31f923c2db9513e4298b72df143e6e655a759b3d6a0966df18f81223fff54f" +dependencies = [ + "bytecheck_derive", + "ptr_meta", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb17c862a905d912174daa27ae002326fff56dc8b8ada50a0a5f0976cb174f0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "bytecount" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +[[package]] +name = "bytemuck" +version = "1.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" + [[package]] name = "byteorder" version = "1.4.3" @@ -146,6 +199,9 @@ name = "camino" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] [[package]] name = "capnp" @@ -164,6 +220,18 @@ name = "cc" version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] [[package]] name = "cfg-if" @@ -212,6 +280,17 @@ dependencies = [ "half", ] +[[package]] +name = "clang-sys" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.3.0" @@ -373,13 +452,12 @@ dependencies = [ [[package]] name = "csv" -version = "1.1.6" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "af91f40b7355f82b0a891f50e70399475945bb0b0da4f1700ce60761c9d3e359" dependencies = [ - "bstr", "csv-core", - "itoa 0.4.8", + "itoa", "ryu", "serde", ] @@ -395,9 +473,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" +checksum = 
"86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" dependencies = [ "cc", "cxxbridge-flags", @@ -407,9 +485,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" +checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" dependencies = [ "cc", "codespan-reporting", @@ -422,15 +500,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" +checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" [[package]] name = "cxxbridge-macro" -version = "1.0.85" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" +checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" dependencies = [ "proc-macro2", "quote", @@ -443,6 +521,18 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "enum_dispatch" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.23", +] + [[package]] name = "errno" version = "0.3.1" @@ -530,12 +620,27 @@ dependencies = [ "syn 1.0.104", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "half" version = "1.8.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hashbrown" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db0d4cf898abf0081f964436dc980e96670a0f36863e4b83aaacdb65c9d7ccc3" +dependencies = [ + "ahash", +] + [[package]] name = "heck" version = "0.4.1" @@ -557,6 +662,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +[[package]] +name = "histogram" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12cb882ccb290b8646e554b157ab0b71e64e8d5bef775cd66b6531e52d302669" + [[package]] name = "iana-time-zone" version = "0.1.53" @@ -581,6 +692,15 @@ dependencies = [ "cxx-build", ] +[[package]] +name = "inplace-vec-builder" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf64c2edc8226891a71f127587a2861b132d2b942310843814d5001d99a1d307" +dependencies = [ + "smallvec", +] + [[package]] name = "io-lifetimes" version = "1.0.11" @@ -624,15 +744,18 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.8" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] -name = "itoa" -version = "1.0.1" +name = "jobserver" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] [[package]] name = "js-sys" @@ -649,18 +772,61 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +[[package]] +name = "libloading" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] + [[package]] name = "libm" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +[[package]] +name = "librocksdb-sys" +version = "0.11.0+8.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "glob", + "libc", + "libz-sys", + "lz4-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "link-cplusplus" version = "1.0.8" @@ -688,6 +854,16 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "lzma-sys" version = 
"0.1.17" @@ -739,6 +915,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.4.4" @@ -771,9 +953,9 @@ dependencies = [ [[package]] name = "niffler" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68c7ffd42bdba05fc9fbfda31283d44c5c8a88fed1a191f68795dba23cc8204b" +checksum = "470dd05a938a5ad42c2cb80ceea4255e275990ee530b86ca164e6d8a19fa407f" dependencies = [ "cfg-if", "flate2", @@ -786,6 +968,16 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -827,6 +1019,15 @@ dependencies = [ "libc", ] +[[package]] +name = "numsep" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" +dependencies = [ + "slicestring", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -864,6 +1065,12 @@ dependencies = [ "syn 2.0.23", ] +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + [[package]] name = "piz" version = "0.5.1" @@ -920,6 +1127,16 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] 
+name = "prettyplease" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c64d9ba0963cdcea2e1b2230fbae2bab30eb25a174be395c41e764bfb65dd62" +dependencies = [ + "proc-macro2", + "syn 2.0.23", +] + [[package]] name = "primal-check" version = "0.3.3" @@ -979,6 +1196,26 @@ dependencies = [ "unarray", ] +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + [[package]] name = "quote" version = "1.0.29" @@ -1067,18 +1304,79 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" +[[package]] +name = "rend" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581008d2099240d37fb08d77ad713bcaec2c4d89d50b5b21a8bb1996bbab68ab" +dependencies = [ + "bytecheck", +] + +[[package]] +name = "retain_mut" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" + +[[package]] +name = "rkyv" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30f1d45d9aa61cbc8cd1eb87705470892289bb2d01943e7803b873a57404dc3" 
+dependencies = [ + "bytecheck", + "hashbrown", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff26ed6c7c4dfc2aa9480b86a60e3c7233543a270a680e10758a507c5a4ce476" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.104", +] + +[[package]] +name = "roaring" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef0fb5e826a8bde011ecae6a8539dd333884335c57ff0f003fbe27c25bbe8f71" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + +[[package]] +name = "rocksdb" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb6f170a4041d50a0ce04b0d2e14916d6ca863ea2e422689a5b694395d299ffe" +dependencies = [ + "libc", + "librocksdb-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.37.20" @@ -1145,6 +1443,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "serde" version = "1.0.168" @@ -1171,11 +1475,29 @@ version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65" dependencies = [ - "itoa 1.0.1", + "itoa", "ryu", "serde", ] +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "size" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" + +[[package]] +name = "slicestring" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" + [[package]] name = "smallvec" version = "1.8.0" @@ -1190,20 +1512,24 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.11.0" +version = "0.12.0" dependencies = [ "assert_matches", "az", "bytecount", "byteorder", + "camino", "cfg-if", "chrono", "counter", "criterion", + "csv", + "enum_dispatch", "finch", "fixedbitset", "getrandom", "getset", + "histogram", "log", "md5", "memmap2", @@ -1212,6 +1538,7 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", + "numsep", "once_cell", "ouroboros", "piz", @@ -1219,8 +1546,12 @@ dependencies = [ "proptest", "rand", "rayon", + "rkyv", + "roaring", + "rocksdb", "serde", "serde_json", + "size", "tempfile", "thiserror", "twox-hash", @@ -1347,16 +1678,25 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vec-collections" -version = "0.3.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4f2390c4dc8ae8640c57d067b1a3d40bc05c124cc6bc7394d761b53435d41b76" +checksum = "3c9965c8f2ffed1dbcd16cafe18a009642f540fa22661c6cfd6309ddb02e4982" dependencies = [ + "binary-merge", + "inplace-vec-builder", + "lazy_static", "num-traits", "serde", "smallvec", @@ -1593,3 +1933,14 @@ checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" dependencies = [ "lzma-sys", ] + +[[package]] +name = "zstd-sys" +version = "2.0.7+zstd.1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Makefile b/Makefile index 4c2ef69abb..f964bc3cce 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/index/mod.rs \ src/core/src/ffi/index/revindex.rs \ src/core/src/ffi/storage.rs \ - src/core/src/errors.rs + src/core/src/errors.rs \ + src/core/cbindgen.toml cd src/core && \ RUSTC_BOOTSTRAP=1 cbindgen -c cbindgen.toml . -o ../../$@ diff --git a/deny.toml b/deny.toml index 29d148d50b..99f3b442c7 100644 --- a/deny.toml +++ b/deny.toml @@ -29,6 +29,7 @@ default = "deny" confidence-threshold = 0.8 exceptions = [ { allow = ["Zlib"], name = "piz", version = "*" }, + { allow = ["ISC"], name = "libloading", version = "*" }, ] [bans] diff --git a/doc/developer.md b/doc/developer.md index 2368611e7a..b169d557de 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -25,7 +25,7 @@ and the [`conda-forge`](https://conda-forge.org/) channel by default). Once `mamba` is installed, run ``` -mamba create -n sourmash_dev 'tox>=3.27,<4' tox-conda rust git compilers pandoc +mamba create -n sourmash_dev 'tox>=3.27,<4' tox-conda rust git compilers pandoc libstdcxx-ng ``` to create an environment called `sourmash_dev` containing the programs needed for development. 
diff --git a/flake.lock b/flake.lock index c39e6a9732..314b9a5bf7 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1692404241, - "narHash": "sha256-TRZlFHtrQI6Kh8RFqnjBF2uNNi/c66ldB4WuIcrwMzg=", + "lastModified": 1693780807, + "narHash": "sha256-diV1X53HjSB3fIcDFieh9tGZkJ3vqJJQhTz89NbYw60=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "2f9286912cb215969ece465147badf6d07aa43fe", + "rev": "84ef5335abf541d8148433489e0cf79affae3f89", "type": "github" }, "original": { @@ -33,11 +33,11 @@ ] }, "locked": { - "lastModified": 1692410823, - "narHash": "sha256-YM1QCenpghNqgleUmoCJUArTuMEBqScyQuhepA6JZaI=", + "lastModified": 1693793487, + "narHash": "sha256-MS6CDyAC0sJMTE/pRYlfrhBnhlAPvEo43ipwf5ZNzHg=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "598b2f04ed252eb5808b108d7a10084c0c548753", + "rev": "f179280eed5eb93759c94bf3231fbbda28f894b7", "type": "github" }, "original": { @@ -66,11 +66,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1689068808, - "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=", + "lastModified": 1692799911, + "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", "owner": "numtide", "repo": "flake-utils", - "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4", + "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 98b43f4c61..7fa3f25ae1 100644 --- a/flake.nix +++ b/flake.nix @@ -103,17 +103,21 @@ wasmtime wasm-pack nodejs_20 + #emscripten #py-spy #heaptrack + cargo-all-features cargo-watch cargo-limit cargo-outdated cargo-udeps cargo-deny + cargo-semver-checks nixpkgs-fmt ]; + # Needed for matplotlib LD_LIBRARY_PATH = lib.makeLibraryPath [ pkgs.stdenv.cc.cc.lib ]; # workaround for https://github.com/NixOS/nixpkgs/blob/48dfc9fa97d762bce28cc8372a2dd3805d14c633/doc/languages-frameworks/python.section.md#python-setuppy-bdist_wheel-cannot-create-whl diff --git a/include/sourmash.h 
b/include/sourmash.h index 6fa7854880..011aee2925 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -42,6 +42,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_PARSE_INT = 100003, SOURMASH_ERROR_CODE_SERDE_ERROR = 100004, SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, + SOURMASH_ERROR_CODE_CSV_ERROR = 100006, }; typedef uint32_t SourmashErrorCode; diff --git a/pyproject.toml b/pyproject.toml index 3249545ecc..ecdfd87beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -144,7 +144,7 @@ include = [ exclude = [ { path = "**/__pycache__/*", format = ["sdist", "wheel"] }, ] -features = ["maturin"] +features = ["maturin", "branchwater"] locked = true module-name = "sourmash._lowlevel" @@ -164,7 +164,7 @@ known_first_party = ["sourmash"] [tool.cibuildwheel] build = "cp39-*" -skip = "*-win32 *-manylinux_i686 *-musllinux_ppc64le *-musllinux_s390x" +skip = "*-win32 *-manylinux_i686 *-musllinux_*" before-all = [ "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable", "cargo update --dry-run", @@ -178,6 +178,18 @@ build-verbosity = 3 CARGO_REGISTRIES_CRATES_IO_PROTOCOL="sparse" PATH="$HOME/.cargo/bin:$PATH" +[tool.cibuildwheel.linux] +before-all = [ + "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable", + "cargo update --dry-run", + "if [ -f /etc/system-release ]; then yum -y install centos-release-scl; fi", + "if [ -f /etc/system-release ]; then yum -y install llvm-toolset-7.0; fi", +] +before-build = [ + "if [ -f /etc/system-release ]; then source scl_source enable llvm-toolset-7.0; fi", + "if [ -f /etc/system-release ]; then source scl_source enable devtoolset-10; fi", +] + [tool.cibuildwheel.linux.environment] CARGO_REGISTRIES_CRATES_IO_PROTOCOL="sparse" PATH="$HOME/.cargo/bin:$PATH" diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index fe05a69d2e..73e42057e2 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.11.0" +version = "0.12.0" authors = ["Luiz 
Irber "] description = "MinHash sketches for genomic data" repository = "https://github.com/sourmash-bio/sourmash" @@ -22,35 +22,45 @@ bench = false from-finch = ["finch"] parallel = ["rayon"] maturin = [] +branchwater = ["rocksdb", "rkyv", "parallel"] +default = [] [dependencies] az = "1.0.0" bytecount = "0.6.0" byteorder = "1.4.3" +camino = { version = "1.1.6", features = ["serde1"] } cfg-if = "1.0" counter = "0.5.7" +csv = "1.1.6" +enum_dispatch = "0.3.12" finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getrandom = { version = "0.2", features = ["js"] } getset = "0.1.1" +histogram = "0.6.9" log = "0.4.20" md5 = "0.7.0" +memmap2 = "0.7.1" murmurhash3 = "0.0.5" niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } nohash-hasher = "0.2.0" num-iter = "0.1.43" +numsep = "0.1.12" once_cell = "1.18.0" +ouroboros = "0.18.0" +piz = "0.5.0" +primal-check = "0.3.1" rayon = { version = "1.7.0", optional = true } +rkyv = { version = "0.7.39", optional = true } +roaring = "0.10.0" serde = { version = "1.0.168", features = ["derive"] } serde_json = "1.0.107" -primal-check = "0.3.1" +size = "0.4.0" thiserror = "1.0" -typed-builder = "0.14.0" twox-hash = "1.6.0" -vec-collections = "0.3.4" -piz = "0.5.0" -memmap2 = "0.7.1" -ouroboros = "0.18.0" +typed-builder = "0.14.0" +vec-collections = "0.4.3" [dev-dependencies] assert_matches = "1.3.0" @@ -72,6 +82,13 @@ harness = false name = "minhash" harness = false +[package.metadata.cargo-all-features] +skip_optional_dependencies = true +denylist = ["maturin"] +skip_feature_sets = [ + ["branchwater", "parallel"], # branchwater implies parallel +] + ## Wasm section. 
Crates only used for WASM, as well as specific configurations [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dependencies.wasm-bindgen] @@ -90,4 +107,5 @@ features = ["wasmbind"] wasm-bindgen-test = "0.3.37" ### These crates don't compile on wasm -[target.'cfg(not(all(target_arch = "wasm32", target_os="unknown")))'.dependencies] +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +rocksdb = { version = "0.21.0", optional = true } diff --git a/src/core/build.rs b/src/core/build.rs index a22396c25a..f067828d50 100644 --- a/src/core/build.rs +++ b/src/core/build.rs @@ -55,12 +55,12 @@ fn copy_c_bindings(crate_dir: &str) { let new_header: String = header .lines() .filter_map(|s| { - if s.starts_with("#") { + if s.starts_with('#') { None } else { Some({ let mut s = s.to_owned(); - s.push_str("\n"); + s.push('\n'); s }) } @@ -71,5 +71,5 @@ fn copy_c_bindings(crate_dir: &str) { let target_dir = find_target_dir(&out_dir); std::fs::create_dir_all(&target_dir).expect("error creating target dir"); let out_path = target_dir.join("header.h"); - std::fs::write(out_path, &new_header).expect("error writing header"); + std::fs::write(out_path, new_header).expect("error writing header"); } diff --git a/src/core/cbindgen.toml b/src/core/cbindgen.toml index cd6cd781c2..1a0a81af47 100644 --- a/src/core/cbindgen.toml +++ b/src/core/cbindgen.toml @@ -8,7 +8,7 @@ clean = true [parse.expand] crates = ["sourmash"] -features = [] +features = ["branchwater"] [enum] rename_variants = "QualifiedScreamingSnakeCase" diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs new file mode 100644 index 0000000000..164db5efe7 --- /dev/null +++ b/src/core/src/collection.rs @@ -0,0 +1,135 @@ +use std::ops::{Deref, DerefMut}; + +use camino::Utf8Path as Path; + +use crate::encodings::Idx; +use crate::index::Selection; +use crate::manifest::{Manifest, Record}; +use crate::signature::Signature; +use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, 
ZipStorage}; +use crate::Result; + +pub struct Collection { + pub(crate) manifest: Manifest, + pub(crate) storage: InnerStorage, +} + +pub struct CollectionSet { + collection: Collection, +} + +impl Deref for CollectionSet { + type Target = Collection; + + fn deref(&self) -> &Self::Target { + &self.collection + } +} + +impl DerefMut for CollectionSet { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.collection + } +} + +impl TryFrom for CollectionSet { + type Error = crate::Error; + + fn try_from(collection: Collection) -> Result { + let first = if let Some(first) = collection.manifest.first() { + first + } else { + // empty collection is consistent ¯\_(ツ)_/¯ + return Ok(Self { collection }); + }; + + collection + .manifest + .iter() + .skip(1) + .try_for_each(|c| first.check_compatible(c))?; + + Ok(Self { collection }) + } +} + +impl Collection { + pub fn from_zipfile>(zipfile: P) -> Result { + let storage = ZipStorage::from_file(zipfile)?; + // Load manifest from standard location in zipstorage + let manifest = Manifest::from_reader(storage.load("SOURMASH-MANIFEST.csv")?.as_slice())?; + Ok(Self { + manifest, + storage: InnerStorage::new(storage), + }) + } + + pub fn from_sigs(sigs: Vec) -> Result { + let storage = MemStorage::new(); + + let mut records = vec![]; + for (i, sig) in sigs.into_iter().enumerate() { + let path = format!("{}", i); + let mut record = Record::from_sig(&sig, &path); + let path = storage.save_sig(&path, sig)?; + record.iter_mut().for_each(|rec| { + rec.set_internal_location(path.clone().into()); + }); + records.extend(record); + } + + Ok(Self { + manifest: records.into(), + storage: InnerStorage::new(storage), + }) + } + + pub fn from_paths>(paths: &[P]) -> Result { + // TODO: + // - Build manifest from paths + // - Might need to load the data? + // - Use FSStorage (figure out if there is a common path between sigs?) 
+ let records: Vec = paths + .iter() + .flat_map(|p| { + let recs: Vec = Signature::from_path(p.as_ref()) + .unwrap_or_else(|_| panic!("Error processing {:?}", p.as_ref())) + .into_iter() + .flat_map(|v| Record::from_sig(&v, p.as_ref().as_str())) + .collect(); + recs + }) + //.map(|p| self.collection().storage.load_sig(p.as_str())?.into()) + .collect(); + + Ok(Self { + manifest: records.into(), + storage: InnerStorage::new( + FSStorage::builder() + .fullpath("".into()) + .subdir("".into()) + .build(), + ), + }) + } + + pub fn sig_for_dataset(&self, dataset_id: Idx) -> Result { + let match_path = if self.manifest.is_empty() { + "" + } else { + self.manifest[dataset_id as usize] + .internal_location() + .as_str() + }; + + let selection = Selection::from_record(&self.manifest[dataset_id as usize])?; + let sig = self.storage.load_sig(match_path)?.select(&selection)?; + assert_eq!(sig.signatures.len(), 1); + Ok(sig) + } + + pub fn select(mut self, selection: &Selection) -> Result { + self.manifest = self.manifest.select_to_manifest(selection)?; + Ok(self) + } +} diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 6010cf2f6d..be34a00444 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -7,6 +7,7 @@ use std::str; use nohash_hasher::BuildNoHashHasher; use once_cell::sync::Lazy; +use vec_collections::AbstractVecSet; use crate::Error; @@ -17,12 +18,16 @@ use crate::Error; // and a `Slab`. This might be very useful if K is something // heavy such as a `String`. 
pub type Color = u64; -pub type Idx = u64; -type IdxTracker = (vec_collections::VecSet<[Idx; 4]>, u64); +pub type Idx = u32; +type IdxTracker = (vec_collections::VecSet<[Idx; 8]>, u64); type ColorToIdx = HashMap>; #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] #[repr(u32)] pub enum HashFunctions { murmur64_DNA = 1, @@ -73,7 +78,7 @@ impl TryFrom<&str> for HashFunctions { "dayhoff" => Ok(HashFunctions::murmur64_dayhoff), "hp" => Ok(HashFunctions::murmur64_hp), "protein" => Ok(HashFunctions::murmur64_protein), - _ => unimplemented!(), + v => unimplemented!("{v}"), } } } @@ -507,16 +512,16 @@ mod test { fn colors_update() { let mut colors = Colors::new(); - let color = colors.update(None, &[1_u64]).unwrap(); + let color = colors.update(None, &[1_u32]).unwrap(); assert_eq!(colors.len(), 1); dbg!("update"); - let new_color = colors.update(Some(color), &[1_u64]).unwrap(); + let new_color = colors.update(Some(color), &[1_u32]).unwrap(); assert_eq!(colors.len(), 1); assert_eq!(color, new_color); dbg!("upgrade"); - let new_color = colors.update(Some(color), &[2_u64]).unwrap(); + let new_color = colors.update(Some(color), &[2_u32]).unwrap(); assert_eq!(colors.len(), 2); assert_ne!(color, new_color); } @@ -525,20 +530,20 @@ mod test { fn colors_retain() { let mut colors = Colors::new(); - let color1 = colors.update(None, &[1_u64]).unwrap(); + let color1 = colors.update(None, &[1_u32]).unwrap(); assert_eq!(colors.len(), 1); // used_colors: // color1: 1 dbg!("update"); - let same_color = colors.update(Some(color1), &[1_u64]).unwrap(); + let same_color = colors.update(Some(color1), &[1_u32]).unwrap(); assert_eq!(colors.len(), 1); assert_eq!(color1, same_color); // used_colors: // color1: 2 dbg!("upgrade"); - let color2 = colors.update(Some(color1), &[2_u64]).unwrap(); + let color2 = colors.update(Some(color1), &[2_u32]).unwrap(); assert_eq!(colors.len(), 
2); assert_ne!(color1, color2); // used_colors: @@ -546,7 +551,7 @@ mod test { // color2: 1 dbg!("update"); - let same_color = colors.update(Some(color2), &[2_u64]).unwrap(); + let same_color = colors.update(Some(color2), &[2_u32]).unwrap(); assert_eq!(colors.len(), 2); assert_eq!(color2, same_color); // used_colors: @@ -554,7 +559,7 @@ mod test { // color1: 2 dbg!("upgrade"); - let color3 = colors.update(Some(color1), &[3_u64]).unwrap(); + let color3 = colors.update(Some(color1), &[3_u32]).unwrap(); assert_ne!(color1, color3); assert_ne!(color2, color3); // used_colors: diff --git a/src/core/src/errors.rs b/src/core/src/errors.rs index cd4ddcfaf1..c43b104bee 100644 --- a/src/core/src/errors.rs +++ b/src/core/src/errors.rs @@ -63,9 +63,17 @@ pub enum SourmashError { #[error(transparent)] IOError(#[from] std::io::Error), + #[error(transparent)] + CsvError(#[from] csv::Error), + #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] #[error(transparent)] Panic(#[from] crate::ffi::utils::Panic), + + #[cfg(not(target_arch = "wasm32"))] + #[cfg(feature = "branchwater")] + #[error(transparent)] + RocksDBError(#[from] rocksdb::Error), } #[derive(Debug, Error)] @@ -108,6 +116,8 @@ pub enum SourmashErrorCode { ParseInt = 100_003, SerdeError = 100_004, NifflerError = 100_005, + CsvError = 100_006, + RocksDBError = 100_007, } #[cfg(not(all(target_arch = "wasm32", target_os = "unknown")))] @@ -137,6 +147,11 @@ impl SourmashErrorCode { SourmashError::IOError { .. } => SourmashErrorCode::Io, SourmashError::NifflerError { .. } => SourmashErrorCode::NifflerError, SourmashError::Utf8Error { .. } => SourmashErrorCode::Utf8Error, + SourmashError::CsvError { .. } => SourmashErrorCode::CsvError, + + #[cfg(not(target_arch = "wasm32"))] + #[cfg(feature = "branchwater")] + SourmashError::RocksDBError { .. 
} => SourmashErrorCode::RocksDBError, } } } diff --git a/src/core/src/ffi/index/mod.rs b/src/core/src/ffi/index/mod.rs index 932a97b222..a2f03f222f 100644 --- a/src/core/src/ffi/index/mod.rs +++ b/src/core/src/ffi/index/mod.rs @@ -1,3 +1,5 @@ +#[cfg(not(target_arch = "wasm32"))] +#[cfg(feature = "branchwater")] pub mod revindex; use crate::signature::Signature; diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index 3597121bce..ab24e2ac3e 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -1,16 +1,16 @@ -use std::path::PathBuf; use std::slice; -use crate::index::revindex::RevIndex; -use crate::index::Index; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; -use crate::sketch::Sketch; +use camino::Utf8PathBuf as PathBuf; use crate::ffi::index::SourmashSearchResult; use crate::ffi::minhash::SourmashKmerMinHash; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::index::revindex::mem_revindex::RevIndex; +use crate::index::Index; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::Sketch; pub struct SourmashRevIndex; @@ -64,7 +64,7 @@ unsafe fn revindex_new_with_paths( threshold, queries, keep_sigs, - ); + )?; Ok(SourmashRevIndex::from_rust(revindex)) } } @@ -105,7 +105,7 @@ unsafe fn revindex_new_with_sigs( .collect(); Some(queries_vec.as_ref()) }; - let revindex = RevIndex::new_with_sigs(search_sigs, &template, threshold, queries); + let revindex = RevIndex::new_with_sigs(search_sigs, &template, threshold, queries)?; Ok(SourmashRevIndex::from_rust(revindex)) } } diff --git a/src/core/src/ffi/storage.rs b/src/core/src/ffi/storage.rs index 86d3834201..7479e983e5 100644 --- a/src/core/src/ffi/storage.rs +++ b/src/core/src/ffi/storage.rs @@ -1,5 +1,6 @@ use std::os::raw::c_char; use std::slice; +use std::sync::Arc; use 
crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::prelude::*; @@ -8,7 +9,7 @@ use crate::storage::ZipStorage; pub struct SourmashZipStorage; impl ForeignObject for SourmashZipStorage { - type RustObject = ZipStorage; + type RustObject = Arc; } ffi_fn! { @@ -20,7 +21,7 @@ unsafe fn zipstorage_new(ptr: *const c_char, insize: usize) -> Result<*mut Sourm }; let zipstorage = ZipStorage::from_file(path)?; - Ok(SourmashZipStorage::from_rust(zipstorage)) + Ok(SourmashZipStorage::from_rust(Arc::new(zipstorage))) } } @@ -110,7 +111,7 @@ unsafe fn zipstorage_set_subdir( std::str::from_utf8(path)? }; - storage.set_subdir(path.to_string()); + (*Arc::get_mut(storage).unwrap()).set_subdir(path.to_string()); Ok(()) } } @@ -120,7 +121,7 @@ unsafe fn zipstorage_path(ptr: *const SourmashZipStorage) -> Result let storage = SourmashZipStorage::as_rust(ptr); if let Some(ref path) = storage.path() { - Ok(path.clone().into()) + Ok(path.clone().into_string().into()) } else { Ok("".into()) } diff --git a/src/core/src/from.rs b/src/core/src/from.rs index dfc384236e..7847714cfe 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -23,10 +23,8 @@ impl From for KmerMinHash { values.len() as u32, ); - let hash_with_abunds: Vec<(u64, u64)> = values - .iter() - .map(|x| (x.hash as u64, x.count as u64)) - .collect(); + let hash_with_abunds: Vec<(u64, u64)> = + values.iter().map(|x| (x.hash, x.count as u64)).collect(); new_mh .add_many_with_abund(&hash_with_abunds) @@ -68,7 +66,7 @@ mod test { let b_hashes = b.to_vec(); let s1: HashSet<_> = a.mins().into_iter().collect(); - let s2: HashSet<_> = b_hashes.iter().map(|x| x.hash as u64).collect(); + let s2: HashSet<_> = b_hashes.iter().map(|x| x.hash).collect(); let i1 = &s1 & &s2; assert!(i1.len() == a.size()); @@ -79,10 +77,9 @@ mod test { let smap: HashMap<_, _> = mins.iter().zip(abunds.iter()).collect(); println!("{:?}", smap); for item in b_hashes.iter() { - assert!(smap.contains_key(&(item.hash as u64))); + 
assert!(smap.contains_key(&{ item.hash })); assert!( - **smap.get(&(item.hash as u64)).unwrap() - == ((item.count + item.extra_count) as u64) + **smap.get(&{ item.hash }).unwrap() == ((item.count + item.extra_count) as u64) ); } } diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 78b2c6f1f5..ed12bbd745 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -1,185 +1,387 @@ -use std::fs::File; -use std::io::{BufReader, Read}; -use std::path::Path; -use std::path::PathBuf; - -use serde::{Deserialize, Serialize}; -use typed_builder::TypedBuilder; - -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; -use crate::prelude::*; -use crate::storage::{FSStorage, InnerStorage, Storage, StorageInfo}; -use crate::Error; - -#[derive(TypedBuilder)] -pub struct LinearIndex { - #[builder(default)] - storage: Option, - - #[builder(default)] - datasets: Vec>, -} +use std::collections::HashSet; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use camino::Utf8PathBuf as PathBuf; +use log::info; + +#[cfg(feature = "parallel")] +use rayon::prelude::*; -#[derive(Serialize, Deserialize)] -struct LinearInfo { - version: u32, - storage: StorageInfo, - leaves: Vec, +use crate::collection::CollectionSet; +use crate::encodings::Idx; +use crate::index::{GatherResult, Index, Selection, SigCounter}; +use crate::manifest::Manifest; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::Sketch; +use crate::storage::{InnerStorage, SigStore, Storage}; +use crate::Result; + +//#[derive(Serialize, Deserialize)] +pub struct LinearIndex { + collection: CollectionSet, + template: Sketch, } -impl<'a, L> Index<'a> for LinearIndex -where - L: Clone + Comparable + 'a, - SigStore: From, -{ - type Item = L; - //type SignatureIterator = std::slice::Iter<'a, Self::Item>; +impl LinearIndex { + pub fn from_collection(collection: CollectionSet) -> Self { + let sig = 
collection.sig_for_dataset(0).unwrap(); + let template = sig.sketches().swap_remove(0); + Self { + collection, + template, + } + } + + pub fn sig_for_dataset(&self, dataset_id: Idx) -> Result { + self.collection.sig_for_dataset(dataset_id) + } - fn insert(&mut self, node: L) -> Result<(), Error> { - self.datasets.push(node.into()); - Ok(()) + pub fn collection(&self) -> &CollectionSet { + &self.collection } - fn save>(&self, _path: P) -> Result<(), Error> { - /* - let file = File::create(path)?; - match serde_json::to_writer(file, &self) { - Ok(_) => Ok(()), - Err(_) => Err(SourmashError::SerdeError.into()), + pub fn template(&self) -> &Sketch { + &self.template + } + + pub fn location(&self) -> Option { + if let Some(_storage) = &self.storage() { + // storage.path() + unimplemented!() + } else { + None } - */ - unimplemented!() } - fn load>(_path: P) -> Result<(), Error> { - unimplemented!() + pub fn storage(&self) -> Option { + Some(self.collection.storage.clone()) } - fn signatures(&self) -> Vec { - self.datasets - .iter() - .map(|x| x.data.get().unwrap().clone()) - .collect() + pub fn select(mut self, selection: &Selection) -> Result { + let manifest = self.collection.manifest.select_to_manifest(selection)?; + self.collection.manifest = manifest; + + Ok(self) + /* + # if we have a manifest, run 'select' on the manifest. + manifest = self.manifest + traverse_yield_all = self.traverse_yield_all + + if manifest is not None: + manifest = manifest.select_to_manifest(**kwargs) + return ZipFileLinearIndex(self.storage, + selection_dict=None, + traverse_yield_all=traverse_yield_all, + manifest=manifest, + use_manifest=True) + else: + # no manifest? just pass along all the selection kwargs to + # the new ZipFileLinearIndex. + + assert manifest is None + if self.selection_dict: + # combine selects... 
+ d = dict(self.selection_dict) + for k, v in kwargs.items(): + if k in d: + if d[k] is not None and d[k] != v: + raise ValueError(f"incompatible select on '{k}'") + d[k] = v + kwargs = d + + return ZipFileLinearIndex(self.storage, + selection_dict=kwargs, + traverse_yield_all=traverse_yield_all, + manifest=None, + use_manifest=False) + */ } - fn signature_refs(&self) -> Vec<&Self::Item> { - self.datasets - .iter() - .map(|x| x.data.get().unwrap()) - .collect() + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + let processed_sigs = AtomicUsize::new(0); + + let search_sigs: Vec<_> = self + .collection + .manifest + .internal_locations() + .map(PathBuf::from) + .collect(); + + let template = self.template(); + + #[cfg(feature = "parallel")] + let sig_iter = search_sigs.par_iter(); + + #[cfg(not(feature = "parallel"))] + let sig_iter = search_sigs.iter(); + + let counters = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + let search_sig = if let Some(storage) = &self.storage() { + let sig_data = storage + .load(filename.as_str()) + .unwrap_or_else(|_| panic!("error loading {:?}", filename)); + + Signature::from_reader(sig_data.as_slice()) + } else { + Signature::from_path(filename) + } + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .swap_remove(0); + + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh); + }; + let search_mh = search_mh.expect("Couldn't find a compatible MinHash"); + + let (large_mh, small_mh) = if query.size() > search_mh.size() { + (query, search_mh) + } else { + (search_mh, query) + }; + + let (size, _) = small_mh + .intersection_size(large_mh) + .unwrap_or_else(|_| panic!("error computing intersection for {:?}", filename)); + + if size == 0 { + None + } else { + let mut counter: SigCounter = 
Default::default(); + counter[&(dataset_id as Idx)] += size as usize; + Some(counter) + } + }); + + let reduce_counters = |mut a: SigCounter, b: SigCounter| { + a.extend(&b); + a + }; + + #[cfg(feature = "parallel")] + let counter = counters.reduce(SigCounter::new, reduce_counters); + + #[cfg(not(feature = "parallel"))] + let counter = counters.fold(SigCounter::new(), reduce_counters); + + counter } - /* - fn iter_signatures(&'a self) -> Self::SignatureIterator { - self.datasets.iter() + pub fn search( + &self, + counter: SigCounter, + similarity: bool, + threshold: usize, + ) -> Result> { + let mut matches = vec![]; + if similarity { + unimplemented!("TODO: threshold correction") + } + + for (dataset_id, size) in counter.most_common() { + if size >= threshold { + matches.push( + self.collection.manifest[dataset_id as usize] + .internal_location() + .to_string(), + ); + } else { + break; + }; + } + Ok(matches) + } + + pub fn gather_round( + &self, + dataset_id: Idx, + match_size: usize, + query: &KmerMinHash, + round: usize, + ) -> Result { + let match_path = if self.collection.manifest.is_empty() { + "" + } else { + self.collection.manifest[dataset_id as usize] + .internal_location() + .as_str() + } + .into(); + let match_sig = self.collection.sig_for_dataset(dataset_id)?; + let result = self.stats_for_match(&match_sig, query, match_size, match_path, round)?; + Ok(result) } - */ -} -impl LinearIndex -where - L: ToWriter, - SigStore: ReadData, -{ - pub fn save_file>( - &mut self, - path: P, - storage: Option, - ) -> Result<(), Error> { - let ref_path = path.as_ref(); - let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); - if basename.ends_with(".sbt.json") { - basename = basename.replace(".sbt.json", ""); + fn stats_for_match( + &self, + match_sig: &Signature, + query: &KmerMinHash, + match_size: usize, + match_path: PathBuf, + gather_result_rank: usize, + ) -> Result { + let template = self.template(); + + let mut match_mh = None; + if 
let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(template) { + match_mh = Some(mh); } - let location = ref_path.parent().unwrap(); + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let filename = match_path.into_string(); + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize * match_size; + + let (intersect_orig, _) = match_mh.intersection_size(query)?; + let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - let storage = match storage { - Some(s) => s, - None => { - let subdir = format!(".linear.{}", basename); - InnerStorage::new(FSStorage::new(location.to_str().unwrap(), &subdir)) + let f_unique_to_query = intersect_orig as f64 / query.size() as f64; + let match_ = match_sig.clone(); + + // TODO: all of these + let f_unique_weighted = 0.; + let average_abund = 0; + let median_abund = 0; + let std_abund = 0; + let md5 = "".into(); + let f_match_orig = 0.; + let remaining_bp = 0; + + Ok(GatherResult { + intersect_bp, + f_orig_query, + f_match, + f_unique_to_query, + f_unique_weighted, + average_abund, + median_abund, + std_abund, + filename, + name, + md5, + match_, + f_match_orig, + unique_intersect_bp, + gather_result_rank, + remaining_bp, + }) + } + + pub fn gather( + &self, + mut counter: SigCounter, + threshold: usize, + query: &KmerMinHash, + ) -> std::result::Result, Box> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + let template = self.template(); + + while match_size > threshold && !counter.is_empty() { + let (dataset_id, size) = counter.most_common()[0]; + if threshold == 0 && size == 0 { + break; } - }; - let args = storage.args(); - let storage_info = StorageInfo { - backend: "FSStorage".into(), - args, - }; + match_size = if size >= threshold { + size + } else { + break; + }; - let info: LinearInfo = 
LinearInfo { - storage: storage_info, - version: 5, - leaves: self - .datasets - .iter_mut() - .map(|l| { - // Trigger data loading - let _: &L = (*l).data().unwrap(); - - // set storage to new one - l.storage = Some(storage.clone()); - - let filename = (*l).save(&l.filename).unwrap(); - - DatasetInfo { - filename, - name: l.name.clone(), - metadata: l.metadata.clone(), - } - }) - .collect(), - }; + let result = self.gather_round(dataset_id, match_size, query, matches.len())?; - let file = File::create(path)?; - serde_json::to_writer(file, &info)?; + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: maybe par_iter? + let mut to_remove: HashSet = Default::default(); + to_remove.insert(dataset_id); + + for (dataset, value) in counter.iter_mut() { + let dataset_sig = self.collection.sig_for_dataset(*dataset)?; + let mut match_mh = None; + if let Some(Sketch::MinHash(mh)) = dataset_sig.select_sketch(template) { + match_mh = Some(mh); + } + let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); + + let (intersection, _) = query.intersection_size(match_mh)?; + if intersection as usize > *value { + to_remove.insert(*dataset); + } else { + *value -= intersection as usize; + }; + } + to_remove.iter().for_each(|dataset_id| { + counter.remove(dataset_id); + }); + matches.push(result); + } + Ok(matches) + } + pub fn manifest(&self) -> Manifest { + self.collection.manifest.clone() + } + + pub fn set_manifest(&mut self, new_manifest: Manifest) -> Result<()> { + self.collection.manifest = new_manifest; Ok(()) } - pub fn from_path>(path: P) -> Result, Error> { - let file = File::open(&path)?; - let mut reader = BufReader::new(file); - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - basepath.canonicalize()?; - - let linear = LinearIndex::::from_reader(&mut 
reader, basepath.parent().unwrap())?; - Ok(linear) - } - - pub fn from_reader(rdr: R, path: P) -> Result, Error> - where - R: Read, - P: AsRef, - { - // TODO: check https://serde.rs/enum-representations.html for a - // solution for loading v4 and v5 - let linear: LinearInfo = serde_json::from_reader(rdr)?; - - // TODO: support other storages - let mut st: FSStorage = (&linear.storage.args).into(); - st.set_base(path.as_ref().to_str().unwrap()); - let storage = InnerStorage::new(st); - - Ok(LinearIndex { - storage: Some(storage.clone()), - datasets: linear - .leaves - .into_iter() - .map(|l| { - let mut v: SigStore = l.into(); - v.storage = Some(storage.clone()); - v - }) - .collect(), + pub fn signatures_iter(&self) -> impl Iterator + '_ { + // FIXME temp solution, must find better one! + (0..self.collection.manifest.len()).map(move |dataset_id| { + self.collection + .sig_for_dataset(dataset_id as Idx) + .expect("error loading sig") }) } +} - pub fn storage(&self) -> Option { - self.storage.clone() +impl<'a> Index<'a> for LinearIndex { + type Item = SigStore; + + fn insert(&mut self, _node: Self::Item) -> Result<()> { + unimplemented!() + } + + fn save>(&self, _path: P) -> Result<()> { + unimplemented!() + } + + fn load>(_path: P) -> Result<()> { + unimplemented!() + } + + fn len(&self) -> usize { + self.collection.manifest.len() + } + + fn signatures(&self) -> Vec { + self.collection() + .manifest + .internal_locations() + .map(PathBuf::from) + .map(|p| { + self.collection() + .storage + .load_sig(p.as_str()) + .unwrap_or_else(|_| panic!("Error processing {:?}", p)) + }) + .collect() + } + + fn signature_refs(&self) -> Vec<&Self::Item> { + unimplemented!() } } diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 832fdf9091..ad65bf9d08 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -4,35 +4,177 @@ //! Some indices also support containment searches. 
pub mod linear; + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(feature = "branchwater")] pub mod revindex; pub mod search; -use std::ops::Deref; use std::path::Path; -use once_cell::sync::OnceCell; +use getset::{CopyGetters, Getters, Setters}; + use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::errors::ReadDataError; +use crate::encodings::{HashFunctions, Idx}; use crate::index::search::{search_minhashes, search_minhashes_containment}; +use crate::manifest::Record; +use crate::picklist::Picklist; use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::Sketch; -use crate::storage::{InnerStorage, Storage}; -use crate::Error; +use crate::Result; + +#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] +pub struct GatherResult { + #[getset(get_copy = "pub")] + intersect_bp: usize, + + #[getset(get_copy = "pub")] + f_orig_query: f64, + + #[getset(get_copy = "pub")] + f_match: f64, + + f_unique_to_query: f64, + f_unique_weighted: f64, + average_abund: usize, + median_abund: usize, + std_abund: usize, + + #[getset(get = "pub")] + filename: String, + + #[getset(get = "pub")] + name: String, + + #[getset(get = "pub")] + md5: String, + + #[serde(skip)] + match_: Signature, + + f_match_orig: f64, + unique_intersect_bp: usize, + gather_result_rank: usize, + remaining_bp: usize, +} + +impl GatherResult { + pub fn get_match(&self) -> Signature { + self.match_.clone() + } +} + +#[derive(Default, Debug)] +pub struct Selection { + ksize: Option, + abund: Option, + num: Option, + scaled: Option, + containment: Option, + moltype: Option, + picklist: Option, +} + +type SigCounter = counter::Counter; + +impl Selection { + pub fn ksize(&self) -> Option { + self.ksize + } + + pub fn set_ksize(&mut self, ksize: u32) { + self.ksize = Some(ksize); + } + + pub fn abund(&self) -> Option { + self.abund + } + + pub fn set_abund(&mut self, value: bool) { + self.abund = Some(value); + } + + pub fn 
num(&self) -> Option { + self.num + } + + pub fn set_num(&mut self, num: u32) { + self.num = Some(num); + } + + pub fn scaled(&self) -> Option { + self.scaled + } + + pub fn set_scaled(&mut self, scaled: u32) { + self.scaled = Some(scaled); + } + + pub fn containment(&self) -> Option { + self.containment + } + + pub fn set_containment(&mut self, containment: bool) { + self.containment = Some(containment); + } + + pub fn moltype(&self) -> Option { + self.moltype + } + + pub fn set_moltype(&mut self, value: HashFunctions) { + self.moltype = Some(value); + } + + pub fn picklist(&self) -> Option { + self.picklist.clone() + } + + pub fn set_picklist(&mut self, value: Picklist) { + self.picklist = Some(value); + } + + pub fn from_template(template: &Sketch) -> Self { + let (num, scaled) = match template { + Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + _ => (None, None), + }; + + Selection { + ksize: Some(template.ksize() as u32), + abund: None, + containment: None, + //moltype: Some(template.hash_function()), + moltype: None, + num, + picklist: None, + scaled, + } + } + + pub fn from_record(row: &Record) -> Result { + Ok(Self { + ksize: Some(*row.ksize()), + abund: Some(*row.with_abundance()), + moltype: Some(row.moltype()), + num: None, + scaled: None, + containment: None, + picklist: None, + }) + } +} pub trait Index<'a> { type Item: Comparable; //type SignatureIterator: Iterator; - fn find( - &self, - search_fn: F, - sig: &Self::Item, - threshold: f64, - ) -> Result, Error> + fn find(&self, search_fn: F, sig: &Self::Item, threshold: f64) -> Result> where F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, { @@ -54,7 +196,7 @@ pub trait Index<'a> { sig: &Self::Item, threshold: f64, containment: bool, - ) -> Result, Error> { + ) -> Result> { if containment { self.find(search_minhashes_containment, sig, threshold) } else { @@ -62,11 +204,11 @@ pub trait Index<'a> { } } - //fn 
gather(&self, sig: &Self::Item, threshold: f64) -> Result, Error>; + //fn gather(&self, sig: &Self::Item, threshold: f64) -> Result>; - fn insert(&mut self, node: Self::Item) -> Result<(), Error>; + fn insert(&mut self, node: Self::Item) -> Result<()>; - fn batch_insert(&mut self, nodes: Vec) -> Result<(), Error> { + fn batch_insert(&mut self, nodes: Vec) -> Result<()> { for node in nodes { self.insert(node)?; } @@ -74,9 +216,9 @@ pub trait Index<'a> { Ok(()) } - fn save>(&self, path: P) -> Result<(), Error>; + fn save>(&self, path: P) -> Result<()>; - fn load>(path: P) -> Result<(), Error>; + fn load>(path: P) -> Result<()>; fn signatures(&self) -> Vec; @@ -107,232 +249,3 @@ where (*self).containment(other) } } - -#[derive(Serialize, Deserialize, Debug)] -pub struct DatasetInfo { - pub filename: String, - pub name: String, - pub metadata: String, -} - -#[derive(TypedBuilder, Default, Clone)] -pub struct SigStore { - #[builder(setter(into))] - filename: String, - - #[builder(setter(into))] - name: String, - - #[builder(setter(into))] - metadata: String, - - storage: Option, - - #[builder(setter(into), default)] - data: OnceCell, -} - -impl SigStore { - pub fn name(&self) -> String { - self.name.clone() - } -} - -impl std::fmt::Debug for SigStore { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "SigStore [filename: {}, name: {}, metadata: {}]", - self.filename, self.name, self.metadata - ) - } -} - -impl ReadData for SigStore { - fn data(&self) -> Result<&Signature, Error> { - if let Some(sig) = self.data.get() { - Ok(sig) - } else if let Some(storage) = &self.storage { - let sig = self.data.get_or_init(|| { - let raw = storage.load(&self.filename).unwrap(); - let sigs: Result, _> = serde_json::from_reader(&mut &raw[..]); - if let Ok(sigs) = sigs { - // TODO: select the right sig? 
- sigs[0].to_owned() - } else { - let sig: Signature = serde_json::from_reader(&mut &raw[..]).unwrap(); - sig - } - }); - - Ok(sig) - } else { - Err(ReadDataError::LoadError.into()) - } - } -} - -impl SigStore -where - T: ToWriter, -{ - pub fn save(&self, path: &str) -> Result { - if let Some(storage) = &self.storage { - if let Some(data) = self.data.get() { - let mut buffer = Vec::new(); - data.to_writer(&mut buffer)?; - - Ok(storage.save(path, &buffer)?) - } else { - unimplemented!() - } - } else { - unimplemented!() - } - } -} - -impl SigStore { - pub fn count_common(&self, other: &SigStore) -> u64 { - let ng: &Signature = self.data().unwrap(); - let ong: &Signature = other.data().unwrap(); - - // TODO: select the right signatures... - // TODO: better matching here, what if it is not a mh? - if let Sketch::MinHash(mh) = &ng.signatures[0] { - if let Sketch::MinHash(omh) = &ong.signatures[0] { - return mh.count_common(omh, false).unwrap(); - } - } - unimplemented!(); - } - - pub fn mins(&self) -> Vec { - let ng: &Signature = self.data().unwrap(); - - // TODO: select the right signatures... - // TODO: better matching here, what if it is not a mh? - if let Sketch::MinHash(mh) = &ng.signatures[0] { - mh.mins() - } else { - unimplemented!() - } - } -} - -impl From> for Signature { - fn from(other: SigStore) -> Signature { - other.data.get().unwrap().to_owned() - } -} - -impl Deref for SigStore { - type Target = Signature; - - fn deref(&self) -> &Signature { - self.data.get().unwrap() - } -} - -impl From for SigStore { - fn from(other: Signature) -> SigStore { - let name = other.name(); - let filename = other.filename(); - - SigStore::builder() - .name(name) - .filename(filename) - .data(other) - .metadata("") - .storage(None) - .build() - } -} - -impl Comparable> for SigStore { - fn similarity(&self, other: &SigStore) -> f64 { - let ng: &Signature = self.data().unwrap(); - let ong: &Signature = other.data().unwrap(); - - // TODO: select the right signatures... 
- // TODO: better matching here, what if it is not a mh? - if let Sketch::MinHash(mh) = &ng.signatures[0] { - if let Sketch::MinHash(omh) = &ong.signatures[0] { - return mh.similarity(omh, true, false).unwrap(); - } - } - - /* FIXME: bring back after boomphf changes - if let Sketch::UKHS(mh) = &ng.signatures[0] { - if let Sketch::UKHS(omh) = &ong.signatures[0] { - return 1. - mh.distance(&omh); - } - } - */ - - unimplemented!() - } - - fn containment(&self, other: &SigStore) -> f64 { - let ng: &Signature = self.data().unwrap(); - let ong: &Signature = other.data().unwrap(); - - // TODO: select the right signatures... - // TODO: better matching here, what if it is not a mh? - if let Sketch::MinHash(mh) = &ng.signatures[0] { - if let Sketch::MinHash(omh) = &ong.signatures[0] { - let common = mh.count_common(omh, false).unwrap(); - let size = mh.size(); - return common as f64 / size as f64; - } - } - unimplemented!() - } -} - -impl Comparable for Signature { - fn similarity(&self, other: &Signature) -> f64 { - // TODO: select the right signatures... - // TODO: better matching here, what if it is not a mh? - if let Sketch::MinHash(mh) = &self.signatures[0] { - if let Sketch::MinHash(omh) = &other.signatures[0] { - return mh.similarity(omh, true, false).unwrap(); - } - } - - /* FIXME: bring back after boomphf changes - if let Sketch::UKHS(mh) = &self.signatures[0] { - if let Sketch::UKHS(omh) = &other.signatures[0] { - return 1. - mh.distance(&omh); - } - } - */ - - unimplemented!() - } - - fn containment(&self, other: &Signature) -> f64 { - // TODO: select the right signatures... - // TODO: better matching here, what if it is not a mh? 
- if let Sketch::MinHash(mh) = &self.signatures[0] { - if let Sketch::MinHash(omh) = &other.signatures[0] { - let common = mh.count_common(omh, false).unwrap(); - let size = mh.size(); - return common as f64 / size as f64; - } - } - unimplemented!() - } -} - -impl From for SigStore { - fn from(other: DatasetInfo) -> SigStore { - SigStore { - filename: other.filename, - name: other.name, - metadata: other.metadata, - storage: None, - data: OnceCell::new(), - } - } -} diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs new file mode 100644 index 0000000000..60e33cf1d4 --- /dev/null +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -0,0 +1,546 @@ +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use byteorder::{LittleEndian, WriteBytesExt}; +use log::{info, trace}; +use rayon::prelude::*; +use rocksdb::{ColumnFamilyDescriptor, MergeOperands, Options}; + +use crate::collection::{Collection, CollectionSet}; +use crate::encodings::{Color, Idx}; +use crate::index::revindex::prepare_query; +use crate::index::revindex::{ + self as module, stats_for_cf, Datasets, HashToColor, QueryColors, RevIndexOps, DB, HASHES, + MANIFEST, METADATA, STORAGE_SPEC, +}; +use crate::index::{GatherResult, SigCounter}; +use crate::manifest::Manifest; +use crate::signature::SigsTrait; +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::Sketch; +use crate::storage::{InnerStorage, Storage}; +use crate::Result; + +fn compute_color(idxs: &Datasets) -> Color { + let s = BuildHasherDefault::::default(); + let mut hasher = s.build_hasher(); + /* + // TODO: remove this... 
+ let mut sorted: Vec<_> = idxs.iter().collect(); + sorted.sort(); + */ + idxs.hash(&mut hasher); + hasher.finish() +} + +#[derive(Clone)] +pub struct RevIndex { + db: Arc, + collection: Arc, +} + +fn merge_datasets( + _: &[u8], + existing_val: Option<&[u8]>, + operands: &MergeOperands, +) -> Option> { + let mut datasets = existing_val + .and_then(Datasets::from_slice) + .unwrap_or_default(); + + for op in operands { + let new_vals = Datasets::from_slice(op).unwrap(); + datasets.union(new_vals); + } + // TODO: optimization! if nothing changed, skip as_bytes() + datasets.as_bytes() +} + +/* TODO: need the repair_cf variant, not available in rocksdb-rust yet +pub fn repair(path: &Path) { + let opts = db_options(); + + DB::repair(&opts, path).unwrap() +} +*/ + +impl RevIndex { + pub fn create(path: &Path, collection: CollectionSet) -> module::RevIndex { + let mut opts = module::RevIndex::db_options(); + opts.create_if_missing(true); + opts.create_missing_column_families(true); + opts.prepare_for_bulk_load(); + + // prepare column family descriptors + let cfs = cf_descriptors(); + + let db = Arc::new(DB::open_cf_descriptors(&opts, path, cfs).unwrap()); + + let processed_sigs = AtomicUsize::new(0); + + let index = Self { + db, + collection: Arc::new(collection), + }; + + index + .collection + .manifest + .par_iter() + .enumerate() + .for_each(|(dataset_id, _)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + index.map_hashes_colors(dataset_id as Idx); + }); + + index.save_collection().expect("Error saving collection"); + + info!("Compact SSTs"); + index.compact(); + info!("Processed {} reference sigs", processed_sigs.into_inner()); + + module::RevIndex::Plain(index) + } + + pub fn open>(path: P, read_only: bool) -> Result { + let mut opts = module::RevIndex::db_options(); + if !read_only { + opts.prepare_for_bulk_load(); + } + + // prepare column family descriptors + let cfs = 
cf_descriptors(); + + let db = if read_only { + Arc::new(DB::open_cf_descriptors_read_only( + &opts, + path.as_ref(), + cfs, + false, + )?) + } else { + Arc::new(DB::open_cf_descriptors(&opts, path.as_ref(), cfs)?) + }; + + let collection = Arc::new(Self::load_collection_from_rocksdb(db.clone())?); + + Ok(module::RevIndex::Plain(Self { db, collection })) + } + + fn load_collection_from_rocksdb(db: Arc) -> Result { + let cf_metadata = db.cf_handle(METADATA).unwrap(); + + let rdr = db.get_cf(&cf_metadata, MANIFEST)?.unwrap(); + let manifest = Manifest::from_reader(&rdr[..])?; + + let spec = String::from_utf8(db.get_cf(&cf_metadata, STORAGE_SPEC)?.unwrap()) + .expect("invalid utf-8"); + + let storage = if spec == "rocksdb://" { + todo!("init storage from db") + } else { + InnerStorage::from_spec(spec)? + }; + + Collection { manifest, storage }.try_into() + } + + fn save_collection(&self) -> Result<()> { + let cf_metadata = self.db.cf_handle(METADATA).unwrap(); + + // write manifest + let mut wtr = vec![]; + { + self.collection.manifest.to_writer(&mut wtr)?; + } + self.db.put_cf(&cf_metadata, MANIFEST, &wtr[..])?; + + // write storage spec + let spec = self.collection.storage.spec(); + + // TODO: check if spec is memstorage, would probably have to + // save into rocksdb in that case! 
+ + self.db.put_cf(&cf_metadata, STORAGE_SPEC, spec)?; + + Ok(()) + } + + fn map_hashes_colors(&self, dataset_id: Idx) { + let search_sig = self + .collection + .sig_for_dataset(dataset_id) + .expect("Couldn't find a compatible Signature"); + let search_mh = &search_sig.sketches()[0]; + + let colors = Datasets::new(&[dataset_id]).as_bytes().unwrap(); + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + let hashes = match search_mh { + Sketch::MinHash(mh) => mh.mins(), + Sketch::LargeMinHash(mh) => mh.mins(), + _ => unimplemented!(), + }; + + let mut hash_bytes = [0u8; 8]; + for hash in hashes { + (&mut hash_bytes[..]) + .write_u64::(hash) + .expect("error writing bytes"); + self.db + .merge_cf(&cf_hashes, &hash_bytes[..], colors.as_slice()) + .expect("error merging"); + } + // TODO: save collection to DB? + } +} + +impl RevIndexOps for RevIndex { + fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + info!("Collecting hashes"); + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + info!("Multi get"); + self.db + .multi_get_cf(hashes_iter) + .into_iter() + .filter_map(|r| r.ok().unwrap_or(None)) + .flat_map(|raw_datasets| { + let new_vals = Datasets::from_slice(&raw_datasets).unwrap(); + new_vals.into_iter() + }) + .collect() + } + + fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor) { + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + let hashes_iter = query.iter_mins().map(|hash| { + let mut v = vec![0_u8; 8]; + (&mut v[..]) + .write_u64::(*hash) + .expect("error writing bytes"); + (&cf_hashes, v) + }); + + /* + build a HashToColors for query, + and a QueryColors (Color -> Datasets) mapping. + Loading Datasets from rocksdb for every hash takes too long. 
+ */ + let mut query_colors: QueryColors = Default::default(); + let mut counter: SigCounter = Default::default(); + + info!("Building hash_to_colors and query_colors"); + let hash_to_colors = query + .iter_mins() + .zip(self.db.multi_get_cf(hashes_iter)) + .filter_map(|(k, r)| { + let raw = r.ok().unwrap_or(None); + raw.map(|raw| { + let new_vals = Datasets::from_slice(&raw).unwrap(); + let color = compute_color(&new_vals); + query_colors + .entry(color) + .or_insert_with(|| new_vals.clone()); + counter.update(new_vals); + (*k, color) + }) + }) + .collect(); + + (counter, query_colors, hash_to_colors) + } + + fn matches_from_counter(&self, counter: SigCounter, threshold: usize) -> Vec<(String, usize)> { + info!("get matches from counter"); + counter + .most_common() + .into_iter() + .filter_map(|(dataset_id, size)| { + if size >= threshold { + let row = &self.collection.manifest[dataset_id as usize]; + Some((row.name().into(), size)) + } else { + None + } + }) + .collect() + } + + fn gather( + &self, + mut counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + orig_query: &KmerMinHash, + template: &Sketch, + ) -> Result> { + let mut match_size = usize::max_value(); + let mut matches = vec![]; + //let mut query: KmerMinHashBTree = orig_query.clone().into(); + + while match_size > threshold && !counter.is_empty() { + trace!("counter len: {}", counter.len()); + trace!("match size: {}", match_size); + + let (dataset_id, size) = counter.k_most_common_ordered(1)[0]; + match_size = if size >= threshold { size } else { break }; + + let match_sig = self.collection.sig_for_dataset(dataset_id)?; + + let match_mh = + prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash"); + + // Calculate stats + let f_orig_query = match_size as f64 / orig_query.size() as f64; + let f_match = match_size as f64 / match_mh.size() as f64; + let name = match_sig.name(); + let unique_intersect_bp = match_mh.scaled() as usize 
* match_size; + let gather_result_rank = matches.len(); + + let (intersect_orig, _) = match_mh.intersection_size(orig_query)?; + let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; + + let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64; + let match_ = match_sig.clone(); + let md5 = match_sig.md5sum(); + + // TODO: all of these + let filename = "".into(); + let f_unique_weighted = 0.; + let average_abund = 0; + let median_abund = 0; + let std_abund = 0; + let f_match_orig = 0.; + let remaining_bp = 0; + + let result = GatherResult::builder() + .intersect_bp(intersect_bp) + .f_orig_query(f_orig_query) + .f_match(f_match) + .f_unique_to_query(f_unique_to_query) + .f_unique_weighted(f_unique_weighted) + .average_abund(average_abund) + .median_abund(median_abund) + .std_abund(std_abund) + .filename(filename) + .name(name) + .md5(md5) + .match_(match_.into()) + .f_match_orig(f_match_orig) + .unique_intersect_bp(unique_intersect_bp) + .gather_result_rank(gather_result_rank) + .remaining_bp(remaining_bp) + .build(); + matches.push(result); + + trace!("Preparing counter for next round"); + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + // TODO: not used at the moment, so just skip. + //query.remove_many(match_mh.to_vec().as_slice())?; + + // TODO: Use HashesToColors here instead. If not initialized, + // build it. + match_mh + .iter_mins() + .filter_map(|hash| hash_to_color.get(hash)) + .flat_map(|color| { + // TODO: remove this clone + query_colors.get(color).unwrap().clone().into_iter() + }) + .for_each(|dataset| { + // TODO: collect the flat_map into a Counter, and remove more + // than one at a time... 
+ counter.entry(dataset).and_modify(|e| { + if *e > 0 { + *e -= 1 + } + }); + }); + + counter.remove(&dataset_id); + } + Ok(matches) + } + + fn update( + &self, + _index_sigs: Vec, + _template: &Sketch, + _threshold: f64, + _save_paths: bool, + ) { + todo!() + /* + use byteorder::ReadBytesExt; + + if !save_paths { + todo!("only supports with save_paths=True for now"); + } + + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + + info!("Verifying existing sigs"); + // verify data match up to this point + let mut max_dataset_id = 0; + let to_skip = iter + .map(|result| { + let (key, value) = result.unwrap(); + let current_dataset_id = (&key[..]).read_u64::().unwrap(); + + let filename = &index_sigs[current_dataset_id as usize]; + let sig_data = SignatureData::from_slice(&value).unwrap(); + match sig_data { + SignatureData::External(sig) => { + assert_eq!(sig, filename.as_os_str().to_str().unwrap().to_string()) + } + SignatureData::Empty => (), + SignatureData::Internal(_) => { + todo!("only supports with save_paths=True for now") + } + }; + max_dataset_id = max_dataset_id.max(current_dataset_id); + }) + .count(); + + max_dataset_id += 1; + assert_eq!(max_dataset_id as usize, to_skip); + + // process the remainder + let processed_sigs = AtomicUsize::new(0); + + index_sigs + .par_iter() + .skip(to_skip) + .enumerate() + .for_each(|(i, filename)| { + let dataset_id = i + to_skip; + + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } + + self.map_hashes_colors(dataset_id as Idx); + }); + + info!( + "Processed additional {} reference sigs", + processed_sigs.into_inner() + ); + */ + } + + fn check(&self, quick: bool) { + stats_for_cf(self.db.clone(), HASHES, true, quick); + } + + fn compact(&self) { + for cf_name in [HASHES, METADATA] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.compact_range_cf(&cf, None::<&[u8]>, 
None::<&[u8]>) + } + } + + fn flush(&self) -> Result<()> { + self.db.flush_wal(true)?; + + for cf_name in [HASHES, METADATA] { + let cf = self.db.cf_handle(cf_name).unwrap(); + self.db.flush_cf(&cf)?; + } + + Ok(()) + } + + fn convert(&self, _output_db: module::RevIndex) -> Result<()> { + todo!() + /* + if let RevIndex::Color(db) = output_db { + let other_db = db.db; + + let cf_hashes = self.db.cf_handle(HASHES).unwrap(); + + info!("start converting colors"); + let mut color_bytes = [0u8; 8]; + let iter = self + .db + .iterator_cf(&cf_hashes, rocksdb::IteratorMode::Start); + for (key, value) in iter { + let datasets = Datasets::from_slice(&value).unwrap(); + let new_idx: Vec<_> = datasets.into_iter().collect(); + let new_color = Colors::update(other_db.clone(), None, new_idx.as_slice()).unwrap(); + + (&mut color_bytes[..]) + .write_u64::(new_color) + .expect("error writing bytes"); + other_db + .put_cf(&cf_hashes, &key[..], &color_bytes[..]) + .unwrap(); + } + info!("finished converting colors"); + + info!("copying sigs to output"); + let cf_sigs = self.db.cf_handle(SIGS).unwrap(); + let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); + for (key, value) in iter { + other_db.put_cf(&cf_sigs, &key[..], &value[..]).unwrap(); + } + info!("finished copying sigs to output"); + + Ok(()) + } else { + todo!() + } + */ + } +} + +fn cf_descriptors() -> Vec { + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + cfopts.set_merge_operator_associative("datasets operator", merge_datasets); + cfopts.set_min_write_buffer_number_to_merge(10); + + // Updated default from + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options + cfopts.set_level_compaction_dynamic_level_bytes(true); + + let cf_hashes = ColumnFamilyDescriptor::new(HASHES, cfopts); + + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + // Updated default + 
cfopts.set_level_compaction_dynamic_level_bytes(true); + //cfopts.set_merge_operator_associative("colors operator", merge_colors); + + let cf_metadata = ColumnFamilyDescriptor::new(METADATA, cfopts); + + let mut cfopts = Options::default(); + cfopts.set_max_write_buffer_number(16); + // Updated default + cfopts.set_level_compaction_dynamic_level_bytes(true); + //cfopts.set_merge_operator_associative("colors operator", merge_colors); + + vec![cf_hashes, cf_metadata] +} diff --git a/src/core/src/index/revindex.rs b/src/core/src/index/revindex/mem_revindex.rs similarity index 61% rename from src/core/src/index/revindex.rs rename to src/core/src/index/revindex/mem_revindex.rs index 0a1fc25d18..113452ed1b 100644 --- a/src/core/src/index/revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -1,8 +1,8 @@ use std::collections::{HashMap, HashSet}; -use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicUsize, Ordering}; -use getset::{CopyGetters, Getters, Setters}; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use log::{debug, info}; use nohash_hasher::BuildNoHashHasher; use serde::{Deserialize, Serialize}; @@ -10,15 +10,25 @@ use serde::{Deserialize, Serialize}; #[cfg(feature = "parallel")] use rayon::prelude::*; +use crate::collection::Collection; use crate::encodings::{Color, Colors, Idx}; -use crate::index::Index; +use crate::index::linear::LinearIndex; +use crate::index::{GatherResult, Index, Selection, SigCounter}; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; -use crate::Error; +use crate::storage::Storage; use crate::HashIntoType; +use crate::Result; -type SigCounter = counter::Counter; +// Use rkyv for serialization? 
+// https://davidkoloski.me/rkyv/ +//#[derive(Serialize, Deserialize)] +pub struct RevIndex { + linear: LinearIndex, + hash_to_color: HashToColor, + colors: Colors, +} #[derive(Serialize, Deserialize)] struct HashToColor(HashMap>); @@ -95,28 +105,78 @@ impl HashToColor { } } -// Use rkyv for serialization? -// https://davidkoloski.me/rkyv/ -#[derive(Serialize, Deserialize)] -pub struct RevIndex { - hash_to_color: HashToColor, +impl LinearIndex { + fn index( + self, + threshold: usize, + merged_query: Option, + queries: Option<&[KmerMinHash]>, + ) -> RevIndex { + let processed_sigs = AtomicUsize::new(0); + + let search_sigs: Vec<_> = self + .collection() + .manifest + .internal_locations() + .map(PathBuf::from) + .collect(); + + #[cfg(feature = "parallel")] + let sig_iter = search_sigs.par_iter(); - sig_files: Vec, + #[cfg(not(feature = "parallel"))] + let sig_iter = search_sigs.iter(); - #[serde(skip)] - ref_sigs: Option>, + let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } - template: Sketch, - colors: Colors, - //#[serde(skip)] - //storage: Option, + let search_sig = self + .collection() + .storage + .load_sig(filename.as_str()) + .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .into(); + + RevIndex::map_hashes_colors( + dataset_id, + &search_sig, + queries, + &merged_query, + threshold, + self.template(), + ) + }); + + #[cfg(feature = "parallel")] + let (hash_to_color, colors) = filtered_sigs.reduce( + || (HashToColor::new(), Colors::default()), + HashToColor::reduce_hashes_colors, + ); + + #[cfg(not(feature = "parallel"))] + let (hash_to_color, colors) = filtered_sigs.fold( + (HashToColor::new(), Colors::default()), + HashToColor::reduce_hashes_colors, + ); + + RevIndex { + hash_to_color, + colors, + linear: self, + } + } } impl RevIndex { pub fn load>( - index_path: P, - queries: 
Option<&[KmerMinHash]>, - ) -> Result> { + _index_path: P, + _queries: Option<&[KmerMinHash]>, + ) -> Result { + unimplemented!() + /* let (rdr, _) = niffler::from_path(index_path)?; let revindex = if let Some(qs) = queries { // TODO: avoid loading full revindex if query != None @@ -152,6 +212,7 @@ impl RevIndex { }; Ok(revindex) + */ } pub fn new( @@ -159,80 +220,33 @@ impl RevIndex { template: &Sketch, threshold: usize, queries: Option<&[KmerMinHash]>, - keep_sigs: bool, - ) -> RevIndex { + _keep_sigs: bool, + ) -> Result { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sig_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sig_iter = search_sigs.iter(); - - let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } - - let search_sig = Signature::from_path(filename) - .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) - .swap_remove(0); + let collection = + Collection::from_paths(search_sigs)?.select(&Selection::from_template(template))?; + let linear = LinearIndex::from_collection(collection.try_into()?); - RevIndex::map_hashes_colors( - dataset_id, - &search_sig, - queries, - &merged_query, - threshold, - template, - ) - }); + Ok(linear.index(threshold, merged_query, queries)) + } - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); + pub fn from_zipfile>( + zipfile: P, + template: &Sketch, + threshold: usize, + queries: Option<&[KmerMinHash]>, + _keep_sigs: bool, + ) -> Result { + // If threshold is zero, let's merge all queries and save time later + let merged_query = 
queries.and_then(|qs| Self::merge_queries(qs, threshold)); - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); + let collection = + Collection::from_zipfile(zipfile)?.select(&Selection::from_template(template))?; + let linear = LinearIndex::from_collection(collection.try_into()?); - // TODO: build this together with hash_to_idx? - let ref_sigs = if keep_sigs { - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - Some( - sigs_iter - .map(|ref_path| { - Signature::from_path(ref_path) - .unwrap_or_else(|_| panic!("Error processing {:?}", ref_path)) - .swap_remove(0) - }) - .collect(), - ) - } else { - None - }; - - RevIndex { - hash_to_color, - sig_files: search_sigs.into(), - ref_sigs, - template: template.clone(), - colors, - // storage: Some(InnerStorage::new(MemStorage::default())), - } + Ok(linear.index(threshold, merged_query, queries)) } fn merge_queries(qs: &[KmerMinHash], threshold: usize) -> Option { @@ -252,53 +266,17 @@ impl RevIndex { template: &Sketch, threshold: usize, queries: Option<&[KmerMinHash]>, - ) -> RevIndex { + ) -> Result { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let processed_sigs = AtomicUsize::new(0); - - #[cfg(feature = "parallel")] - let sigs_iter = search_sigs.par_iter(); - #[cfg(not(feature = "parallel"))] - let sigs_iter = search_sigs.iter(); - - let filtered_sigs = sigs_iter.enumerate().filter_map(|(dataset_id, sig)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } + let collection = + Collection::from_sigs(search_sigs)?.select(&Selection::from_template(template))?; + let linear = LinearIndex::from_collection(collection.try_into()?); 
- RevIndex::map_hashes_colors( - dataset_id, - sig, - queries, - &merged_query, - threshold, - template, - ) - }); - - #[cfg(feature = "parallel")] - let (hash_to_color, colors) = filtered_sigs.reduce( - || (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); - - #[cfg(not(feature = "parallel"))] - let (hash_to_color, colors) = filtered_sigs.fold( - (HashToColor::new(), Colors::default()), - HashToColor::reduce_hashes_colors, - ); + let idx = linear.index(threshold, merged_query, queries); - RevIndex { - hash_to_color, - sig_files: vec![], - ref_sigs: search_sigs.into(), - template: template.clone(), - colors, - //storage: None, - } + Ok(idx) } fn map_hashes_colors( @@ -352,20 +330,8 @@ impl RevIndex { counter: SigCounter, similarity: bool, threshold: usize, - ) -> Result, Box> { - let mut matches = vec![]; - if similarity { - unimplemented!("TODO: threshold correction") - } - - for (dataset_id, size) in counter.most_common() { - if size >= threshold { - matches.push(self.sig_files[dataset_id as usize].to_str().unwrap().into()); - } else { - break; - }; - } - Ok(matches) + ) -> Result> { + self.linear.search(counter, similarity, threshold) } pub fn gather( @@ -373,109 +339,37 @@ impl RevIndex { mut counter: SigCounter, threshold: usize, query: &KmerMinHash, - ) -> Result, Box> { + ) -> Result> { let mut match_size = usize::max_value(); let mut matches = vec![]; while match_size > threshold && !counter.is_empty() { let (dataset_id, size) = counter.most_common()[0]; match_size = if size >= threshold { size } else { break }; - - let p; - let match_path = if self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? 
- &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; - - let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { - match_mh = Some(mh); - } - let match_mh = match_mh.expect("Couldn't find a compatible MinHash"); - - // Calculate stats - let f_orig_query = match_size as f64 / query.size() as f64; - let f_match = match_size as f64 / match_mh.size() as f64; - let filename = match_path.to_str().unwrap().into(); - let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; - let gather_result_rank = matches.len(); - - let (intersect_orig, _) = match_mh.intersection_size(query)?; - let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - - let f_unique_to_query = intersect_orig as f64 / query.size() as f64; - let match_ = match_sig.clone(); - - // TODO: all of these - let f_unique_weighted = 0.; - let average_abund = 0; - let median_abund = 0; - let std_abund = 0; - let md5 = "".into(); - let f_match_orig = 0.; - let remaining_bp = 0; - - let result = GatherResult { - intersect_bp, - f_orig_query, - f_match, - f_unique_to_query, - f_unique_weighted, - average_abund, - median_abund, - std_abund, - filename, - name, - md5, - match_, - f_match_orig, - unique_intersect_bp, - gather_result_rank, - remaining_bp, - }; - matches.push(result); - - // Prepare counter for finding the next match by decrementing - // all hashes found in the current match in other datasets - for hash in match_mh.iter_mins() { - if let Some(color) = self.hash_to_color.get(hash) { - for dataset in self.colors.indices(color) { - counter.entry(*dataset).and_modify(|e| { - if *e > 0 { - *e -= 1 - } - }); + let result = self + .linear + .gather_round(dataset_id, match_size, 
query, matches.len())?; + if let Some(Sketch::MinHash(match_mh)) = + result.match_.select_sketch(self.linear.template()) + { + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + for hash in match_mh.iter_mins() { + if let Some(color) = self.hash_to_color.get(hash) { + counter.subtract(self.colors.indices(color).cloned()); } } + counter.remove(&dataset_id); + matches.push(result); + } else { + unimplemented!() } - counter.remove(&dataset_id); } Ok(matches) } - pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { - query - .iter_mins() - .filter_map(|hash| self.hash_to_color.get(hash)) - .flat_map(|color| self.colors.indices(color)) - .cloned() - .collect() - } - pub fn template(&self) -> Sketch { - self.template.clone() + self.linear.template().clone() } // TODO: mh should be a sketch, or even a sig... @@ -485,7 +379,7 @@ impl RevIndex { threshold: f64, containment: bool, _ignore_scaled: bool, - ) -> Result, Error> { + ) -> Result> { /* let template_mh = None; if let Sketch::MinHash(mh) = self.template { @@ -526,25 +420,12 @@ impl RevIndex { for (dataset_id, size) in counter.most_common() { let match_size = if size >= threshold { size } else { break }; - let p; - let match_path = if self.sig_files.is_empty() { - p = PathBuf::new(); // TODO: Fix somehow? 
- &p - } else { - &self.sig_files[dataset_id as usize] - }; - - let ref_match; - let match_sig = if let Some(refsigs) = &self.ref_sigs { - &refsigs[dataset_id as usize] - } else { - // TODO: remove swap_remove - ref_match = Signature::from_path(match_path)?.swap_remove(0); - &ref_match - }; + let match_sig = self.linear.sig_for_dataset(dataset_id)?; + let match_path = + self.linear.collection().manifest[dataset_id as usize].internal_location(); let mut match_mh = None; - if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(&self.template) { + if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(self.linear.template()) { match_mh = Some(mh); } let match_mh = match_mh.unwrap(); @@ -555,8 +436,8 @@ impl RevIndex { } else { size as f64 / (mh.size() + match_size - size) as f64 }; - let filename = match_path.to_str().unwrap().into(); - let mut sig = match_sig.clone(); + let filename = match_path.to_string(); + let mut sig: Signature = match_sig.clone().into(); sig.reset_sketches(); sig.push(Sketch::MinHash(match_mh.clone())); results.push((score, sig, filename)); @@ -566,74 +447,42 @@ impl RevIndex { } Ok(results) } -} - -#[derive(CopyGetters, Getters, Setters, Serialize, Deserialize, Debug)] -pub struct GatherResult { - #[getset(get_copy = "pub")] - intersect_bp: usize, - - #[getset(get_copy = "pub")] - f_orig_query: f64, - - #[getset(get_copy = "pub")] - f_match: f64, - f_unique_to_query: f64, - f_unique_weighted: f64, - average_abund: usize, - median_abund: usize, - std_abund: usize, - - #[getset(get = "pub")] - filename: String, - - #[getset(get = "pub")] - name: String, - - md5: String, - match_: Signature, - f_match_orig: f64, - unique_intersect_bp: usize, - gather_result_rank: usize, - remaining_bp: usize, -} - -impl GatherResult { - pub fn get_match(&self) -> Signature { - self.match_.clone() + pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { + query + .iter_mins() + .filter_map(|hash| self.hash_to_color.get(hash)) + 
.flat_map(|color| self.colors.indices(color)) + .cloned() + .collect() } } impl<'a> Index<'a> for RevIndex { type Item = Signature; - fn insert(&mut self, _node: Self::Item) -> Result<(), Error> { + fn insert(&mut self, _node: Self::Item) -> Result<()> { unimplemented!() } - fn save>(&self, _path: P) -> Result<(), Error> { + fn save>(&self, _path: P) -> Result<()> { unimplemented!() } - fn load>(_path: P) -> Result<(), Error> { + fn load>(_path: P) -> Result<()> { unimplemented!() } fn len(&self) -> usize { - if let Some(refs) = &self.ref_sigs { - refs.len() - } else { - self.sig_files.len() - } + self.linear.len() } fn signatures(&self) -> Vec { - if let Some(ref sigs) = self.ref_sigs { - sigs.to_vec() - } else { - unimplemented!() - } + self.linear + .signatures() + .into_iter() + .map(|sig| sig.into()) + .collect() } fn signature_refs(&self) -> Vec<&Self::Item> { @@ -641,14 +490,52 @@ impl<'a> Index<'a> for RevIndex { } } +/* +impl RevIndexOps for RevIndex { + /* TODO: need the repair_cf variant, not available in rocksdb-rust yet + pub fn repair(index: &Path, colors: bool); + */ + + fn matches_from_counter(&self, counter: SigCounter, threshold: usize) -> Vec<(String, usize)>; + + fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor); + + fn index(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); + + fn update(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); + + fn compact(&self); + + fn flush(&self) -> Result<()>; + + fn convert(&self, output_db: RevIndex) -> Result<()>; + + fn check(&self, quick: bool); + + fn gather( + &self, + counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + query: &KmerMinHash, + template: &Sketch, + ) -> Result>; +} +*/ + #[cfg(test)] mod test { use super::*; use crate::sketch::minhash::max_hash_for_scaled; + use crate::Result; #[test] - fn revindex_new() { + fn revindex_new() -> 
Result<()> { let max_hash = max_hash_for_scaled(10000); let template = Sketch::MinHash( KmerMinHash::builder() @@ -661,12 +548,14 @@ mod test { "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false); + let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; assert_eq!(index.colors.len(), 3); + + Ok(()) } #[test] - fn revindex_many() { + fn revindex_many() -> Result<()> { let max_hash = max_hash_for_scaled(10000); let template = Sketch::MinHash( KmerMinHash::builder() @@ -681,7 +570,46 @@ mod test { "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false); + let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; + //dbg!(&index.linear.collection().manifest); + /* + dbg!(&index.colors.colors); + 0: 86 + 1: 132 + 2: 91 + (0, 1): 53 + (0, 2): 90 + (1, 2): 26 + (0, 1, 2): 261 + union: 739 + + */ + //assert_eq!(index.colors.len(), 3); + assert_eq!(index.colors.len(), 7); + + Ok(()) + } + + #[test] + fn revindex_from_sigs() -> Result<()> { + let max_hash = max_hash_for_scaled(10000); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(31) + .max_hash(max_hash) + .build(), + ); + let search_sigs: Vec = [ + "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig", + "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig", + "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig", + ] + .into_iter() + .map(|path| Signature::from_path(path).unwrap().swap_remove(0)) + .collect(); + + let index = RevIndex::new_with_sigs(search_sigs, &template, 0, None)?; /* dbg!(&index.colors.colors); 0: 86 @@ -695,5 +623,67 @@ mod test { */ //assert_eq!(index.colors.len(), 3); assert_eq!(index.colors.len(), 
7); + + Ok(()) + } + + #[test] + fn revindex_from_zipstorage() -> Result<()> { + let max_hash = max_hash_for_scaled(100); + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(19) + .hash_function(crate::encodings::HashFunctions::murmur64_protein) + .max_hash(max_hash) + .build(), + ); + let index = RevIndex::from_zipfile( + "../../tests/test-data/prot/protein.zip", + &template, + 0, + None, + false, + ) + .expect("error building from ziptorage"); + + assert_eq!(index.colors.len(), 3); + + let query_sig = Signature::from_path( + "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + ) + .expect("Error processing query") + .swap_remove(0); + + let template = Sketch::MinHash( + KmerMinHash::builder() + .num(0u32) + .ksize(57) + .hash_function(crate::encodings::HashFunctions::murmur64_protein) + .max_hash(max_hash) + .build(), + ); + let mut query_mh = None; + if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) { + query_mh = Some(mh); + } + let query_mh = query_mh.expect("Couldn't find a compatible MinHash"); + + let counter_rev = index.counter_for_query(query_mh); + let counter_lin = index.linear.counter_for_query(query_mh); + + let results_rev = index.search(counter_rev, false, 0).unwrap(); + let results_linear = index.linear.search(counter_lin, false, 0).unwrap(); + assert_eq!(results_rev, results_linear); + + let counter_rev = index.counter_for_query(query_mh); + let counter_lin = index.linear.counter_for_query(query_mh); + + let results_rev = index.gather(counter_rev, 0, query_mh).unwrap(); + let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap(); + assert_eq!(results_rev.len(), 1); + assert_eq!(results_rev, results_linear); + + Ok(()) } } diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs new file mode 100644 index 0000000000..460a5429af --- /dev/null +++ b/src/core/src/index/revindex/mod.rs @@ -0,0 +1,538 @@ +pub mod 
disk_revindex; +pub mod mem_revindex; + +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use byteorder::{LittleEndian, WriteBytesExt}; +use enum_dispatch::enum_dispatch; + +use roaring::RoaringBitmap; + +use crate::collection::CollectionSet; +use crate::encodings::{Color, Idx}; +use crate::index::{GatherResult, SigCounter}; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; +use crate::sketch::Sketch; +use crate::HashIntoType; +use crate::Result; + +//type DB = rocksdb::DBWithThreadMode; +type DB = rocksdb::DBWithThreadMode; + +type QueryColors = HashMap; +type HashToColor = HashMap; + +const HASHES: &str = "hashes"; +const COLORS: &str = "colors"; +const METADATA: &str = "metadata"; +const MANIFEST: &str = "manifest"; +const STORAGE_SPEC: &str = "storage_spec"; + +#[enum_dispatch(RevIndexOps)] +pub enum RevIndex { + //Color(color_revindex::ColorRevIndex), + Plain(disk_revindex::RevIndex), + //Mem(mem_revindex::RevIndex), +} + +#[enum_dispatch] +pub trait RevIndexOps { + /* TODO: need the repair_cf variant, not available in rocksdb-rust yet + pub fn repair(index: &Path, colors: bool); + */ + + fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter; + + fn matches_from_counter(&self, counter: SigCounter, threshold: usize) -> Vec<(String, usize)>; + + fn prepare_gather_counters( + &self, + query: &KmerMinHash, + ) -> (SigCounter, QueryColors, HashToColor); + + fn update(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); + + fn compact(&self); + + fn flush(&self) -> Result<()>; + + fn convert(&self, output_db: RevIndex) -> Result<()>; + + fn check(&self, quick: bool); + + fn gather( + &self, + counter: SigCounter, + query_colors: QueryColors, + hash_to_color: HashToColor, + threshold: usize, + query: &KmerMinHash, + template: &Sketch, + ) -> Result>; +} + +impl RevIndex { + /* TODO: need the repair_cf 
variant, not available in rocksdb-rust yet + pub fn repair(index: &Path, colors: bool) { + if colors { + color_revindex::repair(index); + } else { + disk_revindex::repair(index); + } + } + */ + + pub fn create>(index: P, collection: CollectionSet, colors: bool) -> Self { + if colors { + todo!() //color_revindex::ColorRevIndex::create(index) + } else { + disk_revindex::RevIndex::create(index.as_ref(), collection) + } + } + + pub fn open>(index: P, read_only: bool) -> Result { + let opts = Self::db_options(); + let cfs = DB::list_cf(&opts, index.as_ref()).unwrap(); + + if cfs.into_iter().any(|c| c == COLORS) { + // TODO: ColorRevIndex can't be read-only for now, + // due to pending unmerged colors + todo!() //color_revindex::ColorRevIndex::open(index, false) + } else { + disk_revindex::RevIndex::open(index, read_only) + } + } + + fn db_options() -> rocksdb::Options { + let mut opts = rocksdb::Options::default(); + opts.set_max_open_files(500); + + // Updated defaults from + // https://github.com/facebook/rocksdb/wiki/Setup-Options-and-Basic-Tuning#other-general-options + opts.set_bytes_per_sync(1048576); + let mut block_opts = rocksdb::BlockBasedOptions::default(); + block_opts.set_block_size(16 * 1024); + block_opts.set_cache_index_and_filter_blocks(true); + block_opts.set_pin_l0_filter_and_index_blocks_in_cache(true); + block_opts.set_format_version(5); + opts.set_block_based_table_factory(&block_opts); + // End of updated defaults + + opts.increase_parallelism(8); + // opts.optimize_level_style_compaction(); + // opts.optimize_universal_style_compaction(); + + opts + } +} + +fn check_compatible_downsample(me: &KmerMinHash, other: &KmerMinHash) -> Result<()> { + /* + if self.num != other.num { + return Err(Error::MismatchNum { + n1: self.num, + n2: other.num, + } + .into()); + } + */ + use crate::Error; + + if me.ksize() != other.ksize() { + return Err(Error::MismatchKSizes); + } + if me.hash_function() != other.hash_function() { + // TODO: fix this error + return 
Err(Error::MismatchDNAProt); + } + if me.max_hash() < other.max_hash() { + return Err(Error::MismatchScaled); + } + if me.seed() != other.seed() { + return Err(Error::MismatchSeed); + } + Ok(()) +} + +pub fn prepare_query(search_sig: &Signature, template: &Sketch) -> Option { + let mut search_mh = None; + if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { + search_mh = Some(mh.clone()); + } else { + // try to find one that can be downsampled + if let Sketch::MinHash(template_mh) = template { + for sketch in search_sig.sketches() { + if let Sketch::MinHash(ref_mh) = sketch { + if check_compatible_downsample(&ref_mh, template_mh).is_ok() { + let max_hash = max_hash_for_scaled(template_mh.scaled()); + let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); + search_mh = Some(mh); + } + } + } + } + } + search_mh +} + +#[derive(Debug, Default, PartialEq, Clone)] +pub enum Datasets { + #[default] + Empty, + Unique(Idx), + Many(RoaringBitmap), +} + +impl Hash for Datasets { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + match self { + Self::Empty => todo!(), + Self::Unique(v) => v.hash(state), + Self::Many(v) => { + for value in v.iter() { + value.hash(state); + } + } + } + } +} + +impl IntoIterator for Datasets { + type Item = Idx; + type IntoIter = Box>; + + fn into_iter(self) -> Self::IntoIter { + match self { + Self::Empty => Box::new(std::iter::empty()), + Self::Unique(v) => Box::new(std::iter::once(v)), + Self::Many(v) => Box::new(v.into_iter()), + } + } +} + +impl Extend for Datasets { + fn extend(&mut self, iter: T) + where + T: IntoIterator, + { + if let Self::Many(v) = self { + v.extend(iter); + return; + } + + let mut it = iter.into_iter(); + while let Some(value) = it.next() { + match self { + Self::Empty => *self = Datasets::Unique(value), + Self::Unique(v) => { + if *v != value { + *self = Self::Many([*v, value].iter().copied().collect()); + } + } + Self::Many(v) => { + v.extend(it); + return; + } + } + } + } +} + +impl 
Datasets { + fn new(vals: &[Idx]) -> Self { + if vals.is_empty() { + Self::Empty + } else if vals.len() == 1 { + Self::Unique(vals[0]) + } else { + Self::Many(RoaringBitmap::from_sorted_iter(vals.iter().copied()).unwrap()) + } + } + + fn from_slice(slice: &[u8]) -> Option { + use byteorder::ReadBytesExt; + + if slice.len() == 8 { + // Unique + Some(Self::Unique( + (&slice[..]).read_u32::().unwrap(), + )) + } else if slice.len() == 1 { + // Empty + Some(Self::Empty) + } else { + // Many + Some(Self::Many(RoaringBitmap::deserialize_from(slice).unwrap())) + } + } + + fn as_bytes(&self) -> Option> { + match self { + Self::Empty => Some(vec![42_u8]), + Self::Unique(v) => { + let mut buf = vec![0u8; 8]; + (&mut buf[..]) + .write_u32::(*v) + .expect("error writing bytes"); + Some(buf) + } + Self::Many(v) => { + let mut buf = vec![]; + v.serialize_into(&mut buf).unwrap(); + Some(buf) + } + } + } + + fn union(&mut self, other: Datasets) { + match self { + Datasets::Empty => match other { + Datasets::Empty => (), + Datasets::Unique(_) | Datasets::Many(_) => *self = other, + }, + Datasets::Unique(v) => match other { + Datasets::Empty => (), + Datasets::Unique(o) => { + if *v != o { + *self = Datasets::Many([*v, o].iter().copied().collect()) + } + } + Datasets::Many(mut o) => { + o.extend([*v]); + *self = Datasets::Many(o); + } + }, + Datasets::Many(ref mut v) => v.extend(other), + } + } + + fn len(&self) -> usize { + match self { + Self::Empty => 0, + Self::Unique(_) => 1, + Self::Many(ref v) => v.len() as usize, + } + } + + /* + fn contains(&self, value: &Idx) -> bool { + match self { + Self::Empty => false, + Self::Unique(v) => v == value, + Self::Many(ref v) => v.contains(*value), + } + } + */ +} + +fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { + use byteorder::ReadBytesExt; + use histogram::Histogram; + use log::info; + use numsep::{separate, Locale}; + + let cf = db.cf_handle(cf_name).unwrap(); + + let iter = db.iterator_cf(&cf, 
rocksdb::IteratorMode::Start); + let mut kcount = 0; + let mut vcount = 0; + let mut vcounts = Histogram::new(); + let mut datasets: Datasets = Default::default(); + + for result in iter { + let (key, value) = result.unwrap(); + let _k = (&key[..]).read_u64::().unwrap(); + kcount += key.len(); + + //println!("Saw {} {:?}", k, Datasets::from_slice(&value)); + vcount += value.len(); + + if !quick && deep_check { + let v = Datasets::from_slice(&value).expect("Error with value"); + vcounts.increment(v.len() as u64).unwrap(); + datasets.union(v); + } + //println!("Saw {} {:?}", k, value); + } + + info!("*** {} ***", cf_name); + use size::Size; + let ksize = Size::from_bytes(kcount); + let vsize = Size::from_bytes(vcount); + if !quick && cf_name == COLORS { + info!( + "total datasets: {}", + separate(datasets.len(), Locale::English) + ); + } + info!("total keys: {}", separate(kcount / 8, Locale::English)); + + info!("k: {}", ksize.to_string()); + info!("v: {}", vsize.to_string()); + + if !quick && kcount > 0 && deep_check { + info!("max v: {}", vcounts.maximum().unwrap()); + info!("mean v: {}", vcounts.mean().unwrap()); + info!("stddev: {}", vcounts.stddev().unwrap()); + info!("median v: {}", vcounts.percentile(50.0).unwrap()); + info!("p25 v: {}", vcounts.percentile(25.0).unwrap()); + info!("p75 v: {}", vcounts.percentile(75.0).unwrap()); + } +} + +fn build_template(ksize: u8, scaled: usize) -> Sketch { + let max_hash = max_hash_for_scaled(scaled as u64); + let template_mh = KmerMinHash::builder() + .num(0u32) + .ksize(ksize as u32) + .max_hash(max_hash) + .build(); + Sketch::MinHash(template_mh) +} + +#[cfg(test)] +mod test { + + use camino::Utf8PathBuf as PathBuf; + use tempfile::TempDir; + + use crate::prelude::*; + use crate::Result; + use crate::{collection::Collection, index::Selection}; + + use super::{build_template, prepare_query, RevIndex, RevIndexOps}; + + #[test] + fn revindex_index() -> Result<()> { + let mut basedir = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")); + basedir.push("../../tests/test-data/scaled/"); + + let siglist: Vec<_> = (10..=12) + .map(|i| { + let mut filename = basedir.clone(); + filename.push(format!("genome-s{}.fa.gz.sig", i)); + filename + }) + .collect(); + + let template = build_template(31, 10000); + let output = TempDir::new()?; + + let query_sig = Signature::from_path(&siglist[0])?; + let mut query = None; + for sig in &query_sig { + if let Some(q) = prepare_query(sig, &template) { + query = Some(q); + } + } + let query = query.unwrap(); + + let collection = + Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let index = RevIndex::create(output.path(), collection.try_into()?, false); + + let counter = index.counter_for_query(&query); + let matches = index.matches_from_counter(counter, 0); + + assert_eq!(matches, [("../genome-s10.fa.gz".into(), 48)]); + + Ok(()) + } + + /* + #[test] + fn revindex_update() -> Result<()> { + let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + basedir.push("../../tests/test-data/scaled/"); + + let siglist: Vec<_> = (10..=11) + .map(|i| { + let mut filename = basedir.clone(); + filename.push(format!("genome-s{}.fa.gz.sig", i)); + filename + }) + .collect(); + + let template = build_template(31, 10000); + let output = TempDir::new()?; + + let mut new_siglist = siglist.clone(); + { + let index = RevIndex::create(output.path(), false); + index.index(siglist, &template, 0., true); + } + + let mut filename = basedir.clone(); + filename.push("genome-s12.fa.gz.sig"); + new_siglist.push(filename); + + let query_sig = Signature::from_path(&new_siglist[2])?; + let mut query = None; + for sig in &query_sig { + if let Some(q) = prepare_query(sig, &template) { + query = Some(q); + } + } + let query = query.unwrap(); + + let index = RevIndex::open(output.path(), false); + index.update(new_siglist, &template, 0., true); + + let counter = index.counter_for_query(&query); + let matches = 
index.matches_from_counter(counter, 0); + + assert!(matches[0].0.ends_with("/genome-s12.fa.gz.sig")); + assert_eq!(matches[0].1, 45); + + Ok(()) + } + */ + + #[test] + fn revindex_load_and_gather() -> Result<()> { + let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + basedir.push("../../tests/test-data/scaled/"); + + let siglist: Vec<_> = (10..=12) + .map(|i| { + let mut filename = basedir.clone(); + filename.push(format!("genome-s{}.fa.gz.sig", i)); + filename + }) + .collect(); + + let template = build_template(31, 10000); + let output = TempDir::new()?; + + let query_sig = Signature::from_path(&siglist[0])?; + let mut query = None; + for sig in &query_sig { + if let Some(q) = prepare_query(sig, &template) { + query = Some(q); + } + } + let query = query.unwrap(); + + { + let collection = + Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let _index = RevIndex::create(output.path(), collection.try_into()?, false); + } + + let index = RevIndex::open(output.path(), true)?; + + let counter = index.counter_for_query(&query); + let matches = index.matches_from_counter(counter, 0); + + assert_eq!(matches, [("../genome-s10.fa.gz".into(), 48)]); + + Ok(()) + } +} diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index 66de82e6a0..dc88d34363 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -21,11 +21,16 @@ pub mod errors; pub use errors::SourmashError as Error; +pub type Result = std::result::Result; pub mod prelude; pub mod cmd; +pub mod collection; +pub mod index; +pub mod manifest; +pub mod picklist; pub mod signature; pub mod sketch; pub mod storage; @@ -44,7 +49,6 @@ cfg_if! 
{ pub mod wasm; } else { pub mod ffi; - pub mod index; } } diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs new file mode 100644 index 0000000000..ca2602b269 --- /dev/null +++ b/src/core/src/manifest.rs @@ -0,0 +1,256 @@ +use std::convert::TryInto; +use std::io::{Read, Write}; +use std::ops::Deref; + +use camino::Utf8PathBuf as PathBuf; +use getset::{CopyGetters, Getters, Setters}; +use serde::de; +use serde::{Deserialize, Serialize}; + +use crate::encodings::HashFunctions; +use crate::index::Selection; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::Sketch; +use crate::Result; + +#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters)] +pub struct Record { + #[getset(get = "pub", set = "pub")] + internal_location: PathBuf, + + #[getset(get = "pub", set = "pub")] + ksize: u32, + + #[getset(get = "pub", set = "pub")] + #[serde(deserialize_with = "to_bool")] + with_abundance: bool, + + #[getset(get = "pub", set = "pub")] + md5: String, + + #[getset(get = "pub", set = "pub")] + name: String, + + moltype: String, + /* + md5short: String, + num: String, + scaled: String, + n_hashes: String, + filename: String, + */ +} + +fn to_bool<'de, D>(deserializer: D) -> std::result::Result +where + D: de::Deserializer<'de>, +{ + match String::deserialize(deserializer)? 
+ .to_ascii_lowercase() + .as_ref() + { + "0" | "false" => Ok(false), + "1" | "true" => Ok(true), + other => Err(de::Error::invalid_value( + de::Unexpected::Str(other), + &"0/1 or true/false are the only supported values", + )), + } +} + +#[derive(Debug, Default, Serialize, Deserialize, Clone)] +pub struct Manifest { + records: Vec, +} + +impl Record { + pub fn from_sig(sig: &Signature, path: &str) -> Vec { + sig.iter() + .map(|sketch| { + let (ksize, md5, with_abundance, moltype) = match sketch { + Sketch::MinHash(mh) => ( + mh.ksize() as u32, + mh.md5sum(), + mh.track_abundance(), + mh.hash_function(), + ), + Sketch::LargeMinHash(mh) => ( + mh.ksize() as u32, + mh.md5sum(), + mh.track_abundance(), + mh.hash_function(), + ), + _ => unimplemented!(), + }; + + Self { + internal_location: path.into(), + moltype: moltype.to_string(), + name: sig.name(), + ksize, + md5, + with_abundance, + } + }) + .collect() + } + + pub fn moltype(&self) -> HashFunctions { + self.moltype.as_str().try_into().unwrap() + } + + pub fn check_compatible(&self, other: &Record) -> Result<()> { + /* + if self.num != other.num { + return Err(Error::MismatchNum { + n1: self.num, + n2: other.num, + } + .into()); + } + */ + use crate::Error; + + if self.ksize() != other.ksize() { + return Err(Error::MismatchKSizes); + } + if self.moltype() != other.moltype() { + // TODO: fix this error + return Err(Error::MismatchDNAProt); + } + /* + if self.scaled() < other.scaled() { + return Err(Error::MismatchScaled); + } + if self.seed() != other.seed() { + return Err(Error::MismatchSeed); + } + */ + Ok(()) + } +} + +impl Manifest { + pub fn from_reader(rdr: R) -> Result { + let mut records = vec![]; + + let mut rdr = csv::ReaderBuilder::new() + .comment(Some(b'#')) + .from_reader(rdr); + for result in rdr.deserialize() { + let record: Record = result?; + records.push(record); + } + Ok(Manifest { records }) + } + + pub fn to_writer(&self, mut wtr: W) -> Result<()> { + wtr.write_all(b"# 
SOURMASH-MANIFEST-VERSION: 1.0\n")?; + + let mut wtr = csv::Writer::from_writer(wtr); + + for record in &self.records { + wtr.serialize(record)?; + } + + Ok(()) + } + + pub fn internal_locations(&self) -> impl Iterator { + self.records.iter().map(|r| r.internal_location.as_str()) + } + + pub fn iter(&self) -> impl Iterator { + self.records.iter() + } + + pub fn select_to_manifest(&self, selection: &Selection) -> Result { + let rows = self.records.iter().filter(|row| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + row.ksize == ksize + } else { + valid + }; + valid = if let Some(abund) = selection.abund() { + valid && *row.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && row.moltype() == moltype + } else { + valid + }; + valid + }); + + Ok(Manifest { + records: rows.cloned().collect(), + }) + + /* + matching_rows = self.rows + if ksize: + matching_rows = ( row for row in matching_rows + if row['ksize'] == ksize ) + if moltype: + matching_rows = ( row for row in matching_rows + if row['moltype'] == moltype ) + if scaled or containment: + if containment and not scaled: + raise ValueError("'containment' requires 'scaled' in Index.select'") + + matching_rows = ( row for row in matching_rows + if row['scaled'] and not row['num'] ) + if num: + matching_rows = ( row for row in matching_rows + if row['num'] and not row['scaled'] ) + + if abund: + # only need to concern ourselves if abundance is _required_ + matching_rows = ( row for row in matching_rows + if row['with_abundance'] ) + + if picklist: + matching_rows = ( row for row in matching_rows + if picklist.matches_manifest_row(row) ) + + # return only the internal filenames! 
+ for row in matching_rows: + yield row + */ + } +} + +impl From> for Manifest { + fn from(records: Vec) -> Self { + Manifest { records } + } +} + +impl From<&[PathBuf]> for Manifest { + fn from(v: &[PathBuf]) -> Self { + Manifest { + records: v + .iter() + .map(|p| Record { + internal_location: p.clone(), + ksize: 0, // FIXME + with_abundance: false, // FIXME + md5: "".into(), // FIXME + name: "".into(), // FIXME + moltype: "".into(), // FIXME + }) + .collect(), + } + } +} + +impl Deref for Manifest { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.records + } +} diff --git a/src/core/src/picklist.rs b/src/core/src/picklist.rs new file mode 100644 index 0000000000..943d3f051a --- /dev/null +++ b/src/core/src/picklist.rs @@ -0,0 +1,29 @@ +use getset::{CopyGetters, Getters, Setters}; +use typed_builder::TypedBuilder; + +#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] +pub struct Picklist { + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + coltype: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + pickfile: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + column_name: String, + + #[getset(get = "pub", set = "pub")] + #[builder] + pickstyle: PickStyle, +} + +#[derive(Clone, Default, Debug)] +#[repr(u32)] +pub enum PickStyle { + #[default] + Include = 1, + Exclude = 2, +} diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index db2a85ea05..19ec308617 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -2,6 +2,8 @@ //! //! A signature is a collection of sketches for a genomic dataset. 
+use core::iter::FusedIterator; + use std::fs::File; use std::io; use std::iter::Iterator; @@ -15,11 +17,14 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; +use crate::index::Selection; use crate::prelude::*; use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; +// TODO: this is the behavior expected from Sketch, but that name is already +// used. Sketchable? pub trait SigsTrait { fn size(&self) -> usize; fn to_vec(&self) -> Vec; @@ -395,6 +400,10 @@ impl Iterator for SeqToHashes { } #[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct Signature { #[serde(default = "default_class")] #[builder(default = default_class())] @@ -525,6 +534,32 @@ impl Signature { None } + pub fn select(mut self, selection: &Selection) -> Result { + self.signatures.retain(|s| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + let k = s.ksize() as u32; + k == ksize || k == ksize * 3 + } else { + valid + }; + /* + valid = if let Some(abund) = selection.abund() { + valid && *s.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && s.moltype() == moltype + } else { + valid + }; + */ + valid + }); + Ok(self) + } + pub fn from_path>(path: P) -> Result, Error> { let mut reader = io::BufReader::new(File::open(path)?); Signature::from_reader(&mut reader) @@ -654,6 +689,92 @@ impl Signature { Ok(()) } + + pub fn iter_mut(&mut self) -> IterMut<'_> { + let length = self.signatures.len(); + IterMut { + iter: self.signatures.iter_mut(), + length, + } + } + + pub fn iter(&self) -> Iter<'_> { + let length = self.signatures.len(); + Iter { + iter: self.signatures.iter(), + length, + } + } +} + +pub struct IterMut<'a> { + iter: std::slice::IterMut<'a, Sketch>, + length: usize, 
+} + +impl<'a> IntoIterator for &'a mut Signature { + type Item = &'a mut Sketch; + type IntoIter = IterMut<'a>; + + fn into_iter(self) -> IterMut<'a> { + self.iter_mut() + } +} + +impl<'a> Iterator for IterMut<'a> { + type Item = &'a mut Sketch; + + fn next(&mut self) -> Option<&'a mut Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +pub struct Iter<'a> { + iter: std::slice::Iter<'a, Sketch>, + length: usize, +} + +impl<'a> Iterator for Iter<'a> { + type Item = &'a Sketch; + + fn next(&mut self) -> Option<&'a Sketch> { + if self.length == 0 { + None + } else { + self.length -= 1; + self.iter.next() + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +impl FusedIterator for Iter<'_> {} + +impl ExactSizeIterator for Iter<'_> { + fn len(&self) -> usize { + self.length + } +} + +impl Clone for Iter<'_> { + fn clone(&self) -> Self { + Iter { + iter: self.iter.clone(), + length: self.length, + } + } } impl ToWriter for Signature { diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index 409d2a2c44..df22dad9d1 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -26,6 +26,10 @@ pub mod estimators; use estimators::CounterType; #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct HyperLogLog { registers: Vec, p: usize, @@ -318,15 +322,15 @@ mod test { assert!(abs_error < ERR_RATE, "{}", abs_error); let similarity = hll1.similarity(&hll2); - let abs_error = (1. - (similarity / SIMILARITY as f64)).abs(); + let abs_error = (1. 
- (similarity / SIMILARITY)).abs(); assert!(abs_error < ERR_RATE, "{} {}", similarity, SIMILARITY); let containment = hll1.containment(&hll2); - let abs_error = (1. - (containment / CONTAINMENT_H1 as f64)).abs(); + let abs_error = (1. - (containment / CONTAINMENT_H1)).abs(); assert!(abs_error < ERR_RATE, "{} {}", containment, CONTAINMENT_H1); let containment = hll2.containment(&hll1); - let abs_error = (1. - (containment / CONTAINMENT_H2 as f64)).abs(); + let abs_error = (1. - (containment / CONTAINMENT_H2)).abs(); assert!(abs_error < ERR_RATE, "{} {}", containment, CONTAINMENT_H2); let intersection = hll1.intersection(&hll2) as f64; @@ -335,13 +339,13 @@ mod test { hll1.merge(&hll2).unwrap(); - let abs_error = (1. - (hllu.similarity(&hll1) as f64 / 1.)).abs(); + let abs_error = (1. - (hllu.similarity(&hll1) / 1.)).abs(); assert!(abs_error < ERR_RATE, "{}", abs_error); - let abs_error = (1. - (hllu.containment(&hll1) as f64 / 1.)).abs(); + let abs_error = (1. - (hllu.containment(&hll1) / 1.)).abs(); assert!(abs_error < ERR_RATE, "{}", abs_error); - let abs_error = (1. - (hll1.containment(&hllu) as f64 / 1.)).abs(); + let abs_error = (1. 
- (hll1.containment(&hllu) / 1.)).abs(); assert!(abs_error < ERR_RATE, "{}", abs_error); let intersection = hll1.intersection(&hllu) as f64; diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 5c5f1114f8..454a50f7a7 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -33,6 +33,10 @@ pub fn scaled_for_max_hash(max_hash: u64) -> u64 { } #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHash { num: u32, ksize: u32, @@ -53,6 +57,8 @@ pub struct KmerMinHash { abunds: Option>, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } @@ -927,6 +933,10 @@ mod test { // A MinHash implementation for low scaled or large cardinalities #[derive(Debug, TypedBuilder)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub struct KmerMinHashBTree { num: u32, ksize: u32, @@ -950,6 +960,8 @@ pub struct KmerMinHashBTree { current_max: u64, #[builder(default)] + //#[cfg_attr(feature = "rkyv", with(rkyv::with::Lock))] + #[cfg_attr(feature = "rkyv", with(rkyv::with::Skip))] md5sum: Mutex>, } diff --git a/src/core/src/sketch/mod.rs b/src/core/src/sketch/mod.rs index 09bd51085c..3ef04e43df 100644 --- a/src/core/src/sketch/mod.rs +++ b/src/core/src/sketch/mod.rs @@ -10,6 +10,10 @@ use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] +#[cfg_attr( + feature = "rkyv", + derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) +)] pub enum Sketch { MinHash(KmerMinHash), LargeMinHash(KmerMinHashBTree), diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index cbca8915ba..bbfef5cd0d 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -7,7 +7,7 @@ use 
byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt} use fixedbitset::FixedBitSet; use crate::prelude::*; -use crate::sketch::minhash::KmerMinHash; +use crate::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; use crate::Error; use crate::HashIntoType; @@ -58,6 +58,15 @@ impl Update for KmerMinHash { } } +impl Update for KmerMinHashBTree { + fn update(&self, other: &mut Nodegraph) -> Result<(), Error> { + for h in self.mins() { + other.count(h); + } + Ok(()) + } +} + impl Nodegraph { pub fn new(tablesizes: &[usize], ksize: usize) -> Nodegraph { let mut bs = Vec::with_capacity(tablesizes.len()); diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index f4f942d330..3e2a2cab6e 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -1,47 +1,48 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::ffi::OsStr; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; -use std::path::{Path, PathBuf}; -use std::rc::Rc; -use std::sync::RwLock; +use std::ops::Deref; +use std::sync::{Arc, RwLock}; +use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; +use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use thiserror::Error; use typed_builder::TypedBuilder; -use crate::Error; +use crate::errors::ReadDataError; +use crate::index::Selection; +use crate::prelude::*; +use crate::signature::SigsTrait; +use crate::sketch::Sketch; +use crate::{Error, Result}; /// An abstraction for any place where we can store data. 
pub trait Storage { /// Save bytes into path - fn save(&self, path: &str, content: &[u8]) -> Result; + fn save(&self, path: &str, content: &[u8]) -> Result; /// Load bytes from path - fn load(&self, path: &str) -> Result, Error>; + fn load(&self, path: &str) -> Result>; /// Args for initializing a new Storage fn args(&self) -> StorageArgs; -} -#[derive(Clone)] -pub struct InnerStorage(Rc>); + /// Load signature from internal path + fn load_sig(&self, path: &str) -> Result; -impl InnerStorage { - pub fn new(inner: impl Storage + 'static) -> InnerStorage { - InnerStorage(Rc::new(RwLock::new(inner))) - } -} + /// Return a spec for creating/opening a storage + fn spec(&self) -> String; -impl Storage for InnerStorage { - fn save(&self, path: &str, content: &[u8]) -> Result { - self.0.save(path, content) - } - fn load(&self, path: &str) -> Result, Error> { - self.0.load(path) - } - fn args(&self) -> StorageArgs { - self.0.args() + /// Save signature to internal path + fn save_sig(&self, path: &str, sig: Signature) -> Result { + let mut buffer = vec![]; + { + sig.to_writer(&mut buffer).unwrap(); + } + self.save(path, &buffer) } } @@ -57,6 +58,26 @@ pub enum StorageError { DataReadError(String), } +#[derive(Clone)] +pub struct InnerStorage(Arc>); + +#[derive(TypedBuilder, Default, Clone)] +pub struct SigStore { + #[builder(setter(into))] + filename: String, + + #[builder(setter(into))] + name: String, + + #[builder(setter(into))] + metadata: String, + + storage: Option, + + #[builder(setter(into), default)] + data: OnceCell, +} + #[derive(Serialize, Deserialize)] pub(crate) struct StorageInfo { pub backend: String, @@ -69,6 +90,86 @@ pub enum StorageArgs { FSStorage { path: String }, } +/// Store files locally into a directory +#[derive(TypedBuilder, Debug, Clone, Default)] +pub struct FSStorage { + /// absolute path for the directory where data is saved. 
+ fullpath: PathBuf, + subdir: String, +} + +#[ouroboros::self_referencing] +pub struct ZipStorage { + mapping: Option, + + #[borrows(mapping)] + #[covariant] + archive: piz::ZipArchive<'this>, + + subdir: Option, + path: Option, + + #[borrows(archive)] + #[covariant] + metadata: Metadata<'this>, +} + +/// Store data in memory (no permanent storage) +#[derive(TypedBuilder, Debug, Clone, Default)] +pub struct MemStorage { + //store: HashMap>, + sigs: Arc>>, +} + +pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; + +// ========================================= + +impl InnerStorage { + pub fn new(inner: impl Storage + Send + Sync + 'static) -> InnerStorage { + InnerStorage(Arc::new(RwLock::new(inner))) + } + + pub fn from_spec(spec: String) -> Result { + Ok(match spec { + x if x.starts_with("fs") => { + let path = x.split("://").last().expect("not a valid path"); + InnerStorage::new(FSStorage::new("", path)) + } + x if x.starts_with("memory") => InnerStorage::new(MemStorage::new()), + x if x.starts_with("zip") => { + let path = x.split("://").last().expect("not a valid path"); + InnerStorage::new(ZipStorage::from_file(path)?) 
+ } + _ => todo!("storage not supported, throw error"), + }) + } +} + +impl Storage for InnerStorage { + fn save(&self, path: &str, content: &[u8]) -> Result { + self.0.save(path, content) + } + + fn load(&self, path: &str) -> Result> { + self.0.load(path) + } + + fn args(&self) -> StorageArgs { + self.0.args() + } + + fn load_sig(&self, path: &str) -> Result { + let mut store = self.0.load_sig(path)?; + store.storage = Some(self.clone()); + Ok(store) + } + + fn spec(&self) -> String { + self.0.spec() + } +} + impl From<&StorageArgs> for FSStorage { fn from(other: &StorageArgs) -> FSStorage { match other { @@ -90,25 +191,25 @@ impl Storage for RwLock where L: ?Sized + Storage, { - fn save(&self, path: &str, content: &[u8]) -> Result { + fn save(&self, path: &str, content: &[u8]) -> Result { self.read().unwrap().save(path, content) } - fn load(&self, path: &str) -> Result, Error> { + fn load(&self, path: &str) -> Result> { self.read().unwrap().load(path) } fn args(&self) -> StorageArgs { self.read().unwrap().args() } -} -/// Store files locally into a directory -#[derive(TypedBuilder, Debug, Clone, Default)] -pub struct FSStorage { - /// absolute path for the directory where data is saved. 
- fullpath: PathBuf, - subdir: String, + fn load_sig(&self, path: &str) -> Result { + self.read().unwrap().load_sig(path) + } + + fn spec(&self) -> String { + self.read().unwrap().spec() + } } impl FSStorage { @@ -132,7 +233,7 @@ impl FSStorage { } impl Storage for FSStorage { - fn save(&self, path: &str, content: &[u8]) -> Result { + fn save(&self, path: &str, content: &[u8]) -> Result { if path.is_empty() { return Err(StorageError::EmptyPathError.into()); } @@ -148,7 +249,7 @@ impl Storage for FSStorage { Ok(path.into()) } - fn load(&self, path: &str) -> Result, Error> { + fn load(&self, path: &str) -> Result> { let path = self.fullpath.join(path); let file = File::open(path)?; let mut buf_reader = BufReader::new(file); @@ -162,38 +263,33 @@ impl Storage for FSStorage { path: self.subdir.clone(), } } -} -#[ouroboros::self_referencing] -pub struct ZipStorage { - mapping: Option, - - #[borrows(mapping)] - #[covariant] - archive: piz::ZipArchive<'this>, + fn load_sig(&self, path: &str) -> Result { + let raw = self.load(path)?; + let sig = Signature::from_reader(&mut &raw[..])? + // TODO: select the right sig? 
+ .swap_remove(0); - subdir: Option, - path: Option, + Ok(sig.into()) + } - #[borrows(archive)] - #[covariant] - metadata: Metadata<'this>, + fn spec(&self) -> String { + format!("fs://{}", self.subdir) + } } -pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; - fn lookup<'a, P: AsRef>( metadata: &'a Metadata, path: P, -) -> Result<&'a piz::read::FileMetadata<'a>, Error> { +) -> Result<&'a piz::read::FileMetadata<'a>> { let path = path.as_ref(); metadata .get(&path.as_os_str()) - .ok_or_else(|| StorageError::PathNotFoundError(path.to_str().unwrap().into()).into()) + .ok_or_else(|| StorageError::PathNotFoundError(path.to_string()).into()) .map(|entry| *entry) } -fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result, Error> { +fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result> { let subdirs: Vec<_> = archive .entries() .iter() @@ -207,11 +303,11 @@ fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result, } impl Storage for ZipStorage { - fn save(&self, _path: &str, _content: &[u8]) -> Result { + fn save(&self, _path: &str, _content: &[u8]) -> Result { unimplemented!(); } - fn load(&self, path: &str) -> Result, Error> { + fn load(&self, path: &str) -> Result> { let metadata = self.borrow_metadata(); let entry = lookup(metadata, path).or_else(|_| { @@ -237,11 +333,24 @@ impl Storage for ZipStorage { fn args(&self) -> StorageArgs { unimplemented!(); } + + fn load_sig(&self, path: &str) -> Result { + let raw = self.load(path)?; + let sig = Signature::from_reader(&mut &raw[..])? + // TODO: select the right sig? + .swap_remove(0); + + Ok(sig.into()) + } + + fn spec(&self) -> String { + format!("zip://{}", self.path().unwrap_or("".into())) + } } impl ZipStorage { - pub fn from_file(location: &str) -> Result { - let zip_file = File::open(location)?; + pub fn from_file>(location: P) -> Result { + let zip_file = File::open(location.as_ref())?; let mapping = unsafe { memmap2::Mmap::map(&zip_file)? 
}; let mut storage = ZipStorageBuilder { @@ -257,7 +366,7 @@ impl ZipStorage { .collect() }, subdir: None, - path: Some(location.to_owned()), + path: Some(location.as_ref().into()), } .build(); @@ -267,7 +376,7 @@ impl ZipStorage { Ok(storage) } - pub fn path(&self) -> Option { + pub fn path(&self) -> Option { self.borrow_path().clone() } @@ -279,7 +388,7 @@ impl ZipStorage { self.with_mut(|fields| *fields.subdir = Some(path)) } - pub fn list_sbts(&self) -> Result, Error> { + pub fn list_sbts(&self) -> Result> { Ok(self .borrow_archive() .entries() @@ -295,7 +404,7 @@ impl ZipStorage { .collect()) } - pub fn filenames(&self) -> Result, Error> { + pub fn filenames(&self) -> Result> { Ok(self .borrow_archive() .entries() @@ -304,3 +413,219 @@ impl ZipStorage { .collect()) } } + +impl SigStore { + pub fn new_with_storage(sig: Signature, storage: InnerStorage) -> Self { + let name = sig.name(); + let filename = sig.filename(); + + SigStore::builder() + .name(name) + .filename(filename) + .data(sig) + .metadata("") + .storage(Some(storage)) + .build() + } + + pub fn name(&self) -> String { + self.name.clone() + } + + pub fn select(mut self, selection: &Selection) -> Result { + // TODO: find better error + let sig = self.data.take().ok_or(Error::MismatchKSizes)?; + self.data = OnceCell::with_value(sig.select(selection)?); + Ok(self) + } +} + +impl std::fmt::Debug for SigStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "SigStore [filename: {}, name: {}, metadata: {}]", + self.filename, self.name, self.metadata + ) + } +} + +impl ReadData for SigStore { + fn data(&self) -> Result<&Signature> { + if let Some(sig) = self.data.get() { + Ok(sig) + } else if let Some(storage) = &self.storage { + let sig = self.data.get_or_init(|| { + let raw = storage.load(&self.filename).unwrap(); + Signature::from_reader(&mut &raw[..]) + .unwrap() + // TODO: select the right sig? 
+ .swap_remove(0) + }); + + Ok(sig) + } else { + Err(ReadDataError::LoadError.into()) + } + } +} + +impl SigStore { + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) + } else { + unimplemented!() + } + } else { + unimplemented!() + } + } +} + +impl From for Signature { + fn from(other: SigStore) -> Signature { + other.data.get().unwrap().to_owned() + } +} + +impl Deref for SigStore { + type Target = Signature; + + fn deref(&self) -> &Signature { + self.data.get().unwrap() + } +} + +impl From for SigStore { + fn from(other: Signature) -> SigStore { + let name = other.name(); + let filename = other.filename(); + + SigStore::builder() + .name(name) + .filename(filename) + .data(other) + .metadata("") + .storage(None) + .build() + } +} + +impl Comparable for SigStore { + fn similarity(&self, other: &SigStore) -> f64 { + let ng: &Signature = self.data().unwrap(); + let ong: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &ng.signatures[0] { + if let Sketch::MinHash(omh) = &ong.signatures[0] { + return mh.similarity(omh, true, false).unwrap(); + } + } + + unimplemented!() + } + + fn containment(&self, other: &SigStore) -> f64 { + let ng: &Signature = self.data().unwrap(); + let ong: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? 
+ if let Sketch::MinHash(mh) = &ng.signatures[0] { + if let Sketch::MinHash(omh) = &ong.signatures[0] { + let common = mh.count_common(omh, false).unwrap(); + let size = mh.size(); + return common as f64 / size as f64; + } + } + unimplemented!() + } +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct DatasetInfo { + pub filename: String, + pub name: String, + pub metadata: String, +} +impl From for SigStore { + fn from(other: DatasetInfo) -> SigStore { + SigStore { + filename: other.filename, + name: other.name, + metadata: other.metadata, + storage: None, + data: OnceCell::new(), + } + } +} + +impl Comparable for Signature { + fn similarity(&self, other: &Signature) -> f64 { + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &self.signatures[0] { + if let Sketch::MinHash(omh) = &other.signatures[0] { + return mh.similarity(omh, true, false).unwrap(); + } + } + unimplemented!() + } + + fn containment(&self, other: &Signature) -> f64 { + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? 
+ if let Sketch::MinHash(mh) = &self.signatures[0] { + if let Sketch::MinHash(omh) = &other.signatures[0] { + let common = mh.count_common(omh, false).unwrap(); + let size = mh.size(); + return common as f64 / size as f64; + } + } + unimplemented!() + } +} + +impl MemStorage { + pub fn new() -> Self { + Self { + sigs: Arc::new(RwLock::new(HashMap::default())), + } + } +} + +impl Storage for MemStorage { + fn save(&self, _path: &str, _content: &[u8]) -> Result { + unimplemented!() + } + + fn load(&self, _path: &str) -> Result> { + unimplemented!() + } + + fn args(&self) -> StorageArgs { + unimplemented!() + } + + fn load_sig(&self, path: &str) -> Result { + Ok(self.sigs.read().unwrap().get(path).unwrap().clone()) + } + + fn save_sig(&self, path: &str, sig: Signature) -> Result { + // side-step saving to store + let sig_store: SigStore = sig.into(); + self.sigs.write().unwrap().insert(path.into(), sig_store); + Ok(path.into()) + } + + fn spec(&self) -> String { + "memory://".into() + } +} diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 5a60e02fcc..d41053c160 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -42,3 +42,38 @@ fn zipstorage_list_sbts() -> Result<(), Box> { Ok(()) } + +#[cfg(feature = "parallel")] +#[test] +fn zipstorage_parallel_access() -> Result<(), Box> { + use rayon::prelude::*; + use sourmash::signature::{Signature, SigsTrait}; + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/v6.sbt.zip"); + + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; + + let total_hashes: usize = [ + ".sbt.v3/f71e78178af9e45e6f1d87a0c53c465c", + ".sbt.v3/f0c834bc306651d2b9321fb21d3e8d8f", + ".sbt.v3/4e94e60265e04f0763142e20b52c0da1", + ".sbt.v3/6d6e87e1154e95b279e5e7db414bc37b", + ".sbt.v3/0107d767a345eff67ecdaed2ee5cd7ba", + ".sbt.v3/b59473c94ff2889eca5d7165936e64b3", + ".sbt.v3/60f7e23c24a8d94791cc7a8680c493f9", + ] + .par_iter() + .map(|path| { + let data = 
zs.load(path).unwrap(); + let sigs: Vec = serde_json::from_reader(&data[..]).expect("Loading error"); + sigs.iter() + .map(|v| v.sketches().iter().map(|mh| mh.size()).sum::()) + .sum::() + }) + .sum(); + + assert_eq!(total_hashes, 3500); + + Ok(()) +} diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index a22e782d69..42a4fceaa6 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -130,7 +130,7 @@ def subdir(self, value): self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value)) def _filenames(self): - if self.__inner: + if not self._objptr: return self.__inner._filenames() size = ffi.new("uintptr_t *") @@ -150,7 +150,7 @@ def save(self, path, content, *, overwrite=False, compress=False): raise NotImplementedError() def load(self, path): - if self.__inner: + if not self._objptr: return self.__inner.load(path) try: diff --git a/tox.ini b/tox.ini index 41734a6a3b..f73758ab7f 100644 --- a/tox.ini +++ b/tox.ini @@ -50,6 +50,11 @@ commands = pytest \ --junitxml {toxworkdir}/junit.{envname}.xml \ {posargs:doc tests} +[testenv:.pkg] +pass_env = + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS + [testenv:pypy3] deps = pip >= 19.3.1 @@ -104,7 +109,7 @@ commands = description = invoke sphinx-build to build the HTML docs basepython = python3.10 extras = doc -whitelist_externals = pandoc +allowlist_externals = pandoc pass_env = HOME change_dir = {toxinidir} #commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -W -bhtml {posargs} From f09476163d2443afa6b3530dae17d8dff5e32541 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 16 Sep 2023 12:57:53 -0700 Subject: [PATCH 02/19] reorg select --- include/sourmash.h | 1 + src/core/src/collection.rs | 14 +- src/core/src/index/linear.rs | 57 ++---- src/core/src/index/mod.rs | 106 +--------- src/core/src/index/revindex/disk_revindex.rs | 6 - src/core/src/index/revindex/mem_revindex.rs | 191 +------------------ 
src/core/src/index/revindex/mod.rs | 114 +++++++++-- src/core/src/lib.rs | 2 +- src/core/src/manifest.rs | 6 +- src/core/src/picklist.rs | 29 --- src/core/src/prelude.rs | 11 +- src/core/src/selection.rs | 141 ++++++++++++++ src/core/src/signature.rs | 56 +++--- src/core/src/storage.rs | 5 +- 14 files changed, 315 insertions(+), 424 deletions(-) delete mode 100644 src/core/src/picklist.rs create mode 100644 src/core/src/selection.rs diff --git a/include/sourmash.h b/include/sourmash.h index 011aee2925..d647378da7 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -43,6 +43,7 @@ enum SourmashErrorCode { SOURMASH_ERROR_CODE_SERDE_ERROR = 100004, SOURMASH_ERROR_CODE_NIFFLER_ERROR = 100005, SOURMASH_ERROR_CODE_CSV_ERROR = 100006, + SOURMASH_ERROR_CODE_ROCKS_DB_ERROR = 100007, }; typedef uint32_t SourmashErrorCode; diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 164db5efe7..8f3b049313 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -3,8 +3,8 @@ use std::ops::{Deref, DerefMut}; use camino::Utf8Path as Path; use crate::encodings::Idx; -use crate::index::Selection; use crate::manifest::{Manifest, Record}; +use crate::prelude::*; use crate::signature::Signature; use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; use crate::Result; @@ -53,6 +53,12 @@ impl TryFrom for CollectionSet { } } +impl CollectionSet { + pub fn into_inner(self) -> Collection { + self.collection + } +} + impl Collection { pub fn from_zipfile>(zipfile: P) -> Result { let storage = ZipStorage::from_file(zipfile)?; @@ -127,9 +133,11 @@ impl Collection { assert_eq!(sig.signatures.len(), 1); Ok(sig) } +} - pub fn select(mut self, selection: &Selection) -> Result { - self.manifest = self.manifest.select_to_manifest(selection)?; +impl Select for Collection { + fn select(mut self, selection: &Selection) -> Result { + self.manifest = self.manifest.select(selection)?; Ok(self) } } diff --git 
a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index ed12bbd745..1b4cd2f8ec 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -11,13 +11,13 @@ use crate::collection::CollectionSet; use crate::encodings::Idx; use crate::index::{GatherResult, Index, Selection, SigCounter}; use crate::manifest::Manifest; +use crate::selection::Select; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::storage::{InnerStorage, SigStore, Storage}; use crate::Result; -//#[derive(Serialize, Deserialize)] pub struct LinearIndex { collection: CollectionSet, template: Sketch, @@ -58,46 +58,6 @@ impl LinearIndex { Some(self.collection.storage.clone()) } - pub fn select(mut self, selection: &Selection) -> Result { - let manifest = self.collection.manifest.select_to_manifest(selection)?; - self.collection.manifest = manifest; - - Ok(self) - /* - # if we have a manifest, run 'select' on the manifest. - manifest = self.manifest - traverse_yield_all = self.traverse_yield_all - - if manifest is not None: - manifest = manifest.select_to_manifest(**kwargs) - return ZipFileLinearIndex(self.storage, - selection_dict=None, - traverse_yield_all=traverse_yield_all, - manifest=manifest, - use_manifest=True) - else: - # no manifest? just pass along all the selection kwargs to - # the new ZipFileLinearIndex. - - assert manifest is None - if self.selection_dict: - # combine selects... 
- d = dict(self.selection_dict) - for k, v in kwargs.items(): - if k in d: - if d[k] is not None and d[k] != v: - raise ValueError(f"incompatible select on '{k}'") - d[k] = v - kwargs = d - - return ZipFileLinearIndex(self.storage, - selection_dict=kwargs, - traverse_yield_all=traverse_yield_all, - manifest=None, - use_manifest=False) - */ - } - pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { let processed_sigs = AtomicUsize::new(0); @@ -348,6 +308,21 @@ impl LinearIndex { } } +impl Select for LinearIndex { + fn select(self, selection: &Selection) -> Result { + let Self { + collection, + template, + } = self; + let collection = collection.into_inner().select(selection)?.try_into()?; + + Ok(Self { + collection, + template, + }) + } +} + impl<'a> Index<'a> for LinearIndex { type Item = SigStore; diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index ad65bf9d08..ec55249b04 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -18,13 +18,9 @@ use getset::{CopyGetters, Getters, Setters}; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::encodings::{HashFunctions, Idx}; +use crate::encodings::Idx; use crate::index::search::{search_minhashes, search_minhashes_containment}; -use crate::manifest::Record; -use crate::picklist::Picklist; use crate::prelude::*; -use crate::signature::SigsTrait; -use crate::sketch::Sketch; use crate::Result; #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] @@ -68,108 +64,8 @@ impl GatherResult { } } -#[derive(Default, Debug)] -pub struct Selection { - ksize: Option, - abund: Option, - num: Option, - scaled: Option, - containment: Option, - moltype: Option, - picklist: Option, -} - type SigCounter = counter::Counter; -impl Selection { - pub fn ksize(&self) -> Option { - self.ksize - } - - pub fn set_ksize(&mut self, ksize: u32) { - self.ksize = Some(ksize); - } - - pub fn abund(&self) -> Option { - self.abund 
- } - - pub fn set_abund(&mut self, value: bool) { - self.abund = Some(value); - } - - pub fn num(&self) -> Option { - self.num - } - - pub fn set_num(&mut self, num: u32) { - self.num = Some(num); - } - - pub fn scaled(&self) -> Option { - self.scaled - } - - pub fn set_scaled(&mut self, scaled: u32) { - self.scaled = Some(scaled); - } - - pub fn containment(&self) -> Option { - self.containment - } - - pub fn set_containment(&mut self, containment: bool) { - self.containment = Some(containment); - } - - pub fn moltype(&self) -> Option { - self.moltype - } - - pub fn set_moltype(&mut self, value: HashFunctions) { - self.moltype = Some(value); - } - - pub fn picklist(&self) -> Option { - self.picklist.clone() - } - - pub fn set_picklist(&mut self, value: Picklist) { - self.picklist = Some(value); - } - - pub fn from_template(template: &Sketch) -> Self { - let (num, scaled) = match template { - Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - _ => (None, None), - }; - - Selection { - ksize: Some(template.ksize() as u32), - abund: None, - containment: None, - //moltype: Some(template.hash_function()), - moltype: None, - num, - picklist: None, - scaled, - } - } - - pub fn from_record(row: &Record) -> Result { - Ok(Self { - ksize: Some(*row.ksize()), - abund: Some(*row.with_abundance()), - moltype: Some(row.moltype()), - num: None, - scaled: None, - containment: None, - picklist: None, - }) - } -} - pub trait Index<'a> { type Item: Comparable; //type SignatureIterator: Iterator; diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 60e33cf1d4..4c4064acc4 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -26,11 +26,6 @@ use crate::Result; fn compute_color(idxs: &Datasets) -> Color { let s = BuildHasherDefault::::default(); let mut hasher = s.build_hasher(); - 
/* - // TODO: remove this... - let mut sorted: Vec<_> = idxs.iter().collect(); - sorted.sort(); - */ idxs.hash(&mut hasher); hasher.finish() } @@ -198,7 +193,6 @@ impl RevIndex { .merge_cf(&cf_hashes, &hash_bytes[..], colors.as_slice()) .expect("error merging"); } - // TODO: save collection to DB? } } diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index 113452ed1b..b951d9513d 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; use camino::Utf8Path as Path; @@ -13,7 +13,9 @@ use rayon::prelude::*; use crate::collection::Collection; use crate::encodings::{Color, Colors, Idx}; use crate::index::linear::LinearIndex; -use crate::index::{GatherResult, Index, Selection, SigCounter}; +use crate::index::revindex::HashToColor; +use crate::index::{GatherResult, Index, SigCounter}; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -21,90 +23,12 @@ use crate::storage::Storage; use crate::HashIntoType; use crate::Result; -// Use rkyv for serialization? 
-// https://davidkoloski.me/rkyv/ -//#[derive(Serialize, Deserialize)] pub struct RevIndex { linear: LinearIndex, hash_to_color: HashToColor, colors: Colors, } -#[derive(Serialize, Deserialize)] -struct HashToColor(HashMap>); - -impl HashToColor { - fn new() -> Self { - HashToColor(HashMap::< - HashIntoType, - Color, - BuildNoHashHasher, - >::with_hasher(BuildNoHashHasher::default())) - } - - fn get(&self, hash: &HashIntoType) -> Option<&Color> { - self.0.get(hash) - } - - fn retain(&mut self, hashes: &HashSet) { - self.0.retain(|hash, _| hashes.contains(hash)) - } - - fn len(&self) -> usize { - self.0.len() - } - - fn is_empty(&self) -> bool { - self.0.is_empty() - } - - fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { - let mut color = None; - - matched_hashes.into_iter().for_each(|hash| { - color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); - self.0.insert(hash, color.unwrap()); - }); - } - - fn reduce_hashes_colors( - a: (HashToColor, Colors), - b: (HashToColor, Colors), - ) -> (HashToColor, Colors) { - let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = - if a.0.len() > b.0.len() { - (b, a) - } else { - (a, b) - }; - - small_hashes.0.into_iter().for_each(|(hash, color)| { - large_hashes - .0 - .entry(hash) - .and_modify(|entry| { - // Hash is already present. - // Update the current color by adding the indices from - // small_colors. - let ids = small_colors.indices(&color); - let new_color = large_colors.update(Some(*entry), ids).unwrap(); - *entry = new_color; - }) - .or_insert_with(|| { - // In this case, the hash was not present yet. - // we need to create the same color from small_colors - // into large_colors. 
- let ids = small_colors.indices(&color); - let new_color = large_colors.update(None, ids).unwrap(); - assert_eq!(new_color, color); - new_color - }); - }); - - (large_hashes, large_colors) - } -} - impl LinearIndex { fn index( self, @@ -171,50 +95,6 @@ impl LinearIndex { } impl RevIndex { - pub fn load>( - _index_path: P, - _queries: Option<&[KmerMinHash]>, - ) -> Result { - unimplemented!() - /* - let (rdr, _) = niffler::from_path(index_path)?; - let revindex = if let Some(qs) = queries { - // TODO: avoid loading full revindex if query != None - /* - struct PartialRevIndex { - hashes_to_keep: Option>, - marker: PhantomData T>, - } - - impl PartialRevIndex { - pub fn new(hashes_to_keep: HashSet) -> Self { - PartialRevIndex { - hashes_to_keep: Some(hashes_to_keep), - marker: PhantomData, - } - } - } - */ - - let mut hashes: HashSet = HashSet::new(); - for q in qs { - hashes.extend(q.iter_mins()); - } - - //let mut revindex: RevIndex = PartialRevIndex::new(hashes).deserialize(&rdr).unwrap(); - - let mut revindex: RevIndex = serde_json::from_reader(rdr)?; - revindex.hash_to_color.retain(&hashes); - revindex - } else { - // Load the full revindex - serde_json::from_reader(rdr)? 
- }; - - Ok(revindex) - */ - } - pub fn new( search_sigs: &[PathBuf], template: &Sketch, @@ -380,32 +260,6 @@ impl RevIndex { containment: bool, _ignore_scaled: bool, ) -> Result> { - /* - let template_mh = None; - if let Sketch::MinHash(mh) = self.template { - template_mh = Some(mh); - }; - // TODO: throw error - let template_mh = template_mh.unwrap(); - - let tmp_mh; - let mh = if template_mh.scaled() > mh.scaled() { - // TODO: proper error here - tmp_mh = mh.downsample_scaled(self.scaled)?; - &tmp_mh - } else { - mh - }; - - if self.scaled < mh.scaled() && !ignore_scaled { - return Err(LcaDBError::ScaledMismatchError { - db: self.scaled, - query: mh.scaled(), - } - .into()); - } - */ - // TODO: proper threshold calculation let threshold: usize = (threshold * (mh.size() as f64)) as _; @@ -490,43 +344,6 @@ impl<'a> Index<'a> for RevIndex { } } -/* -impl RevIndexOps for RevIndex { - /* TODO: need the repair_cf variant, not available in rocksdb-rust yet - pub fn repair(index: &Path, colors: bool); - */ - - fn matches_from_counter(&self, counter: SigCounter, threshold: usize) -> Vec<(String, usize)>; - - fn prepare_gather_counters( - &self, - query: &KmerMinHash, - ) -> (SigCounter, QueryColors, HashToColor); - - fn index(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); - - fn update(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); - - fn compact(&self); - - fn flush(&self) -> Result<()>; - - fn convert(&self, output_db: RevIndex) -> Result<()>; - - fn check(&self, quick: bool); - - fn gather( - &self, - counter: SigCounter, - query_colors: QueryColors, - hash_to_color: HashToColor, - threshold: usize, - query: &KmerMinHash, - template: &Sketch, - ) -> Result>; -} -*/ - #[cfg(test)] mod test { use super::*; diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 460a5429af..42a9837d13 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ 
-8,11 +8,12 @@ use std::sync::Arc; use byteorder::{LittleEndian, WriteBytesExt}; use enum_dispatch::enum_dispatch; - +use nohash_hasher::BuildNoHashHasher; use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; use crate::collection::CollectionSet; -use crate::encodings::{Color, Idx}; +use crate::encodings::{Color, Colors, Idx}; use crate::index::{GatherResult, SigCounter}; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; @@ -20,11 +21,12 @@ use crate::sketch::Sketch; use crate::HashIntoType; use crate::Result; -//type DB = rocksdb::DBWithThreadMode; type DB = rocksdb::DBWithThreadMode; type QueryColors = HashMap; -type HashToColor = HashMap; +type HashToColorT = HashMap>; +#[derive(Serialize, Deserialize)] +pub struct HashToColor(HashToColorT); const HASHES: &str = "hashes"; const COLORS: &str = "colors"; @@ -75,6 +77,83 @@ pub trait RevIndexOps { ) -> Result>; } +impl HashToColor { + fn new() -> Self { + HashToColor(HashMap::< + HashIntoType, + Color, + BuildNoHashHasher, + >::with_hasher(BuildNoHashHasher::default())) + } + + fn get(&self, hash: &HashIntoType) -> Option<&Color> { + self.0.get(hash) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } + + fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { + let mut color = None; + + matched_hashes.into_iter().for_each(|hash| { + color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); + self.0.insert(hash, color.unwrap()); + }); + } + + fn reduce_hashes_colors( + a: (HashToColor, Colors), + b: (HashToColor, Colors), + ) -> (HashToColor, Colors) { + let ((small_hashes, small_colors), (mut large_hashes, mut large_colors)) = + if a.0.len() > b.0.len() { + (b, a) + } else { + (a, b) + }; + + small_hashes.0.into_iter().for_each(|(hash, color)| { + large_hashes + .0 + .entry(hash) + .and_modify(|entry| { + // Hash is already present. 
+ // Update the current color by adding the indices from + // small_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(Some(*entry), ids).unwrap(); + *entry = new_color; + }) + .or_insert_with(|| { + // In this case, the hash was not present yet. + // we need to create the same color from small_colors + // into large_colors. + let ids = small_colors.indices(&color); + let new_color = large_colors.update(None, ids).unwrap(); + assert_eq!(new_color, color); + new_color + }); + }); + + (large_hashes, large_colors) + } +} + +impl FromIterator<(HashIntoType, Color)> for HashToColor { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + HashToColor(HashToColorT::from_iter(iter)) + } +} + impl RevIndex { /* TODO: need the repair_cf variant, not available in rocksdb-rust yet pub fn repair(index: &Path, colors: bool) { @@ -387,27 +466,30 @@ fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { } } -fn build_template(ksize: u8, scaled: usize) -> Sketch { - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(ksize as u32) - .max_hash(max_hash) - .build(); - Sketch::MinHash(template_mh) -} - #[cfg(test)] mod test { use camino::Utf8PathBuf as PathBuf; use tempfile::TempDir; + use crate::collection::Collection; use crate::prelude::*; + use crate::selection::Selection; + use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; + use crate::sketch::Sketch; use crate::Result; - use crate::{collection::Collection, index::Selection}; - use super::{build_template, prepare_query, RevIndex, RevIndexOps}; + use super::{prepare_query, RevIndex, RevIndexOps}; + + fn build_template(ksize: u8, scaled: usize) -> Sketch { + let max_hash = max_hash_for_scaled(scaled as u64); + let template_mh = KmerMinHash::builder() + .num(0u32) + .ksize(ksize as u32) + .max_hash(max_hash) + .build(); + Sketch::MinHash(template_mh) + } #[test] fn revindex_index() -> 
Result<()> { diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index dc88d34363..da383372a0 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -30,7 +30,7 @@ pub mod cmd; pub mod collection; pub mod index; pub mod manifest; -pub mod picklist; +pub mod selection; pub mod signature; pub mod sketch; pub mod storage; diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index ca2602b269..39647480f2 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -8,7 +8,7 @@ use serde::de; use serde::{Deserialize, Serialize}; use crate::encodings::HashFunctions; -use crate::index::Selection; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::Sketch; use crate::Result; @@ -163,8 +163,10 @@ impl Manifest { pub fn iter(&self) -> impl Iterator { self.records.iter() } +} - pub fn select_to_manifest(&self, selection: &Selection) -> Result { +impl Select for Manifest { + fn select(self, selection: &Selection) -> Result { let rows = self.records.iter().filter(|row| { let mut valid = true; valid = if let Some(ksize) = selection.ksize() { diff --git a/src/core/src/picklist.rs b/src/core/src/picklist.rs deleted file mode 100644 index 943d3f051a..0000000000 --- a/src/core/src/picklist.rs +++ /dev/null @@ -1,29 +0,0 @@ -use getset::{CopyGetters, Getters, Setters}; -use typed_builder::TypedBuilder; - -#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] -pub struct Picklist { - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - coltype: String, - - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - pickfile: String, - - #[getset(get = "pub", set = "pub")] - #[builder(default = "".into())] - column_name: String, - - #[getset(get = "pub", set = "pub")] - #[builder] - pickstyle: PickStyle, -} - -#[derive(Clone, Default, Debug)] -#[repr(u32)] -pub enum PickStyle { - #[default] - Include = 1, - Exclude = 2, -} diff --git a/src/core/src/prelude.rs 
b/src/core/src/prelude.rs index ef7d4aa27b..90598186c4 100644 --- a/src/core/src/prelude.rs +++ b/src/core/src/prelude.rs @@ -1,27 +1,28 @@ use std::io::Write; -use crate::Error; +use crate::Result; +pub use crate::selection::{Select, Selection}; pub use crate::signature::Signature; pub use crate::storage::Storage; pub trait ToWriter { - fn to_writer(&self, writer: &mut W) -> Result<(), Error> + fn to_writer(&self, writer: &mut W) -> Result<()> where W: Write; } pub trait Update { - fn update(&self, other: &mut O) -> Result<(), Error>; + fn update(&self, other: &mut O) -> Result<()>; } pub trait FromFactory { - fn factory(&self, name: &str) -> Result; + fn factory(&self, name: &str) -> Result; } /// Implemented by anything that wants to read specific data from a storage. pub trait ReadData { - fn data(&self) -> Result<&D, Error>; + fn data(&self) -> Result<&D>; } // TODO: split into two traits, Similarity and Containment? diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs new file mode 100644 index 0000000000..3e18f8fb31 --- /dev/null +++ b/src/core/src/selection.rs @@ -0,0 +1,141 @@ +use getset::{CopyGetters, Getters, Setters}; +use typed_builder::TypedBuilder; + +use crate::encodings::HashFunctions; +use crate::manifest::Record; +use crate::signature::SigsTrait; +use crate::sketch::Sketch; +use crate::Result; + +#[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] +pub struct Picklist { + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + coltype: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + pickfile: String, + + #[getset(get = "pub", set = "pub")] + #[builder(default = "".into())] + column_name: String, + + #[getset(get = "pub", set = "pub")] + #[builder] + pickstyle: PickStyle, +} + +#[derive(Clone, Default, Debug)] +#[repr(u32)] +pub enum PickStyle { + #[default] + Include = 1, + Exclude = 2, +} + +#[derive(Default, Debug)] +pub struct Selection { + ksize: 
Option, + abund: Option, + num: Option, + scaled: Option, + containment: Option, + moltype: Option, + picklist: Option, +} + +pub trait Select { + fn select(self, selection: &Selection) -> Result + where + Self: Sized; +} + +impl Selection { + pub fn ksize(&self) -> Option { + self.ksize + } + + pub fn set_ksize(&mut self, ksize: u32) { + self.ksize = Some(ksize); + } + + pub fn abund(&self) -> Option { + self.abund + } + + pub fn set_abund(&mut self, value: bool) { + self.abund = Some(value); + } + + pub fn num(&self) -> Option { + self.num + } + + pub fn set_num(&mut self, num: u32) { + self.num = Some(num); + } + + pub fn scaled(&self) -> Option { + self.scaled + } + + pub fn set_scaled(&mut self, scaled: u32) { + self.scaled = Some(scaled); + } + + pub fn containment(&self) -> Option { + self.containment + } + + pub fn set_containment(&mut self, containment: bool) { + self.containment = Some(containment); + } + + pub fn moltype(&self) -> Option { + self.moltype + } + + pub fn set_moltype(&mut self, value: HashFunctions) { + self.moltype = Some(value); + } + + pub fn picklist(&self) -> Option { + self.picklist.clone() + } + + pub fn set_picklist(&mut self, value: Picklist) { + self.picklist = Some(value); + } + + pub fn from_template(template: &Sketch) -> Self { + let (num, scaled) = match template { + Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), + _ => (None, None), + }; + + Selection { + ksize: Some(template.ksize() as u32), + abund: None, + containment: None, + //moltype: Some(template.hash_function()), + moltype: None, + num, + picklist: None, + scaled, + } + } + + pub fn from_record(row: &Record) -> Result { + Ok(Self { + ksize: Some(*row.ksize()), + abund: Some(*row.with_abundance()), + moltype: Some(row.moltype()), + num: None, + scaled: None, + containment: None, + picklist: None, + }) + } +} diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs 
index 19ec308617..9ac9bfe2aa 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -17,8 +17,8 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID}; -use crate::index::Selection; use crate::prelude::*; +use crate::selection::{Select, Selection}; use crate::sketch::Sketch; use crate::Error; use crate::HashIntoType; @@ -534,32 +534,6 @@ impl Signature { None } - pub fn select(mut self, selection: &Selection) -> Result { - self.signatures.retain(|s| { - let mut valid = true; - valid = if let Some(ksize) = selection.ksize() { - let k = s.ksize() as u32; - k == ksize || k == ksize * 3 - } else { - valid - }; - /* - valid = if let Some(abund) = selection.abund() { - valid && *s.with_abundance() == abund - } else { - valid - }; - valid = if let Some(moltype) = selection.moltype() { - valid && s.moltype() == moltype - } else { - valid - }; - */ - valid - }); - Ok(self) - } - pub fn from_path>(path: P) -> Result, Error> { let mut reader = io::BufReader::new(File::open(path)?); Signature::from_reader(&mut reader) @@ -787,6 +761,34 @@ impl ToWriter for Signature { } } +impl Select for Signature { + fn select(mut self, selection: &Selection) -> Result { + self.signatures.retain(|s| { + let mut valid = true; + valid = if let Some(ksize) = selection.ksize() { + let k = s.ksize() as u32; + k == ksize || k == ksize * 3 + } else { + valid + }; + /* + valid = if let Some(abund) = selection.abund() { + valid && *s.with_abundance() == abund + } else { + valid + }; + valid = if let Some(moltype) = selection.moltype() { + valid && s.moltype() == moltype + } else { + valid + }; + */ + valid + }); + Ok(self) + } +} + impl Default for Signature { fn default() -> Signature { Signature { diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 3e2a2cab6e..ad017e65a7 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -13,7 +13,6 @@ use 
thiserror::Error; use typed_builder::TypedBuilder; use crate::errors::ReadDataError; -use crate::index::Selection; use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::Sketch; @@ -431,8 +430,10 @@ impl SigStore { pub fn name(&self) -> String { self.name.clone() } +} - pub fn select(mut self, selection: &Selection) -> Result { +impl Select for SigStore { + fn select(mut self, selection: &Selection) -> Result { // TODO: find better error let sig = self.data.take().ok_or(Error::MismatchKSizes)?; self.data = OnceCell::with_value(sig.select(selection)?); From b77a452c4dfc13d03c9788a739c4fb2d4cc49532 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 16 Sep 2023 17:38:18 -0700 Subject: [PATCH 03/19] make storage and manifest private in collection --- src/core/src/collection.rs | 43 +++++++++- src/core/src/index/linear.rs | 85 ++++++-------------- src/core/src/index/revindex/disk_revindex.rs | 30 ++++--- src/core/src/index/revindex/mem_revindex.rs | 36 +++------ src/core/src/index/revindex/mod.rs | 4 +- 5 files changed, 94 insertions(+), 104 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 8f3b049313..eaed3fbffa 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -9,9 +9,12 @@ use crate::signature::Signature; use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; use crate::Result; +#[cfg(feature = "parallel")] +use rayon::prelude::*; + pub struct Collection { - pub(crate) manifest: Manifest, - pub(crate) storage: InnerStorage, + manifest: Manifest, + storage: InnerStorage, } pub struct CollectionSet { @@ -60,6 +63,38 @@ impl CollectionSet { } impl Collection { + pub fn new(manifest: Manifest, storage: InnerStorage) -> Self { + Self { manifest, storage } + } + + pub fn iter(&self) -> impl Iterator { + self.manifest.iter().enumerate().map(|(i, r)| (i as Idx, r)) + } + + #[cfg(feature = "parallel")] + pub fn par_iter(&self) -> impl 
IndexedParallelIterator { + self.manifest + .par_iter() + .enumerate() + .map(|(i, r)| (i as Idx, r)) + } + + pub fn len(&self) -> usize { + self.manifest.len() + } + + pub fn is_empty(&self) -> bool { + self.manifest.len() == 0 + } + + pub fn manifest(&self) -> &Manifest { + &self.manifest + } + + pub fn storage(&self) -> &InnerStorage { + &self.storage + } + pub fn from_zipfile>(zipfile: P) -> Result { let storage = ZipStorage::from_file(zipfile)?; // Load manifest from standard location in zipstorage @@ -119,6 +154,10 @@ impl Collection { }) } + pub fn record_for_dataset(&self, dataset_id: Idx) -> Result<&Record> { + Ok(&self.manifest[dataset_id as usize]) + } + pub fn sig_for_dataset(&self, dataset_id: Idx) -> Result { let match_path = if self.manifest.is_empty() { "" diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 1b4cd2f8ec..ff919b6f57 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -10,12 +10,11 @@ use rayon::prelude::*; use crate::collection::CollectionSet; use crate::encodings::Idx; use crate::index::{GatherResult, Index, Selection, SigCounter}; -use crate::manifest::Manifest; use crate::selection::Select; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; -use crate::storage::{InnerStorage, SigStore, Storage}; +use crate::storage::SigStore; use crate::Result; pub struct LinearIndex { @@ -46,53 +45,32 @@ impl LinearIndex { } pub fn location(&self) -> Option { - if let Some(_storage) = &self.storage() { - // storage.path() - unimplemented!() - } else { - None - } - } - - pub fn storage(&self) -> Option { - Some(self.collection.storage.clone()) + unimplemented!() } pub fn counter_for_query(&self, query: &KmerMinHash) -> SigCounter { let processed_sigs = AtomicUsize::new(0); - let search_sigs: Vec<_> = self - .collection - .manifest - .internal_locations() - .map(PathBuf::from) - .collect(); - let template = self.template(); 
#[cfg(feature = "parallel")] - let sig_iter = search_sigs.par_iter(); + let sig_iter = self.collection.par_iter(); #[cfg(not(feature = "parallel"))] - let sig_iter = search_sigs.iter(); + let sig_iter = self.collection.iter(); + + let counters = sig_iter.filter_map(|(dataset_id, record)| { + let filename = record.internal_location(); - let counters = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { let i = processed_sigs.fetch_add(1, Ordering::SeqCst); if i % 1000 == 0 { info!("Processed {} reference sigs", i); } - let search_sig = if let Some(storage) = &self.storage() { - let sig_data = storage - .load(filename.as_str()) - .unwrap_or_else(|_| panic!("error loading {:?}", filename)); - - Signature::from_reader(sig_data.as_slice()) - } else { - Signature::from_path(filename) - } - .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) - .swap_remove(0); + let search_sig = self + .collection + .sig_for_dataset(dataset_id) + .unwrap_or_else(|_| panic!("error loading {:?}", filename)); let mut search_mh = None; if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { @@ -147,7 +125,8 @@ impl LinearIndex { for (dataset_id, size) in counter.most_common() { if size >= threshold { matches.push( - self.collection.manifest[dataset_id as usize] + self.collection + .record_for_dataset(dataset_id)? .internal_location() .to_string(), ); @@ -165,14 +144,11 @@ impl LinearIndex { query: &KmerMinHash, round: usize, ) -> Result { - let match_path = if self.collection.manifest.is_empty() { - "" - } else { - self.collection.manifest[dataset_id as usize] - .internal_location() - .as_str() - } - .into(); + let match_path = self + .collection + .record_for_dataset(dataset_id)? 
+ .internal_location() + .into(); let match_sig = self.collection.sig_for_dataset(dataset_id)?; let result = self.stats_for_match(&match_sig, query, match_size, match_path, round)?; Ok(result) @@ -289,18 +265,8 @@ impl LinearIndex { Ok(matches) } - pub fn manifest(&self) -> Manifest { - self.collection.manifest.clone() - } - - pub fn set_manifest(&mut self, new_manifest: Manifest) -> Result<()> { - self.collection.manifest = new_manifest; - Ok(()) - } - pub fn signatures_iter(&self) -> impl Iterator + '_ { - // FIXME temp solution, must find better one! - (0..self.collection.manifest.len()).map(move |dataset_id| { + (0..self.collection.len()).map(move |dataset_id| { self.collection .sig_for_dataset(dataset_id as Idx) .expect("error loading sig") @@ -339,19 +305,16 @@ impl<'a> Index<'a> for LinearIndex { } fn len(&self) -> usize { - self.collection.manifest.len() + self.collection.len() } fn signatures(&self) -> Vec { self.collection() - .manifest - .internal_locations() - .map(PathBuf::from) - .map(|p| { + .iter() + .map(|(i, p)| { self.collection() - .storage - .load_sig(p.as_str()) - .unwrap_or_else(|_| panic!("Error processing {:?}", p)) + .sig_for_dataset(i as Idx) + .unwrap_or_else(|_| panic!("Error processing {}", p.internal_location())) }) .collect() } diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 4c4064acc4..e8b440d745 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -80,19 +80,14 @@ impl RevIndex { collection: Arc::new(collection), }; - index - .collection - .manifest - .par_iter() - .enumerate() - .for_each(|(dataset_id, _)| { - let i = processed_sigs.fetch_add(1, Ordering::SeqCst); - if i % 1000 == 0 { - info!("Processed {} reference sigs", i); - } + index.collection.par_iter().for_each(|(dataset_id, _)| { + let i = processed_sigs.fetch_add(1, Ordering::SeqCst); + if i % 1000 == 0 { + info!("Processed {} reference sigs", i); + } 
- index.map_hashes_colors(dataset_id as Idx); - }); + index.map_hashes_colors(dataset_id as Idx); + }); index.save_collection().expect("Error saving collection"); @@ -143,7 +138,7 @@ impl RevIndex { InnerStorage::from_spec(spec)? }; - Collection { manifest, storage }.try_into() + Collection::new(manifest, storage).try_into() } fn save_collection(&self) -> Result<()> { @@ -152,12 +147,12 @@ impl RevIndex { // write manifest let mut wtr = vec![]; { - self.collection.manifest.to_writer(&mut wtr)?; + self.collection.manifest().to_writer(&mut wtr)?; } self.db.put_cf(&cf_metadata, MANIFEST, &wtr[..])?; // write storage spec - let spec = self.collection.storage.spec(); + let spec = self.collection.storage().spec(); // TODO: check if spec if memstorage, would probably have to // save into rocksdb in that case! @@ -269,7 +264,10 @@ impl RevIndexOps for RevIndex { .into_iter() .filter_map(|(dataset_id, size)| { if size >= threshold { - let row = &self.collection.manifest[dataset_id as usize]; + let row = &self + .collection + .record_for_dataset(dataset_id) + .expect("dataset not found"); Some((row.name().into(), size)) } else { None diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index b951d9513d..e3efb7146a 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -1,17 +1,14 @@ -use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; use camino::Utf8Path as Path; use camino::Utf8PathBuf as PathBuf; use log::{debug, info}; -use nohash_hasher::BuildNoHashHasher; -use serde::{Deserialize, Serialize}; #[cfg(feature = "parallel")] use rayon::prelude::*; use crate::collection::Collection; -use crate::encodings::{Color, Colors, Idx}; +use crate::encodings::{Colors, Idx}; use crate::index::linear::LinearIndex; use crate::index::revindex::HashToColor; use crate::index::{GatherResult, Index, SigCounter}; @@ -19,8 +16,6 @@ use crate::prelude::*; use 
crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; -use crate::storage::Storage; -use crate::HashIntoType; use crate::Result; pub struct RevIndex { @@ -38,20 +33,13 @@ impl LinearIndex { ) -> RevIndex { let processed_sigs = AtomicUsize::new(0); - let search_sigs: Vec<_> = self - .collection() - .manifest - .internal_locations() - .map(PathBuf::from) - .collect(); - #[cfg(feature = "parallel")] - let sig_iter = search_sigs.par_iter(); + let sig_iter = self.collection().par_iter(); #[cfg(not(feature = "parallel"))] - let sig_iter = search_sigs.iter(); + let sig_iter = self.collection().iter(); - let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, filename)| { + let filtered_sigs = sig_iter.enumerate().filter_map(|(dataset_id, _)| { let i = processed_sigs.fetch_add(1, Ordering::SeqCst); if i % 1000 == 0 { info!("Processed {} reference sigs", i); @@ -59,13 +47,12 @@ impl LinearIndex { let search_sig = self .collection() - .storage - .load_sig(filename.as_str()) - .unwrap_or_else(|_| panic!("Error processing {:?}", filename)) + .sig_for_dataset(dataset_id as Idx) + .expect("Error loading sig") .into(); RevIndex::map_hashes_colors( - dataset_id, + dataset_id as Idx, &search_sig, queries, &merged_query, @@ -160,7 +147,7 @@ impl RevIndex { } fn map_hashes_colors( - dataset_id: usize, + dataset_id: Idx, search_sig: &Signature, queries: Option<&[KmerMinHash]>, merged_query: &Option, @@ -275,8 +262,11 @@ impl RevIndex { let match_size = if size >= threshold { size } else { break }; let match_sig = self.linear.sig_for_dataset(dataset_id)?; - let match_path = - self.linear.collection().manifest[dataset_id as usize].internal_location(); + let match_path = self + .linear + .collection() + .record_for_dataset(dataset_id)? 
+ .internal_location(); let mut match_mh = None; if let Some(Sketch::MinHash(mh)) = match_sig.select_sketch(self.linear.template()) { diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 42a9837d13..ec7c6a00ad 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -98,11 +98,11 @@ impl HashToColor { self.0.is_empty() } - fn add_to(&mut self, colors: &mut Colors, dataset_id: usize, matched_hashes: Vec) { + fn add_to(&mut self, colors: &mut Colors, dataset_id: Idx, matched_hashes: Vec) { let mut color = None; matched_hashes.into_iter().for_each(|hash| { - color = Some(colors.update(color, &[dataset_id as Idx]).unwrap()); + color = Some(colors.update(color, &[dataset_id]).unwrap()); self.0.insert(hash, color.unwrap()); }); } From 4fd2f467bea362d79b8d9c8aa012a1fe50c5bc2c Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 16 Sep 2023 21:39:56 -0700 Subject: [PATCH 04/19] bring back update --- src/core/src/collection.rs | 11 +++- src/core/src/index/revindex/disk_revindex.rs | 68 +++++--------------- src/core/src/index/revindex/mod.rs | 28 ++++---- src/core/src/manifest.rs | 2 +- 4 files changed, 44 insertions(+), 65 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index eaed3fbffa..4e5a81a6ba 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -7,7 +7,7 @@ use crate::manifest::{Manifest, Record}; use crate::prelude::*; use crate::signature::Signature; use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, Storage, ZipStorage}; -use crate::Result; +use crate::{Error, Result}; #[cfg(feature = "parallel")] use rayon::prelude::*; @@ -95,6 +95,15 @@ impl Collection { &self.storage } + pub fn check_superset(&self, other: &Collection) -> Result { + self.iter() + .zip(other.iter()) + .all(|((id1, rec1), (id2, rec2))| id1 == id2 && rec1 == rec2) + .then(|| self.len()) + // TODO: right error here + .ok_or(Error::MismatchKSizes) + } + 
pub fn from_zipfile>(zipfile: P) -> Result { let storage = ZipStorage::from_file(zipfile)?; // Load manifest from standard location in zipstorage diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index e8b440d745..8bf98cde09 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -1,5 +1,5 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -62,7 +62,7 @@ pub fn repair(path: &Path) { */ impl RevIndex { - pub fn create(path: &Path, collection: CollectionSet) -> module::RevIndex { + pub fn create(path: &Path, collection: CollectionSet) -> Result { let mut opts = module::RevIndex::db_options(); opts.create_if_missing(true); opts.create_missing_column_families(true); @@ -95,7 +95,7 @@ impl RevIndex { index.compact(); info!("Processed {} reference sigs", processed_sigs.into_inner()); - module::RevIndex::Plain(index) + Ok(module::RevIndex::Plain(index)) } pub fn open>(path: P, read_only: bool) -> Result { @@ -374,60 +374,20 @@ impl RevIndexOps for RevIndex { Ok(matches) } - fn update( - &self, - _index_sigs: Vec, - _template: &Sketch, - _threshold: f64, - _save_paths: bool, - ) { - todo!() - /* - use byteorder::ReadBytesExt; - - if !save_paths { - todo!("only supports with save_paths=True for now"); - } - - let cf_sigs = self.db.cf_handle(SIGS).unwrap(); - let iter = self.db.iterator_cf(&cf_sigs, rocksdb::IteratorMode::Start); - - info!("Verifying existing sigs"); - // verify data match up to this point - let mut max_dataset_id = 0; - let to_skip = iter - .map(|result| { - let (key, value) = result.unwrap(); - let current_dataset_id = (&key[..]).read_u64::().unwrap(); - - let filename = &index_sigs[current_dataset_id as usize]; - let sig_data = SignatureData::from_slice(&value).unwrap(); - match sig_data { - 
SignatureData::External(sig) => { - assert_eq!(sig, filename.as_os_str().to_str().unwrap().to_string()) - } - SignatureData::Empty => (), - SignatureData::Internal(_) => { - todo!("only supports with save_paths=True for now") - } - }; - max_dataset_id = max_dataset_id.max(current_dataset_id); - }) - .count(); - - max_dataset_id += 1; - assert_eq!(max_dataset_id as usize, to_skip); + fn update(mut self, collection: CollectionSet) -> Result { + // TODO: verify new collection manifest is a superset of current one, + // and the initial chunk is the same + let to_skip = self.collection.check_superset(&collection)?; // process the remainder let processed_sigs = AtomicUsize::new(0); - index_sigs + self.collection = Arc::new(collection); + + self.collection .par_iter() .skip(to_skip) - .enumerate() - .for_each(|(i, filename)| { - let dataset_id = i + to_skip; - + .for_each(|(dataset_id, _)| { let i = processed_sigs.fetch_add(1, Ordering::SeqCst); if i % 1000 == 0 { info!("Processed {} reference sigs", i); @@ -436,11 +396,15 @@ impl RevIndexOps for RevIndex { self.map_hashes_colors(dataset_id as Idx); }); + info!("Compact SSTs"); + self.compact(); + info!( "Processed additional {} reference sigs", processed_sigs.into_inner() ); - */ + + Ok(module::RevIndex::Plain(self)) } fn check(&self, quick: bool) { diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index ec7c6a00ad..c1762811a9 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -3,7 +3,7 @@ pub mod mem_revindex; use std::collections::HashMap; use std::hash::{Hash, Hasher}; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::Arc; use byteorder::{LittleEndian, WriteBytesExt}; @@ -56,7 +56,9 @@ pub trait RevIndexOps { query: &KmerMinHash, ) -> (SigCounter, QueryColors, HashToColor); - fn update(&self, index_sigs: Vec, template: &Sketch, threshold: f64, save_paths: bool); + fn update(self, collection: CollectionSet) -> Result + where 
+ Self: Sized; fn compact(&self); @@ -165,7 +167,11 @@ impl RevIndex { } */ - pub fn create>(index: P, collection: CollectionSet, colors: bool) -> Self { + pub fn create>( + index: P, + collection: CollectionSet, + colors: bool, + ) -> Result { if colors { todo!() //color_revindex::ColorRevIndex::create(index) } else { @@ -518,7 +524,7 @@ mod test { let collection = Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; - let index = RevIndex::create(output.path(), collection.try_into()?, false); + let index = RevIndex::create(output.path(), collection.try_into()?, false)?; let counter = index.counter_for_query(&query); let matches = index.matches_from_counter(counter, 0); @@ -528,7 +534,6 @@ mod test { Ok(()) } - /* #[test] fn revindex_update() -> Result<()> { let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -547,8 +552,9 @@ mod test { let mut new_siglist = siglist.clone(); { - let index = RevIndex::create(output.path(), false); - index.index(siglist, &template, 0., true); + let collection = + Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + RevIndex::create(output.path(), collection.try_into()?, false)?; } let mut filename = basedir.clone(); @@ -564,18 +570,18 @@ mod test { } let query = query.unwrap(); - let index = RevIndex::open(output.path(), false); - index.update(new_siglist, &template, 0., true); + let new_collection = + Collection::from_paths(&new_siglist)?.select(&Selection::from_template(&template))?; + let index = RevIndex::open(output.path(), false)?.update(new_collection.try_into()?)?; let counter = index.counter_for_query(&query); let matches = index.matches_from_counter(counter, 0); - assert!(matches[0].0.ends_with("/genome-s12.fa.gz.sig")); + assert!(matches[0].0.ends_with("/genome-s12.fa.gz")); assert_eq!(matches[0].1, 45); Ok(()) } - */ #[test] fn revindex_load_and_gather() -> Result<()> { diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index 
39647480f2..de3075faa9 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -13,7 +13,7 @@ use crate::signature::{Signature, SigsTrait}; use crate::sketch::Sketch; use crate::Result; -#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters)] +#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters, PartialEq)] pub struct Record { #[getset(get = "pub", set = "pub")] internal_location: PathBuf, From 6e69b7f98e1d04c440db4961cac5c35991cb3d86 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 13:41:36 -0700 Subject: [PATCH 05/19] cleanup template, replace with selection --- src/core/src/collection.rs | 4 + src/core/src/ffi/index/revindex.rs | 25 +++- src/core/src/index/revindex/disk_revindex.rs | 18 +-- src/core/src/index/revindex/mem_revindex.rs | 94 ++++--------- src/core/src/index/revindex/mod.rs | 136 +++++++------------ src/core/src/selection.rs | 56 ++++---- src/core/src/signature.rs | 2 + src/core/src/sketch/minhash.rs | 2 + 8 files changed, 142 insertions(+), 195 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 4e5a81a6ba..ca340af011 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -60,6 +60,10 @@ impl CollectionSet { pub fn into_inner(self) -> Collection { self.collection } + + pub fn selection(&self) -> Selection { + todo!("Extract selection from first sig") + } } impl Collection { diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index ab24e2ac3e..e38bdef7fb 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -8,6 +8,7 @@ use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::{ForeignObject, SourmashStr}; use crate::index::revindex::mem_revindex::RevIndex; use crate::index::Index; +use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -18,6 +19,21 
@@ impl ForeignObject for SourmashRevIndex { type RustObject = RevIndex; } +// TODO: remove this when it is possible to pass Selection thru the FFI +fn from_template(template: &Sketch) -> Selection { + let (num, scaled) = match template { + Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32), + Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32), + _ => unimplemented!(), + }; + + Selection::builder() + .ksize(template.ksize() as u32) + .num(num) + .scaled(scaled) + .build() +} + ffi_fn! { unsafe fn revindex_new_with_paths( search_sigs_ptr: *const *const SourmashStr, @@ -58,9 +74,12 @@ unsafe fn revindex_new_with_paths( .collect(); Some(queries_vec.as_ref()) }; + + let selection = from_template(&template); + let revindex = RevIndex::new( search_sigs.as_ref(), - &template, + &selection, threshold, queries, keep_sigs, @@ -105,7 +124,9 @@ unsafe fn revindex_new_with_sigs( .collect(); Some(queries_vec.as_ref()) }; - let revindex = RevIndex::new_with_sigs(search_sigs, &template, threshold, queries)?; + + let selection = from_template(&template); + let revindex = RevIndex::new_with_sigs(search_sigs, &selection, threshold, queries)?; Ok(SourmashRevIndex::from_rust(revindex)) } } diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 8bf98cde09..d63abf58cd 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -17,6 +17,7 @@ use crate::index::revindex::{ }; use crate::index::{GatherResult, SigCounter}; use crate::manifest::Manifest; +use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -283,11 +284,12 @@ impl RevIndexOps for RevIndex { hash_to_color: HashToColor, threshold: usize, orig_query: &KmerMinHash, - template: &Sketch, + selection: Option, ) -> Result> { let mut match_size = usize::max_value(); let mut matches = vec![]; //let mut query: KmerMinHashBTree = 
orig_query.clone().into(); + let selection = selection.unwrap_or_else(|| self.collection.selection()); while match_size > threshold && !counter.is_empty() { trace!("counter len: {}", counter.len()); @@ -298,22 +300,20 @@ impl RevIndexOps for RevIndex { let match_sig = self.collection.sig_for_dataset(dataset_id)?; - let match_mh = - prepare_query(&match_sig, template).expect("Couldn't find a compatible MinHash"); - // Calculate stats let f_orig_query = match_size as f64 / orig_query.size() as f64; - let f_match = match_size as f64 / match_mh.size() as f64; let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; let gather_result_rank = matches.len(); + let match_ = match_sig.clone(); + let md5 = match_sig.md5sum(); + let match_mh = prepare_query(match_sig.into(), &selection) + .expect("Couldn't find a compatible MinHash"); + let f_match = match_size as f64 / match_mh.size() as f64; + let unique_intersect_bp = match_mh.scaled() as usize * match_size; let (intersect_orig, _) = match_mh.intersection_size(orig_query)?; let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; - let f_unique_to_query = intersect_orig as f64 / orig_query.size() as f64; - let match_ = match_sig.clone(); - let md5 = match_sig.md5sum(); // TODO: all of these let filename = "".into(); diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index e3efb7146a..2d37d4c274 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -84,7 +84,7 @@ impl LinearIndex { impl RevIndex { pub fn new( search_sigs: &[PathBuf], - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, _keep_sigs: bool, @@ -92,8 +92,7 @@ impl RevIndex { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - 
Collection::from_paths(search_sigs)?.select(&Selection::from_template(template))?; + let collection = Collection::from_paths(search_sigs)?.select(&selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); Ok(linear.index(threshold, merged_query, queries)) @@ -101,7 +100,7 @@ impl RevIndex { pub fn from_zipfile>( zipfile: P, - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, _keep_sigs: bool, @@ -109,8 +108,7 @@ impl RevIndex { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - Collection::from_zipfile(zipfile)?.select(&Selection::from_template(template))?; + let collection = Collection::from_zipfile(zipfile)?.select(&selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); Ok(linear.index(threshold, merged_query, queries)) @@ -130,15 +128,14 @@ impl RevIndex { pub fn new_with_sigs( search_sigs: Vec, - template: &Sketch, + selection: &Selection, threshold: usize, queries: Option<&[KmerMinHash]>, ) -> Result { // If threshold is zero, let's merge all queries and save time later let merged_query = queries.and_then(|qs| Self::merge_queries(qs, threshold)); - let collection = - Collection::from_sigs(search_sigs)?.select(&Selection::from_template(template))?; + let collection = Collection::from_sigs(search_sigs)?.select(selection)?; let linear = LinearIndex::from_collection(collection.try_into()?); let idx = linear.index(threshold, merged_query, queries); @@ -338,24 +335,18 @@ impl<'a> Index<'a> for RevIndex { mod test { use super::*; + use crate::index::revindex::prepare_query; use crate::sketch::minhash::max_hash_for_scaled; use crate::Result; #[test] fn revindex_new() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let 
selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; + let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?; assert_eq!(index.colors.len(), 3); Ok(()) @@ -363,21 +354,14 @@ mod test { #[test] fn revindex_many() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig".into(), "../../tests/test-data/gather/GCF_000008105.1_ASM810v1_genomic.fna.gz.sig".into(), ]; - let index = RevIndex::new(&search_sigs, &template, 0, None, false)?; + let index = RevIndex::new(&search_sigs, &selection, 0, None, false)?; //dbg!(&index.linear.collection().manifest); /* dbg!(&index.colors.colors); @@ -399,14 +383,7 @@ mod test { #[test] fn revindex_from_sigs() -> Result<()> { - let max_hash = max_hash_for_scaled(10000); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(31) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let search_sigs: Vec = [ "../../tests/test-data/gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig", "../../tests/test-data/gather/GCF_000007545.1_ASM754v1_genomic.fna.gz.sig", @@ -416,7 +393,7 @@ mod test { .map(|path| Signature::from_path(path).unwrap().swap_remove(0)) .collect(); - let index = RevIndex::new_with_sigs(search_sigs, &template, 0, None)?; + let index = RevIndex::new_with_sigs(search_sigs, &selection, 0, None)?; 
/* dbg!(&index.colors.colors); 0: 86 @@ -436,18 +413,14 @@ mod test { #[test] fn revindex_from_zipstorage() -> Result<()> { - let max_hash = max_hash_for_scaled(100); - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(19) - .hash_function(crate::encodings::HashFunctions::murmur64_protein) - .max_hash(max_hash) - .build(), - ); + let selection = Selection::builder() + .ksize(19) + .scaled(100) + .moltype(crate::encodings::HashFunctions::murmur64_protein) + .build(); let index = RevIndex::from_zipfile( "../../tests/test-data/prot/protein.zip", - &template, + &selection, 0, None, false, @@ -460,34 +433,27 @@ mod test { "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", ) .expect("Error processing query") - .swap_remove(0); - - let template = Sketch::MinHash( - KmerMinHash::builder() - .num(0u32) - .ksize(57) - .hash_function(crate::encodings::HashFunctions::murmur64_protein) - .max_hash(max_hash) - .build(), - ); + .swap_remove(0) + .select(&selection)?; + let mut query_mh = None; - if let Some(Sketch::MinHash(mh)) = query_sig.select_sketch(&template) { - query_mh = Some(mh); + if let Some(q) = prepare_query(query_sig, &selection) { + query_mh = Some(q); } let query_mh = query_mh.expect("Couldn't find a compatible MinHash"); - let counter_rev = index.counter_for_query(query_mh); - let counter_lin = index.linear.counter_for_query(query_mh); + let counter_rev = index.counter_for_query(&query_mh); + let counter_lin = index.linear.counter_for_query(&query_mh); let results_rev = index.search(counter_rev, false, 0).unwrap(); let results_linear = index.linear.search(counter_lin, false, 0).unwrap(); assert_eq!(results_rev, results_linear); - let counter_rev = index.counter_for_query(query_mh); - let counter_lin = index.linear.counter_for_query(query_mh); + let counter_rev = index.counter_for_query(&query_mh); + let counter_lin = index.linear.counter_for_query(&query_mh); - let results_rev = 
index.gather(counter_rev, 0, query_mh).unwrap(); - let results_linear = index.linear.gather(counter_lin, 0, query_mh).unwrap(); + let results_rev = index.gather(counter_rev, 0, &query_mh).unwrap(); + let results_linear = index.linear.gather(counter_lin, 0, &query_mh).unwrap(); assert_eq!(results_rev.len(), 1); assert_eq!(results_rev, results_linear); diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index c1762811a9..30f7630c1b 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -15,8 +15,9 @@ use serde::{Deserialize, Serialize}; use crate::collection::CollectionSet; use crate::encodings::{Color, Colors, Idx}; use crate::index::{GatherResult, SigCounter}; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; +use crate::prelude::*; +use crate::signature::Signature; +use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::HashIntoType; use crate::Result; @@ -75,7 +76,7 @@ pub trait RevIndexOps { hash_to_color: HashToColor, threshold: usize, query: &KmerMinHash, - template: &Sketch, + selection: Option, ) -> Result>; } @@ -215,53 +216,16 @@ impl RevIndex { } } -fn check_compatible_downsample(me: &KmerMinHash, other: &KmerMinHash) -> Result<()> { - /* - if self.num != other.num { - return Err(Error::MismatchNum { - n1: self.num, - n2: other.num, - } - .into()); - } - */ - use crate::Error; - - if me.ksize() != other.ksize() { - return Err(Error::MismatchKSizes); - } - if me.hash_function() != other.hash_function() { - // TODO: fix this error - return Err(Error::MismatchDNAProt); - } - if me.max_hash() < other.max_hash() { - return Err(Error::MismatchScaled); - } - if me.seed() != other.seed() { - return Err(Error::MismatchSeed); - } - Ok(()) -} +pub fn prepare_query(search_sig: Signature, selection: &Selection) -> Option { + let sig = search_sig.select(selection).ok(); -pub fn prepare_query(search_sig: &Signature, 
template: &Sketch) -> Option { - let mut search_mh = None; - if let Some(Sketch::MinHash(mh)) = search_sig.select_sketch(template) { - search_mh = Some(mh.clone()); - } else { - // try to find one that can be downsampled - if let Sketch::MinHash(template_mh) = template { - for sketch in search_sig.sketches() { - if let Sketch::MinHash(ref_mh) = sketch { - if check_compatible_downsample(&ref_mh, template_mh).is_ok() { - let max_hash = max_hash_for_scaled(template_mh.scaled()); - let mh = ref_mh.downsample_max_hash(max_hash).unwrap(); - search_mh = Some(mh); - } - } - } + sig.and_then(|sig| { + if let Sketch::MinHash(mh) = sig.sketches().swap_remove(0) { + Some(mh) + } else { + None } - } - search_mh + }) } #[derive(Debug, Default, PartialEq, Clone)] @@ -481,22 +445,12 @@ mod test { use crate::collection::Collection; use crate::prelude::*; use crate::selection::Selection; - use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHash}; + use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; use crate::Result; use super::{prepare_query, RevIndex, RevIndexOps}; - fn build_template(ksize: u8, scaled: usize) -> Sketch { - let max_hash = max_hash_for_scaled(scaled as u64); - let template_mh = KmerMinHash::builder() - .num(0u32) - .ksize(ksize as u32) - .max_hash(max_hash) - .build(); - Sketch::MinHash(template_mh) - } - #[test] fn revindex_index() -> Result<()> { let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -510,20 +464,19 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; - let query_sig = Signature::from_path(&siglist[0])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&siglist[0])? 
+ .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; let index = RevIndex::create(output.path(), collection.try_into()?, false)?; let counter = index.counter_for_query(&query); @@ -547,13 +500,12 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; let mut new_siglist = siglist.clone(); { - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; RevIndex::create(output.path(), collection.try_into()?, false)?; } @@ -561,17 +513,16 @@ mod test { filename.push("genome-s12.fa.gz.sig"); new_siglist.push(filename); - let query_sig = Signature::from_path(&new_siglist[2])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&new_siglist[2])? 
+ .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); - let new_collection = - Collection::from_paths(&new_siglist)?.select(&Selection::from_template(&template))?; + let new_collection = Collection::from_paths(&new_siglist)?.select(&selection)?; let index = RevIndex::open(output.path(), false)?.update(new_collection.try_into()?)?; let counter = index.counter_for_query(&query); @@ -596,30 +547,39 @@ mod test { }) .collect(); - let template = build_template(31, 10000); + let selection = Selection::builder().ksize(31).scaled(10000).build(); let output = TempDir::new()?; - let query_sig = Signature::from_path(&siglist[0])?; let mut query = None; - for sig in &query_sig { - if let Some(q) = prepare_query(sig, &template) { - query = Some(q); - } + let query_sig = Signature::from_path(&siglist[0])? + .swap_remove(0) + .select(&selection)?; + if let Some(q) = prepare_query(query_sig, &selection) { + query = Some(q); } let query = query.unwrap(); { - let collection = - Collection::from_paths(&siglist)?.select(&Selection::from_template(&template))?; + let collection = Collection::from_paths(&siglist)?.select(&selection)?; let _index = RevIndex::create(output.path(), collection.try_into()?, false); } let index = RevIndex::open(output.path(), true)?; - let counter = index.counter_for_query(&query); - let matches = index.matches_from_counter(counter, 0); + let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query); - assert_eq!(matches, [("../genome-s10.fa.gz".into(), 48)]); + let matches = index.gather( + counter, + query_colors, + hash_to_color, + 0, + &query, + Some(selection), + )?; + + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].name(), "../genome-s10.fa.gz"); + assert_eq!(matches[0].f_match(), 1.0); Ok(()) } diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 3e18f8fb31..86d42273e0 100644 --- a/src/core/src/selection.rs +++ 
b/src/core/src/selection.rs @@ -3,10 +3,32 @@ use typed_builder::TypedBuilder; use crate::encodings::HashFunctions; use crate::manifest::Record; -use crate::signature::SigsTrait; -use crate::sketch::Sketch; use crate::Result; +#[derive(Default, Debug, TypedBuilder)] +pub struct Selection { + #[builder(default, setter(strip_option))] + ksize: Option, + + #[builder(default, setter(strip_option))] + abund: Option, + + #[builder(default, setter(strip_option))] + num: Option, + + #[builder(default, setter(strip_option))] + scaled: Option, + + #[builder(default, setter(strip_option))] + containment: Option, + + #[builder(default, setter(strip_option))] + moltype: Option, + + #[builder(default, setter(strip_option))] + picklist: Option, +} + #[derive(Default, TypedBuilder, CopyGetters, Getters, Setters, Clone, Debug)] pub struct Picklist { #[getset(get = "pub", set = "pub")] @@ -34,17 +56,6 @@ pub enum PickStyle { Exclude = 2, } -#[derive(Default, Debug)] -pub struct Selection { - ksize: Option, - abund: Option, - num: Option, - scaled: Option, - containment: Option, - moltype: Option, - picklist: Option, -} - pub trait Select { fn select(self, selection: &Selection) -> Result where @@ -108,25 +119,6 @@ impl Selection { self.picklist = Some(value); } - pub fn from_template(template: &Sketch) -> Self { - let (num, scaled) = match template { - Sketch::MinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - Sketch::LargeMinHash(mh) => (Some(mh.num()), Some(mh.scaled() as u32)), - _ => (None, None), - }; - - Selection { - ksize: Some(template.ksize() as u32), - abund: None, - containment: None, - //moltype: Some(template.hash_function()), - moltype: None, - num, - picklist: None, - scaled, - } - } - pub fn from_record(row: &Record) -> Result { Ok(Self { ksize: Some(*row.ksize()), diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 9ac9bfe2aa..dd05f9005d 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -771,6 +771,8 @@ impl 
Select for Signature { } else { valid }; + // TODO: execute downsample if needed + /* valid = if let Some(abund) = selection.abund() { valid && *s.with_abundance() == abund diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 454a50f7a7..4f61056853 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -829,6 +829,8 @@ impl SigsTrait for KmerMinHash { // TODO: fix this error return Err(Error::MismatchDNAProt); } + // TODO: if supporting downsampled to be compatible + //if self.max_hash < other.max_hash { if self.max_hash != other.max_hash { return Err(Error::MismatchScaled); } From 95bcd2eaca075ff0cfd087af8e8b851469c06c20 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 13:55:55 -0700 Subject: [PATCH 06/19] Placate linters on HashFunctions --- Makefile | 1 + src/core/src/cmd.rs | 8 +- src/core/src/encodings.rs | 34 +++--- src/core/src/ffi/minhash.rs | 9 +- src/core/src/ffi/mod.rs | 36 +++++++ src/core/src/index/revindex/mem_revindex.rs | 3 +- src/core/src/index/revindex/mod.rs | 2 - src/core/src/signature.rs | 4 +- src/core/src/sketch/hyperloglog/mod.rs | 2 +- src/core/src/sketch/minhash.rs | 36 +++---- src/core/tests/minhash.rs | 110 ++++++++++---------- 11 files changed, 139 insertions(+), 106 deletions(-) diff --git a/Makefile b/Makefile index f964bc3cce..9b26d91331 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ doc: .PHONY tox -e docs include/sourmash.h: src/core/src/lib.rs \ + src/core/src/ffi/mod.rs \ src/core/src/ffi/hyperloglog.rs \ src/core/src/ffi/minhash.rs \ src/core/src/ffi/signature.rs \ diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index 436c2ca7df..a760e0f79d 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -119,7 +119,7 @@ pub fn build_template(params: &ComputeParameters) -> Vec { KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::murmur64_protein) + .hash_function(HashFunctions::Murmur64Protein) 
.max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { @@ -136,7 +136,7 @@ pub fn build_template(params: &ComputeParameters) -> Vec { KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::murmur64_dayhoff) + .hash_function(HashFunctions::Murmur64Dayhoff) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { @@ -153,7 +153,7 @@ pub fn build_template(params: &ComputeParameters) -> Vec { KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::murmur64_hp) + .hash_function(HashFunctions::Murmur64Hp) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { @@ -170,7 +170,7 @@ pub fn build_template(params: &ComputeParameters) -> Vec { KmerMinHashBTree::builder() .num(params.num_hashes) .ksize(*k) - .hash_function(HashFunctions::murmur64_DNA) + .hash_function(HashFunctions::Murmur64Dna) .max_hash(max_hash) .seed(params.seed) .abunds(if params.track_abundance { diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index be34a00444..752b6d892f 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -22,35 +22,33 @@ pub type Idx = u32; type IdxTracker = (vec_collections::VecSet<[Idx; 8]>, u64); type ColorToIdx = HashMap>; -#[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[cfg_attr( feature = "rkyv", derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) )] -#[repr(u32)] pub enum HashFunctions { - murmur64_DNA = 1, - murmur64_protein = 2, - murmur64_dayhoff = 3, - murmur64_hp = 4, + Murmur64Dna, + Murmur64Protein, + Murmur64Dayhoff, + Murmur64Hp, } impl HashFunctions { pub fn dna(&self) -> bool { - *self == HashFunctions::murmur64_DNA + *self == HashFunctions::Murmur64Dna } pub fn protein(&self) -> bool { - *self == HashFunctions::murmur64_protein + *self == HashFunctions::Murmur64Protein } pub fn dayhoff(&self) -> bool { - *self == HashFunctions::murmur64_dayhoff + *self == 
HashFunctions::Murmur64Dayhoff } pub fn hp(&self) -> bool { - *self == HashFunctions::murmur64_hp + *self == HashFunctions::Murmur64Hp } } @@ -60,10 +58,10 @@ impl std::fmt::Display for HashFunctions { f, "{}", match self { - HashFunctions::murmur64_DNA => "dna", - HashFunctions::murmur64_protein => "protein", - HashFunctions::murmur64_dayhoff => "dayhoff", - HashFunctions::murmur64_hp => "hp", + HashFunctions::Murmur64Dna => "dna", + HashFunctions::Murmur64Protein => "protein", + HashFunctions::Murmur64Dayhoff => "dayhoff", + HashFunctions::Murmur64Hp => "hp", } ) } @@ -74,10 +72,10 @@ impl TryFrom<&str> for HashFunctions { fn try_from(moltype: &str) -> Result { match moltype.to_lowercase().as_ref() { - "dna" => Ok(HashFunctions::murmur64_DNA), - "dayhoff" => Ok(HashFunctions::murmur64_dayhoff), - "hp" => Ok(HashFunctions::murmur64_hp), - "protein" => Ok(HashFunctions::murmur64_protein), + "dna" => Ok(HashFunctions::Murmur64Dna), + "dayhoff" => Ok(HashFunctions::Murmur64Dayhoff), + "hp" => Ok(HashFunctions::Murmur64Hp), + "protein" => Ok(HashFunctions::Murmur64Protein), v => unimplemented!("{v}"), } } diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 45890b81d9..11863ba265 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -2,8 +2,9 @@ use std::ffi::CStr; use std::os::raw::c_char; use std::slice; -use crate::encodings::{aa_to_dayhoff, aa_to_hp, translate_codon, HashFunctions}; +use crate::encodings::{aa_to_dayhoff, aa_to_hp, translate_codon}; use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::ffi::HashFunctions; use crate::signature::SeqToHashes; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash; @@ -23,7 +24,7 @@ pub unsafe extern "C" fn kmerminhash_new( track_abundance: bool, n: u32, ) -> *mut SourmashKmerMinHash { - let mh = KmerMinHash::new(scaled, k, hash_function, seed, track_abundance, n); + let mh = KmerMinHash::new(scaled, k, hash_function.into(), seed, 
track_abundance, n); SourmashKmerMinHash::from_rust(mh) } @@ -367,13 +368,13 @@ pub unsafe extern "C" fn kmerminhash_hash_function( ptr: *const SourmashKmerMinHash, ) -> HashFunctions { let mh = SourmashKmerMinHash::as_rust(ptr); - mh.hash_function() + mh.hash_function().into() } ffi_fn! { unsafe fn kmerminhash_hash_function_set(ptr: *mut SourmashKmerMinHash, hash_function: HashFunctions) -> Result<()> { let mh = SourmashKmerMinHash::as_rust_mut(ptr); - mh.set_hash_function(hash_function) + mh.set_hash_function(hash_function.into()) } } diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index a67de37176..6e28c648cf 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -29,3 +29,39 @@ pub unsafe extern "C" fn hash_murmur(kmer: *const c_char, seed: u64) -> u64 { _hash_murmur(c_str.to_bytes(), seed) } + +#[repr(u32)] +pub enum HashFunctions { + Murmur64Dna = 1, + Murmur64Protein = 2, + Murmur64Dayhoff = 3, + Murmur64Hp = 4, +} + +impl From for crate::encodings::HashFunctions { + fn from(v: HashFunctions) -> crate::encodings::HashFunctions { + use crate::encodings::HashFunctions::{ + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + }; + match v { + HashFunctions::Murmur64Dna => Murmur64Dna, + HashFunctions::Murmur64Protein => Murmur64Protein, + HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff, + HashFunctions::Murmur64Hp => Murmur64Hp, + } + } +} + +impl From for HashFunctions { + fn from(v: crate::encodings::HashFunctions) -> HashFunctions { + use crate::encodings::HashFunctions::{ + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + }; + match v { + Murmur64Dna => HashFunctions::Murmur64Dna, + Murmur64Protein => HashFunctions::Murmur64Protein, + Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff, + Murmur64Hp => HashFunctions::Murmur64Hp, + } + } +} diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index 2d37d4c274..5264c8550d 100644 --- 
a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -336,7 +336,6 @@ mod test { use super::*; use crate::index::revindex::prepare_query; - use crate::sketch::minhash::max_hash_for_scaled; use crate::Result; #[test] @@ -416,7 +415,7 @@ mod test { let selection = Selection::builder() .ksize(19) .scaled(100) - .moltype(crate::encodings::HashFunctions::murmur64_protein) + .moltype(crate::encodings::HashFunctions::Murmur64Protein) .build(); let index = RevIndex::from_zipfile( "../../tests/test-data/prot/protein.zip", diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 30f7630c1b..a1f796bb7f 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -445,8 +445,6 @@ mod test { use crate::collection::Collection; use crate::prelude::*; use crate::selection::Selection; - use crate::sketch::minhash::KmerMinHash; - use crate::sketch::Sketch; use crate::Result; use super::{prepare_query, RevIndex, RevIndexOps}; diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index dd05f9005d..b521191806 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -372,10 +372,10 @@ impl Iterator for SeqToHashes { } else { if !self.prot_configured { self.aa_seq = match self.hash_function { - HashFunctions::murmur64_dayhoff => { + HashFunctions::Murmur64Dayhoff => { self.sequence.iter().cloned().map(aa_to_dayhoff).collect() } - HashFunctions::murmur64_hp => { + HashFunctions::Murmur64Hp => { self.sequence.iter().cloned().map(aa_to_hp).collect() } invalid => { diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs index df22dad9d1..ee09caa6e5 100644 --- a/src/core/src/sketch/hyperloglog/mod.rs +++ b/src/core/src/sketch/hyperloglog/mod.rs @@ -184,7 +184,7 @@ impl SigsTrait for HyperLogLog { fn hash_function(&self) -> HashFunctions { //TODO support other hash functions - HashFunctions::murmur64_DNA + 
HashFunctions::Murmur64Dna } fn add_hash(&mut self, hash: HashIntoType) { diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 4f61056853..22fe7159c3 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -41,7 +41,7 @@ pub struct KmerMinHash { num: u32, ksize: u32, - #[builder(setter(into), default = HashFunctions::murmur64_DNA)] + #[builder(setter(into), default = HashFunctions::Murmur64Dna)] hash_function: HashFunctions, #[builder(default = 42u64)] @@ -89,7 +89,7 @@ impl Default for KmerMinHash { KmerMinHash { num: 1000, ksize: 21, - hash_function: HashFunctions::murmur64_DNA, + hash_function: HashFunctions::Murmur64Dna, seed: 42, max_hash: 0, mins: Vec::with_capacity(1000), @@ -148,10 +148,10 @@ impl<'de> Deserialize<'de> for KmerMinHash { let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; let hash_function = match tmpsig.molecule.to_lowercase().as_ref() { - "protein" => HashFunctions::murmur64_protein, - "dayhoff" => HashFunctions::murmur64_dayhoff, - "hp" => HashFunctions::murmur64_hp, - "dna" => HashFunctions::murmur64_DNA, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + "dna" => HashFunctions::Murmur64Dna, _ => unimplemented!(), // TODO: throw error here }; @@ -222,7 +222,7 @@ impl KmerMinHash { } pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein + self.hash_function == HashFunctions::Murmur64Protein } pub fn max_hash(&self) -> u64 { @@ -715,11 +715,11 @@ impl KmerMinHash { } pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff + self.hash_function == HashFunctions::Murmur64Dayhoff } pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp + self.hash_function == HashFunctions::Murmur64Hp } pub fn mins(&self) -> Vec { @@ -943,7 +943,7 @@ pub struct KmerMinHashBTree { num: u32, ksize: u32, - 
#[builder(setter(into), default = HashFunctions::murmur64_DNA)] + #[builder(setter(into), default = HashFunctions::Murmur64Dna)] hash_function: HashFunctions, #[builder(default = 42u64)] @@ -995,7 +995,7 @@ impl Default for KmerMinHashBTree { KmerMinHashBTree { num: 1000, ksize: 21, - hash_function: HashFunctions::murmur64_DNA, + hash_function: HashFunctions::Murmur64Dna, seed: 42, max_hash: 0, mins: Default::default(), @@ -1056,10 +1056,10 @@ impl<'de> Deserialize<'de> for KmerMinHashBTree { let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; let hash_function = match tmpsig.molecule.to_lowercase().as_ref() { - "protein" => HashFunctions::murmur64_protein, - "dayhoff" => HashFunctions::murmur64_dayhoff, - "hp" => HashFunctions::murmur64_hp, - "dna" => HashFunctions::murmur64_DNA, + "protein" => HashFunctions::Murmur64Protein, + "dayhoff" => HashFunctions::Murmur64Dayhoff, + "hp" => HashFunctions::Murmur64Hp, + "dna" => HashFunctions::Murmur64Dna, _ => unimplemented!(), // TODO: throw error here }; @@ -1129,7 +1129,7 @@ impl KmerMinHashBTree { } pub fn is_protein(&self) -> bool { - self.hash_function == HashFunctions::murmur64_protein + self.hash_function == HashFunctions::Murmur64Protein } pub fn max_hash(&self) -> u64 { @@ -1492,11 +1492,11 @@ impl KmerMinHashBTree { } pub fn dayhoff(&self) -> bool { - self.hash_function == HashFunctions::murmur64_dayhoff + self.hash_function == HashFunctions::Murmur64Dayhoff } pub fn hp(&self) -> bool { - self.hash_function == HashFunctions::murmur64_hp + self.hash_function == HashFunctions::Murmur64Hp } pub fn hash_function(&self) -> HashFunctions { diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index bcb3fdb4fa..12477ed0d2 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -18,7 +18,7 @@ const EPSILON: f64 = 0.01; #[test] fn throws_error() { - let mut mh = KmerMinHash::new(0, 4, HashFunctions::murmur64_DNA, 42, false, 1); + let mut mh = KmerMinHash::new(0, 4, 
HashFunctions::Murmur64Dna, 42, false, 1); assert!( mh.add_sequence(b"ATGR", false).is_err(), @@ -28,8 +28,8 @@ fn throws_error() { #[test] fn merge() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, false, 20); - let mut b = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, false, 20); + let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, false, 20); + let mut b = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, false, 20); a.add_sequence(b"TGCCGCCCAGCA", false).unwrap(); b.add_sequence(b"TGCCGCCCAGCA", false).unwrap(); @@ -55,20 +55,20 @@ fn merge() { #[test] fn invalid_dna() { - let mut a = KmerMinHash::new(0, 3, HashFunctions::murmur64_DNA, 42, false, 20); + let mut a = KmerMinHash::new(0, 3, HashFunctions::Murmur64Dna, 42, false, 20); a.add_sequence(b"AAANNCCCTN", true).unwrap(); assert_eq!(a.mins().len(), 3); - let mut b = KmerMinHash::new(0, 3, HashFunctions::murmur64_DNA, 42, false, 20); + let mut b = KmerMinHash::new(0, 3, HashFunctions::Murmur64Dna, 42, false, 20); b.add_sequence(b"NAAA", true).unwrap(); assert_eq!(b.mins().len(), 1); } #[test] fn similarity() -> Result<(), Box> { - let mut a = KmerMinHash::new(0, 20, HashFunctions::murmur64_hp, 42, true, 5); - let mut b = KmerMinHash::new(0, 20, HashFunctions::murmur64_hp, 42, true, 5); + let mut a = KmerMinHash::new(0, 20, HashFunctions::Murmur64Hp, 42, true, 5); + let mut b = KmerMinHash::new(0, 20, HashFunctions::Murmur64Hp, 42, true, 5); a.add_hash(1); b.add_hash(1); @@ -82,8 +82,8 @@ fn similarity() -> Result<(), Box> { #[test] fn similarity_2() -> Result<(), Box> { - let mut a = KmerMinHash::new(0, 5, HashFunctions::murmur64_DNA, 42, true, 5); - let mut b = KmerMinHash::new(0, 5, HashFunctions::murmur64_DNA, 42, true, 5); + let mut a = KmerMinHash::new(0, 5, HashFunctions::Murmur64Dna, 42, true, 5); + let mut b = KmerMinHash::new(0, 5, HashFunctions::Murmur64Dna, 42, true, 5); a.add_sequence(b"ATGGA", false)?; a.add_sequence(b"GGACA", false)?; 
@@ -102,8 +102,8 @@ fn similarity_2() -> Result<(), Box> { #[test] fn similarity_3() -> Result<(), Box> { - let mut a = KmerMinHash::new(0, 20, HashFunctions::murmur64_dayhoff, 42, true, 5); - let mut b = KmerMinHash::new(0, 20, HashFunctions::murmur64_dayhoff, 42, true, 5); + let mut a = KmerMinHash::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, true, 5); + let mut b = KmerMinHash::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, true, 5); a.add_hash(1); a.add_hash(1); @@ -126,8 +126,8 @@ fn similarity_3() -> Result<(), Box> { #[test] fn angular_similarity_requires_abundance() -> Result<(), Box> { - let mut a = KmerMinHash::new(0, 20, HashFunctions::murmur64_dayhoff, 42, false, 5); - let mut b = KmerMinHash::new(0, 20, HashFunctions::murmur64_dayhoff, 42, false, 5); + let mut a = KmerMinHash::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, false, 5); + let mut b = KmerMinHash::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, false, 5); a.add_hash(1); b.add_hash(1); @@ -139,8 +139,8 @@ fn angular_similarity_requires_abundance() -> Result<(), Box Result<(), Box> { - let mut a = KmerMinHashBTree::new(0, 20, HashFunctions::murmur64_dayhoff, 42, false, 5); - let mut b = KmerMinHashBTree::new(0, 20, HashFunctions::murmur64_dayhoff, 42, false, 5); + let mut a = KmerMinHashBTree::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, false, 5); + let mut b = KmerMinHashBTree::new(0, 20, HashFunctions::Murmur64Dayhoff, 42, false, 5); a.add_hash(1); b.add_hash(1); @@ -152,8 +152,8 @@ fn angular_similarity_btree_requires_abundance() -> Result<(), Box = Vec::new(); @@ -769,7 +769,7 @@ fn seq_to_hashes(seq in "ACGTGTAGCTAGACACTGACTGACTGAC") { fn seq_to_hashes_2(seq in "QRMTHINK") { let scaled = 1; - let mut mh = KmerMinHash::new(scaled, 3, HashFunctions::murmur64_protein, 42, true, 0); + let mut mh = KmerMinHash::new(scaled, 3, HashFunctions::Murmur64Protein, 42, true, 0); mh.add_protein(seq.as_bytes())?; // .unwrap(); let mut hashes: Vec = Vec::new(); From 
9bb513c387e31fb46b7c5d15b38ff7e07768f217 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 14:24:15 -0700 Subject: [PATCH 07/19] Initial support for custom hash function --- src/core/src/encodings.rs | 5 ++++- src/core/src/ffi/mod.rs | 3 ++- src/core/src/from.rs | 6 +++--- src/core/src/selection.rs | 2 +- src/core/src/signature.rs | 10 +++++----- src/core/src/sketch/minhash.rs | 22 +++++++++++----------- src/core/src/wasm.rs | 8 ++++---- 7 files changed, 30 insertions(+), 26 deletions(-) diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 752b6d892f..ac69cd58eb 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -22,16 +22,18 @@ pub type Idx = u32; type IdxTracker = (vec_collections::VecSet<[Idx; 8]>, u64); type ColorToIdx = HashMap>; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr( feature = "rkyv", derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) )] +#[non_exhaustive] pub enum HashFunctions { Murmur64Dna, Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, + Custom(String), } impl HashFunctions { @@ -62,6 +64,7 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", + HashFunctions::Custom(v) => v, } ) } diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index 6e28c648cf..6f1dff78e4 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -1,6 +1,6 @@ //! # Foreign Function Interface for calling sourmash from a C API //! -//! Primary client for now is the Python version, using CFFI and milksnake. +//! Primary client for now is the Python version, using CFFI and maturin. 
#![allow(clippy::missing_safety_doc)] #[macro_use] @@ -62,6 +62,7 @@ impl From for HashFunctions { Murmur64Protein => HashFunctions::Murmur64Protein, Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff, Murmur64Hp => HashFunctions::Murmur64Hp, + _ => todo!("Not supported, probably custom"), } } } diff --git a/src/core/src/from.rs b/src/core/src/from.rs index 7847714cfe..dbeeb58a2f 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -17,7 +17,7 @@ impl From for KmerMinHash { let mut new_mh = KmerMinHash::new( 0, values.get(0).unwrap().kmer.len() as u32, - HashFunctions::murmur64_DNA, + HashFunctions::Murmur64Dna, 42, true, values.len() as u32, @@ -51,7 +51,7 @@ mod test { #[test] fn finch_behavior() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20); + let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; @@ -87,7 +87,7 @@ mod test { #[test] fn from_finch() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20); + let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 86d42273e0..cfe871663f 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -104,7 +104,7 @@ impl Selection { } pub fn moltype(&self) -> Option { - self.moltype + self.moltype.clone() } pub fn set_moltype(&mut self, value: HashFunctions) { diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index b521191806..f5cb9a2b4e 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -371,7 +371,7 @@ impl Iterator for SeqToHashes { Some(Ok(hash)) } else { if !self.prot_configured { - self.aa_seq = match self.hash_function { + self.aa_seq = match 
&self.hash_function { HashFunctions::Murmur64Dayhoff => { self.sequence.iter().cloned().map(aa_to_dayhoff).collect() } @@ -584,9 +584,9 @@ impl Signature { } }; - match moltype { + match &moltype { Some(x) => { - if mh.hash_function() == x { + if mh.hash_function() == *x { return true; } } @@ -600,9 +600,9 @@ impl Signature { } }; - match moltype { + match &moltype { Some(x) => { - if mh.hash_function() == x { + if mh.hash_function() == *x { return true; } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 22fe7159c3..36f11a589e 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -74,7 +74,7 @@ impl Clone for KmerMinHash { KmerMinHash { num: self.num, ksize: self.ksize, - hash_function: self.hash_function, + hash_function: self.hash_function.clone(), seed: self.seed, max_hash: self.max_hash, mins: self.mins.clone(), @@ -579,7 +579,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -612,7 +612,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -741,7 +741,7 @@ impl KmerMinHash { let mut new_mh = KmerMinHash::new( scaled, self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -805,7 +805,7 @@ impl SigsTrait for KmerMinHash { } fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } fn add_hash(&mut self, hash: u64) { @@ -979,7 +979,7 @@ impl Clone for KmerMinHashBTree { KmerMinHashBTree { num: self.num, ksize: self.ksize, - hash_function: self.hash_function, + hash_function: self.hash_function.clone(), seed: self.seed, max_hash: self.max_hash, mins: self.mins.clone(), @@ -1372,7 +1372,7 @@ impl KmerMinHashBTree { let mut combined_mh = 
KmerMinHashBTree::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1404,7 +1404,7 @@ impl KmerMinHashBTree { let mut combined_mh = KmerMinHashBTree::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1500,7 +1500,7 @@ impl KmerMinHashBTree { } pub fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } pub fn mins(&self) -> Vec { @@ -1524,7 +1524,7 @@ impl KmerMinHashBTree { let mut new_mh = KmerMinHashBTree::new( scaled, self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1574,7 +1574,7 @@ impl SigsTrait for KmerMinHashBTree { } fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } fn add_hash(&mut self, hash: u64) { diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs index ad656d9955..c2a0eb6c30 100644 --- a/src/core/src/wasm.rs +++ b/src/core/src/wasm.rs @@ -37,13 +37,13 @@ impl KmerMinHash { // TODO: at most one of (prot, dayhoff, hp) should be true let hash_function = if dayhoff { - HashFunctions::murmur64_dayhoff + HashFunctions::Murmur64Dayhoff } else if hp { - HashFunctions::murmur64_hp + HashFunctions::Murmur64Hp } else if is_protein { - HashFunctions::murmur64_protein + HashFunctions::Murmur64Protein } else { - HashFunctions::murmur64_DNA + HashFunctions::Murmur64Dna }; KmerMinHash(_KmerMinHash::new( From 7afebdae4bf907f4f47fa36c44cf4b8ff08d1c21 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 20:36:30 -0700 Subject: [PATCH 08/19] collection: build from_sigs and from_paths in parallel if possible --- src/core/src/collection.rs | 52 +++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index ca340af011..ba2cbb3d6b 100644 --- 
a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -1,6 +1,7 @@ use std::ops::{Deref, DerefMut}; use camino::Utf8Path as Path; +use camino::Utf8PathBuf as PathBuf; use crate::encodings::Idx; use crate::manifest::{Manifest, Record}; @@ -121,16 +122,24 @@ impl Collection { pub fn from_sigs(sigs: Vec) -> Result { let storage = MemStorage::new(); - let mut records = vec![]; - for (i, sig) in sigs.into_iter().enumerate() { - let path = format!("{}", i); - let mut record = Record::from_sig(&sig, &path); - let path = storage.save_sig(&path, sig)?; - record.iter_mut().for_each(|rec| { - rec.set_internal_location(path.clone().into()); - }); - records.extend(record); - } + #[cfg(feature = "parallel")] + let iter = sigs.into_par_iter(); + + #[cfg(not(feature = "parallel"))] + let iter = sigs.into_iter(); + + let records: Vec<_> = iter + .enumerate() + .flat_map(|(i, sig)| { + let path = format!("{}", i); + let mut record = Record::from_sig(&sig, &path); + let path = storage.save_sig(&path, sig).expect("Error saving sig"); + record.iter_mut().for_each(|rec| { + rec.set_internal_location(path.clone().into()); + }); + record + }) + .collect(); Ok(Self { manifest: records.into(), @@ -138,22 +147,25 @@ impl Collection { }) } - pub fn from_paths>(paths: &[P]) -> Result { + pub fn from_paths(paths: &[PathBuf]) -> Result { // TODO: - // - Build manifest from paths - // - Might need to load the data? - // - Use FSStorage (figure out if there is a common path between sigs?) - let records: Vec = paths - .iter() + // - figure out if there is a common path between sigs for FSStorage? 
+ + #[cfg(feature = "parallel")] + let iter = paths.par_iter(); + + #[cfg(not(feature = "parallel"))] + let iter = paths.iter(); + + let records: Vec = iter .flat_map(|p| { - let recs: Vec = Signature::from_path(p.as_ref()) - .unwrap_or_else(|_| panic!("Error processing {:?}", p.as_ref())) + let recs: Vec = Signature::from_path(p) + .unwrap_or_else(|_| panic!("Error processing {:?}", p)) .into_iter() - .flat_map(|v| Record::from_sig(&v, p.as_ref().as_str())) + .flat_map(|v| Record::from_sig(&v, p.as_str())) .collect(); recs }) - //.map(|p| self.collection().storage.load_sig(p.as_str())?.into()) .collect(); Ok(Self { From 0799adc386e316e325cfc2f47c9367908af7cf2a Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 20 Sep 2023 09:04:22 -0700 Subject: [PATCH 09/19] save collection on update... --- src/core/src/index/revindex/disk_revindex.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index d63abf58cd..5080ca7ce1 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -396,6 +396,8 @@ impl RevIndexOps for RevIndex { self.map_hashes_colors(dataset_id as Idx); }); + self.save_collection().expect("Error saving collection"); + info!("Compact SSTs"); self.compact(); From feaafb1ed4cdc455797b3db25c26a4e692494753 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 26 Sep 2023 18:31:30 -0700 Subject: [PATCH 10/19] clean up check --- Cargo.lock | 30 ----------- src/core/Cargo.toml | 3 -- src/core/src/index/revindex/disk_revindex.rs | 9 ++-- src/core/src/index/revindex/mod.rs | 53 ++++++++++---------- 4 files changed, 31 insertions(+), 64 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 122923d2e7..d21cb3d52b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,12 +52,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" -[[package]] -name = "assert_matches" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" - [[package]] name = "autocfg" version = "1.1.0" @@ -1019,15 +1013,6 @@ dependencies = [ "libc", ] -[[package]] -name = "numsep" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c49c3e12c314efb1f43cba136031b657dcd59ee26936ab2be313c5e97da22" -dependencies = [ - "slicestring", -] - [[package]] name = "once_cell" version = "1.18.0" @@ -1486,18 +1471,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" -[[package]] -name = "size" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fed904c7fb2856d868b92464fc8fa597fce366edea1a9cbfaa8cb5fe080bd6d" - -[[package]] -name = "slicestring" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "636b979c5672ac7c2a1120ca0a9a6074cd090dadfec42af6f8a5baea1223d180" - [[package]] name = "smallvec" version = "1.8.0" @@ -1514,7 +1487,6 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" name = "sourmash" version = "0.12.0" dependencies = [ - "assert_matches", "az", "bytecount", "byteorder", @@ -1538,7 +1510,6 @@ dependencies = [ "niffler", "nohash-hasher", "num-iter", - "numsep", "once_cell", "ouroboros", "piz", @@ -1551,7 +1522,6 @@ dependencies = [ "rocksdb", "serde", "serde_json", - "size", "tempfile", "thiserror", "twox-hash", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 73e42057e2..d2b07848bb 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -46,7 +46,6 @@ murmurhash3 = "0.0.5" niffler = { version = "2.3.1", default-features = false, features = [ "gz" ] } 
nohash-hasher = "0.2.0" num-iter = "0.1.43" -numsep = "0.1.12" once_cell = "1.18.0" ouroboros = "0.18.0" piz = "0.5.0" @@ -56,14 +55,12 @@ rkyv = { version = "0.7.39", optional = true } roaring = "0.10.0" serde = { version = "1.0.168", features = ["derive"] } serde_json = "1.0.107" -size = "0.4.0" thiserror = "1.0" twox-hash = "1.6.0" typed-builder = "0.14.0" vec-collections = "0.4.3" [dev-dependencies] -assert_matches = "1.3.0" criterion = "0.5.1" needletail = { version = "0.5.1", default-features = false } proptest = { version = "1.2.0", default-features = false, features = ["std"]} diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 5080ca7ce1..5ab4a5c321 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -10,10 +10,9 @@ use rocksdb::{ColumnFamilyDescriptor, MergeOperands, Options}; use crate::collection::{Collection, CollectionSet}; use crate::encodings::{Color, Idx}; -use crate::index::revindex::prepare_query; use crate::index::revindex::{ - self as module, stats_for_cf, Datasets, HashToColor, QueryColors, RevIndexOps, DB, HASHES, - MANIFEST, METADATA, STORAGE_SPEC, + self as module, prepare_query, stats_for_cf, Datasets, DbStats, HashToColor, QueryColors, + RevIndexOps, DB, HASHES, MANIFEST, METADATA, STORAGE_SPEC, }; use crate::index::{GatherResult, SigCounter}; use crate::manifest::Manifest; @@ -409,8 +408,8 @@ impl RevIndexOps for RevIndex { Ok(module::RevIndex::Plain(self)) } - fn check(&self, quick: bool) { - stats_for_cf(self.db.clone(), HASHES, true, quick); + fn check(&self, quick: bool) -> DbStats { + stats_for_cf(self.db.clone(), HASHES, true, quick) } fn compact(&self) { diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index a1f796bb7f..36245c604d 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use 
byteorder::{LittleEndian, WriteBytesExt}; use enum_dispatch::enum_dispatch; +use getset::{Getters, Setters}; use nohash_hasher::BuildNoHashHasher; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -67,7 +68,7 @@ pub trait RevIndexOps { fn convert(&self, output_db: RevIndex) -> Result<()>; - fn check(&self, quick: bool); + fn check(&self, quick: bool) -> DbStats; fn gather( &self, @@ -381,11 +382,27 @@ impl Datasets { */ } -fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { +#[derive(Getters, Setters, Debug)] +pub struct DbStats { + #[getset(get = "pub")] + total_datasets: usize, + + #[getset(get = "pub")] + total_keys: usize, + + #[getset(get = "pub")] + kcount: usize, + + #[getset(get = "pub")] + vcount: usize, + + #[getset(get = "pub")] + vcounts: histogram::Histogram, +} + +fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) -> DbStats { use byteorder::ReadBytesExt; use histogram::Histogram; - use log::info; - use numsep::{separate, Locale}; let cf = db.cf_handle(cf_name).unwrap(); @@ -411,28 +428,12 @@ fn stats_for_cf(db: Arc, cf_name: &str, deep_check: bool, quick: bool) { //println!("Saw {} {:?}", k, value); } - info!("*** {} ***", cf_name); - use size::Size; - let ksize = Size::from_bytes(kcount); - let vsize = Size::from_bytes(vcount); - if !quick && cf_name == COLORS { - info!( - "total datasets: {}", - separate(datasets.len(), Locale::English) - ); - } - info!("total keys: {}", separate(kcount / 8, Locale::English)); - - info!("k: {}", ksize.to_string()); - info!("v: {}", vsize.to_string()); - - if !quick && kcount > 0 && deep_check { - info!("max v: {}", vcounts.maximum().unwrap()); - info!("mean v: {}", vcounts.mean().unwrap()); - info!("stddev: {}", vcounts.stddev().unwrap()); - info!("median v: {}", vcounts.percentile(50.0).unwrap()); - info!("p25 v: {}", vcounts.percentile(25.0).unwrap()); - info!("p75 v: {}", vcounts.percentile(75.0).unwrap()); + DbStats { + total_datasets: datasets.len(), + 
total_keys: kcount / 8, + kcount, + vcount, + vcounts, } } From 5e9e3d47d67bd545d761b56fdadbec1a10d3a8e1 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 29 Sep 2023 18:01:13 -0700 Subject: [PATCH 11/19] derive clone for selection --- src/core/src/selection.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index cfe871663f..1add597173 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -5,7 +5,7 @@ use crate::encodings::HashFunctions; use crate::manifest::Record; use crate::Result; -#[derive(Default, Debug, TypedBuilder)] +#[derive(Default, Debug, TypedBuilder, Clone)] pub struct Selection { #[builder(default, setter(strip_option))] ksize: Option, From 3a13ab261a254a51ea41fcbdd41f727ed6600b8a Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 30 Sep 2023 11:30:15 -0700 Subject: [PATCH 12/19] Build manifest from paths in parallel --- src/core/src/collection.rs | 19 +--------- src/core/src/manifest.rs | 71 ++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index ba2cbb3d6b..c00b2fd288 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -151,25 +151,8 @@ impl Collection { // TODO: // - figure out if there is a common path between sigs for FSStorage? 
- #[cfg(feature = "parallel")] - let iter = paths.par_iter(); - - #[cfg(not(feature = "parallel"))] - let iter = paths.iter(); - - let records: Vec = iter - .flat_map(|p| { - let recs: Vec = Signature::from_path(p) - .unwrap_or_else(|_| panic!("Error processing {:?}", p)) - .into_iter() - .flat_map(|v| Record::from_sig(&v, p.as_str())) - .collect(); - recs - }) - .collect(); - Ok(Self { - manifest: records.into(), + manifest: paths.into(), storage: InnerStorage::new( FSStorage::builder() .fullpath("".into()) diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index de3075faa9..5bad8ec81b 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -4,6 +4,8 @@ use std::ops::Deref; use camino::Utf8PathBuf as PathBuf; use getset::{CopyGetters, Getters, Setters}; +#[cfg(feature = "parallel")] +use rayon::prelude::*; use serde::de; use serde::{Deserialize, Serialize}; @@ -18,27 +20,28 @@ pub struct Record { #[getset(get = "pub", set = "pub")] internal_location: PathBuf, + #[getset(get = "pub", set = "pub")] + md5: String, + + md5short: String, + #[getset(get = "pub", set = "pub")] ksize: u32, + moltype: String, + + num: u32, + scaled: u64, + n_hashes: usize, + #[getset(get = "pub", set = "pub")] #[serde(deserialize_with = "to_bool")] with_abundance: bool, - #[getset(get = "pub", set = "pub")] - md5: String, - #[getset(get = "pub", set = "pub")] name: String, - moltype: String, - /* - md5short: String, - num: String, - scaled: String, - n_hashes: String, filename: String, - */ } fn to_bool<'de, D>(deserializer: D) -> std::result::Result @@ -67,29 +70,42 @@ impl Record { pub fn from_sig(sig: &Signature, path: &str) -> Vec { sig.iter() .map(|sketch| { - let (ksize, md5, with_abundance, moltype) = match sketch { + let (ksize, md5, with_abundance, moltype, n_hashes, num, scaled) = match sketch { Sketch::MinHash(mh) => ( mh.ksize() as u32, mh.md5sum(), mh.track_abundance(), mh.hash_function(), + mh.size(), + mh.num(), + mh.scaled(), ), 
Sketch::LargeMinHash(mh) => ( mh.ksize() as u32, mh.md5sum(), mh.track_abundance(), mh.hash_function(), + mh.size(), + mh.num(), + mh.scaled(), ), _ => unimplemented!(), }; + let md5short = md5[0..8].into(); + Self { internal_location: path.into(), moltype: moltype.to_string(), name: sig.name(), ksize, md5, + md5short, with_abundance, + filename: sig.filename(), + n_hashes, + num, + scaled, } }) .collect() @@ -232,20 +248,25 @@ impl From> for Manifest { } impl From<&[PathBuf]> for Manifest { - fn from(v: &[PathBuf]) -> Self { - Manifest { - records: v - .iter() - .map(|p| Record { - internal_location: p.clone(), - ksize: 0, // FIXME - with_abundance: false, // FIXME - md5: "".into(), // FIXME - name: "".into(), // FIXME - moltype: "".into(), // FIXME - }) - .collect(), - } + fn from(paths: &[PathBuf]) -> Self { + #[cfg(feature = "parallel")] + let iter = paths.par_iter(); + + #[cfg(not(feature = "parallel"))] + let iter = paths.iter(); + + let records: Vec = iter + .flat_map(|p| { + let recs: Vec = Signature::from_path(p) + .unwrap_or_else(|_| panic!("Error processing {:?}", p)) + .into_iter() + .flat_map(|v| Record::from_sig(&v, p.as_str())) + .collect(); + recs + }) + .collect(); + + Manifest { records } } } From 8464094c9b8fc7c306171b91388c9b3370dba6a0 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Mon, 9 Oct 2023 16:14:35 -0700 Subject: [PATCH 13/19] Deps: chrono version pin (#2798) To install sourmash from mastiff branch in pyo3-branchwater, I need to bump chrono back to 0.4.28 (from 0.4.31) to avoid incompatibilities with `piz`. 
--- pyproject.toml | 2 +- src/core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ecdfd87beb..16bf15d207 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "maturin>=1,<2", + "maturin>=1,<1.3.0", "cffi", ] build-backend = 'maturin' diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 05896dffb3..488d629b7c 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -97,7 +97,7 @@ version = "0.3.64" features = ["console", "File"] [target.'cfg(all(target_arch = "wasm32"))'.dependencies.chrono] -version = "0.4.31" +version = "0.4.28" features = ["wasmbind"] [target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dev-dependencies] From 2a0a6190103ac1b6b547b762622b66467e10893f Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 9 Oct 2023 16:15:23 -0700 Subject: [PATCH 14/19] save version in DB --- src/core/src/index/revindex/disk_revindex.rs | 12 +++++++++++- src/core/src/index/revindex/mod.rs | 9 +++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 5ab4a5c321..05efad9ecb 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -12,7 +12,7 @@ use crate::collection::{Collection, CollectionSet}; use crate::encodings::{Color, Idx}; use crate::index::revindex::{ self as module, prepare_query, stats_for_cf, Datasets, DbStats, HashToColor, QueryColors, - RevIndexOps, DB, HASHES, MANIFEST, METADATA, STORAGE_SPEC, + RevIndexOps, DB, HASHES, MANIFEST, METADATA, STORAGE_SPEC, VERSION, }; use crate::index::{GatherResult, SigCounter}; use crate::manifest::Manifest; @@ -23,6 +23,8 @@ use crate::sketch::Sketch; use crate::storage::{InnerStorage, Storage}; use crate::Result; +const DB_VERSION: u8 = 1; + fn compute_color(idxs: &Datasets) -> Color { let s = 
BuildHasherDefault::::default(); let mut hasher = s.build_hasher(); @@ -126,6 +128,9 @@ impl RevIndex { fn load_collection_from_rocksdb(db: Arc) -> Result { let cf_metadata = db.cf_handle(METADATA).unwrap(); + let rdr = db.get_cf(&cf_metadata, VERSION)?.unwrap(); + assert_eq!(rdr[0], DB_VERSION); + let rdr = db.get_cf(&cf_metadata, MANIFEST)?.unwrap(); let manifest = Manifest::from_reader(&rdr[..])?; @@ -144,6 +149,11 @@ impl RevIndex { fn save_collection(&self) -> Result<()> { let cf_metadata = self.db.cf_handle(METADATA).unwrap(); + // save DB version + // TODO: probably should go together with a more general + // saving procedure used in create/update + self.db.put_cf(&cf_metadata, VERSION, &[DB_VERSION])?; + // write manifest let mut wtr = vec![]; { diff --git a/src/core/src/index/revindex/mod.rs b/src/core/src/index/revindex/mod.rs index 36245c604d..0765ee71d9 100644 --- a/src/core/src/index/revindex/mod.rs +++ b/src/core/src/index/revindex/mod.rs @@ -30,11 +30,15 @@ type HashToColorT = HashMap #[derive(Serialize, Deserialize)] pub struct HashToColor(HashToColorT); +// Column families const HASHES: &str = "hashes"; const COLORS: &str = "colors"; const METADATA: &str = "metadata"; + +// DB metadata saved in the METADATA column family const MANIFEST: &str = "manifest"; const STORAGE_SPEC: &str = "storage_spec"; +const VERSION: &str = "version"; #[enum_dispatch(RevIndexOps)] pub enum RevIndex { @@ -205,11 +209,12 @@ impl RevIndex { block_opts.set_block_size(16 * 1024); block_opts.set_cache_index_and_filter_blocks(true); block_opts.set_pin_l0_filter_and_index_blocks_in_cache(true); - block_opts.set_format_version(5); + block_opts.set_format_version(6); opts.set_block_based_table_factory(&block_opts); // End of updated defaults - opts.increase_parallelism(8); + opts.increase_parallelism(rayon::current_num_threads() as i32); + //opts.max_background_jobs = 6; // opts.optimize_level_style_compaction(); // opts.optimize_universal_style_compaction(); From 
f07bc345ebaf6a3257dbe9359b98579725885523 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 21 Nov 2023 17:24:05 -0800 Subject: [PATCH 15/19] Use byteorder 1.4.3 for pyo3_branchwater compat --- Cargo.lock | 4 ++-- src/core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0107d7111e..085b9b938f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -163,9 +163,9 @@ checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" [[package]] name = "byteorder" -version = "1.5.0" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bzip2" diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 0c6623f697..dc50b6beb6 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -28,7 +28,7 @@ default = [] [dependencies] az = "1.0.0" bytecount = "0.6.7" -byteorder = "1.5.0" +byteorder = "1.4.3" camino = { version = "1.1.6", features = ["serde1"] } cfg-if = "1.0" counter = "0.5.7" From cfe28341af5847235bbf853424ee4917995665ee Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 21 Nov 2023 17:29:00 -0800 Subject: [PATCH 16/19] No dep on bytecount --- Cargo.lock | 1 - src/core/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 085b9b938f..bd78b7b421 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1462,7 +1462,6 @@ name = "sourmash" version = "0.12.0" dependencies = [ "az", - "bytecount", "byteorder", "camino", "cfg-if", diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index dc50b6beb6..cb5f499cab 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -27,7 +27,6 @@ default = [] [dependencies] az = "1.0.0" -bytecount = "0.6.7" byteorder = "1.4.3" camino = { version = "1.1.6", features = ["serde1"] } cfg-if = "1.0" From 
be6b03832c6be3b3dca152b850a12247910128f2 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 22 Nov 2023 11:27:13 -0800 Subject: [PATCH 17/19] Add a test for innerstorage save_sig --- src/core/tests/storage.rs | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index d41053c160..1f15c5aff0 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -1,6 +1,9 @@ use std::path::PathBuf; -use sourmash::storage::{Storage, ZipStorage}; +use tempfile::TempDir; + +use sourmash::signature::Signature; +use sourmash::storage::{FSStorage, InnerStorage, Storage, ZipStorage}; #[test] fn zipstorage_load_file() -> Result<(), Box> { @@ -47,7 +50,7 @@ fn zipstorage_list_sbts() -> Result<(), Box> { #[test] fn zipstorage_parallel_access() -> Result<(), Box> { use rayon::prelude::*; - use sourmash::signature::{Signature, SigsTrait}; + use sourmash::signature::SigsTrait; let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("../../tests/test-data/v6.sbt.zip"); @@ -77,3 +80,26 @@ fn zipstorage_parallel_access() -> Result<(), Box> { Ok(()) } + +#[test] +fn innerstorage_save_sig() -> Result<(), Box> { + let output = TempDir::new()?; + + let fst = FSStorage::new(output.path().as_os_str().to_str().unwrap(), "".into()); + + let instorage = InnerStorage::new(fst); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/genome-s10.fa.gz.sig"); + + let sig = Signature::from_path(filename)?.swap_remove(0); + let new_path = instorage.save_sig("test", sig.clone())?; + dbg!(new_path); + + let loaded_sig = instorage.load_sig("test")?; + + assert_eq!(sig.name(), loaded_sig.name()); + assert_eq!(sig.md5sum(), loaded_sig.md5sum()); + + Ok(()) +} From d4ddd8a9f65e195d1f11811123e899225c9fe1fa Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 23 Nov 2023 15:28:54 -0800 Subject: [PATCH 18/19] more storage tests --- 
src/core/tests/storage.rs | 68 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 1f15c5aff0..e0d355d6b0 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use tempfile::TempDir; use sourmash::signature::Signature; -use sourmash::storage::{FSStorage, InnerStorage, Storage, ZipStorage}; +use sourmash::storage::{FSStorage, InnerStorage, Storage, StorageArgs, ZipStorage}; #[test] fn zipstorage_load_file() -> Result<(), Box> { @@ -85,7 +85,7 @@ fn zipstorage_parallel_access() -> Result<(), Box> { fn innerstorage_save_sig() -> Result<(), Box> { let output = TempDir::new()?; - let fst = FSStorage::new(output.path().as_os_str().to_str().unwrap(), "".into()); + let fst = FSStorage::new("".into(), output.path().as_os_str().to_str().unwrap()); let instorage = InnerStorage::new(fst); @@ -103,3 +103,67 @@ fn innerstorage_save_sig() -> Result<(), Box> { Ok(()) } + +#[test] +fn innerstorage_load() -> Result<(), Box> { + let output = TempDir::new()?; + + let fst = FSStorage::new("".into(), output.path().as_os_str().to_str().unwrap()); + + let instorage = InnerStorage::new(fst); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/genome-s10.fa.gz.sig"); + + let sig = Signature::from_path(filename)?.swap_remove(0); + let new_path = instorage.save_sig("test", sig.clone())?; + dbg!(new_path); + + let raw_data = instorage.load("test")?; + let loaded_sig = Signature::from_reader(raw_data.as_slice())?.swap_remove(0); + + assert_eq!(sig.name(), loaded_sig.name()); + assert_eq!(sig.md5sum(), loaded_sig.md5sum()); + + Ok(()) +} + +#[test] +fn innerstorage_args() -> Result<(), Box> { + let output = TempDir::new()?; + let path = output.path().as_os_str().to_str().unwrap(); + + let fst = FSStorage::new("".into(), path); + + let instorage = InnerStorage::new(fst); + + let 
args = instorage.args(); + + assert!(matches!(args, StorageArgs::FSStorage { .. })); + let StorageArgs::FSStorage { path: p } = args; + assert_eq!(p, path); + + Ok(()) +} + +#[test] +fn innerstorage_from_args() -> Result<(), Box> { + let output = TempDir::new()?; + let path = output.path().as_os_str().to_str().unwrap(); + + let fst = FSStorage::new("".into(), path); + let args = fst.args(); + + let instorage = InnerStorage::new(FSStorage::from(&args)); + let inargs = instorage.args(); + + assert!(matches!(inargs, StorageArgs::FSStorage { .. })); + let StorageArgs::FSStorage { path: p1 } = inargs; + assert_eq!(p1, path); + + assert!(matches!(args, StorageArgs::FSStorage { .. })); + let StorageArgs::FSStorage { path: p2 } = args; + assert_eq!(p2, path); + + Ok(()) +} From 8e2eb517a8a540810e16ea84b06abe76a171fd0a Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 25 Nov 2023 18:10:40 -0800 Subject: [PATCH 19/19] Update rust changelog --- src/core/CHANGELOG.md | 74 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/core/CHANGELOG.md b/src/core/CHANGELOG.md index 3915a62086..d9807e8ebf 100644 --- a/src/core/CHANGELOG.md +++ b/src/core/CHANGELOG.md @@ -5,11 +5,81 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## [0.12.0] - 2023-11-26 + +MSRV: 1.64 Added: -- An inverted index, codename Greyhound (#1238) +- Initial implementation for `Manifest`, `Selection`, and `Picklist` following + the Python API. (#2230) +- `Collection` is a new abstraction for working with a set of signatures. A + collection needs a `Storage` for holding the signatures (on-disk, in-memory, + or remotely), and a `Manifest` to describe the metadata for each signature. (#2230) +- Expose CSV parsing and RocksDB errors. 
(#2230) +- New module `sourmash::index::revindex::disk_revindex` with the on-disk + RevIndex implementation based on RocksDB. (#2230) +- Add `iter` and `iter_mut` methods for `Signature`. (#2230) +- Add `load_sig` and `save_sig` methods to `Storage` trait for higher-level data + manipulation and caching. (#2230) +- Add `spec` method to `Storage` to allow constructing a concrete `Storage` from + a string description. (#2230) +- Add `InnerStorage` for synchronizing parallel access to `Storage` + implementations. (#2230) +- Add `MemStorage` for keeping signatures in-memory (mostly for debugging and + testing). (#2230) +- Add new `branchwater` feature (enabled by default), which can be disabled by + downstream projects to limit bringing heavy dependencies like rocksdb. (#2230) +- Add new `rkyv` feature (disabled by default), making `MinHash` serializable + with the `rkyv` crate. (#2230) +- Add semver checks for CI (so we bump versions accordingly, or avoid breaking + changes). (#2230) +- Add cargo deny config. (#2724) +- Benchmarks for seq_to_hashes in protein mode. (#1944) +- Oxidize ZipStorage. (#1909) +- Move greyhound-core into sourmash. (#1238) +- Add `MinHash.kmers_and_hashes(...)` and `sourmash sig kmers`. (#1695) +- Produce list of hashes from a sequence. (#1653) + +Changed: + +- Rename `HashFunctions` variants to follow camel-case, so `Murmur64Protein` + instead of `murmur64_protein`. (#2230) +- `LinearIndex` is now implemented as a thin layer on top of `Collection`. (#2230) +- Move `GatherResult` to `sourmash::index` module. (#2230) +- Move `sourmash::index::revindex` to `sourmash::index::mem_revindex` (this is + the Greyhound version of revindex, in-memory only). It was also refactored + internally to build a version of a `LinearIndex` that will be merged in the + future with `sourmash::index::LinearIndex`. (#2230) +- Move `select` method from `Index` trait into a separate `Select` trait, + and implement it for `Signature` based on the new `Selection` API. 
(#2230) +- Move `SigStore` into `sourmash::storage` module, and remove the generic. Now + it always stores `Signature`. Also implement `Select` for it. (#2230) +- Disable `musllinux` wheels (need to figure out how to build rocksdb for it). (#2230) +- Reorganize traits for easier wasm and native compilation. (#1836) +- Adjust dayhoff and hp encodings to tolerate stop codons in the protein sequence. (#1673) + +Fixed: + +- Reduce feature combinations on Rust checks (takes much less time to run). (#2230) +- Build: MSRV check for 1.64. (#2680) +- maturin: move deprecated definition from Cargo.toml to pyproject.toml. (#2597) +- Fix broken crates.io badge. (#2556) +- Fix unnecessary typecasts in Rust. (#2366) +- Fix `Signature.minhash` API during `sourmash sketch`. (#2329) +- Return Err for angular_similarity when abundance tracking is off. (#2327) +- Update various descriptions to talk about k-mers, not just DNA. (#2137) +- Fix downsample_scaled in `core`. (#2108) +- Speed up `SeqToHashes` `translate`. (#1946) +- Speed up `SeqToHashes()`. (#1938) +- Fix containment calculation for nodegraphs. (#1862) +- Fix panic bug in `sourmash sketch` dna with bad input and `--check-sequence`. (#1702) +- Fix Rust panic in `MinHash.seq_to_hashes`. (#1701) +- Beta lints. (#2841 #2630 #2596 #2298 #1791 #1786 #1760) + +Removed: + +- Remove BIGSI and SBT code. (#2732) ## [0.11.0] - 2021-07-07