Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] compute-optimized MinHash (for small scaled or large cardinalities) #1045

Merged
merged 14 commits into from
Jun 26, 2020
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: clippy
args: -- -D warnings
args: --all -- -D warnings

wasm-pack:
name: Check if wasm-pack builds a valid package for the sourmash crate
Expand Down
1 change: 1 addition & 0 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ parallel = ["rayon"]
#cbindgen = "~0.14.2"

[dependencies]
backtrace = "=0.3.46" # later versions require rust 1.40
byteorder = "1.3.4"
cfg-if = "0.1.10"
failure = "0.1.8"
Expand Down
26 changes: 13 additions & 13 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use failure::Error;

use crate::index::MHBT;
use crate::signature::Signature;
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHash};
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHashBTree};
use crate::sketch::Sketch;

pub fn prepare(index_path: &str) -> Result<(), Error> {
Expand Down Expand Up @@ -101,15 +101,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
let mut ksigs = vec![];

if params.protein {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_protein)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -118,15 +118,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.dayhoff {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_dayhoff)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -135,15 +135,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.hp {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_hp)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand All @@ -152,15 +152,15 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
}

if params.dna {
ksigs.push(Sketch::MinHash(
KmerMinHash::builder()
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::murmur64_DNA)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(vec![])
Some(Default::default())
} else {
None
})
Expand Down
5 changes: 1 addition & 4 deletions src/core/src/index/sbt/mhbt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ mod test {
use std::path::PathBuf;

use assert_matches::assert_matches;
use tempfile;

use super::Factory;

Expand Down Expand Up @@ -206,9 +205,7 @@ mod test {
None,
)
.unwrap();
let sig_data = sigs[0].clone();

let leaf = sig_data.into();
let leaf = sigs[0].clone();

let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap();
assert_eq!(results.len(), 1);
Expand Down
26 changes: 26 additions & 0 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,23 @@ impl SigsTrait for Sketch {
match *self {
Sketch::UKHS(ref ukhs) => ukhs.size(),
Sketch::MinHash(ref mh) => mh.size(),
Sketch::LargeMinHash(ref mh) => mh.size(),
}
}

/// Forwards to the `to_vec` method of whichever sketch this variant wraps
/// and returns its `Vec<u64>` unchanged.
fn to_vec(&self) -> Vec<u64> {
    match self {
        Sketch::UKHS(inner) => inner.to_vec(),
        Sketch::MinHash(inner) => inner.to_vec(),
        Sketch::LargeMinHash(inner) => inner.to_vec(),
    }
}

/// Forwards to the `ksize` method of whichever sketch this variant wraps.
fn ksize(&self) -> usize {
    match self {
        Sketch::UKHS(inner) => inner.ksize(),
        Sketch::MinHash(inner) => inner.ksize(),
        Sketch::LargeMinHash(inner) => inner.ksize(),
    }
}

Expand All @@ -61,19 +64,25 @@ impl SigsTrait for Sketch {
Sketch::MinHash(ref ot) => mh.check_compatible(ot),
_ => Err(SourmashError::MismatchSignatureType.into()),
},
Sketch::LargeMinHash(ref mh) => match other {
Sketch::LargeMinHash(ref ot) => mh.check_compatible(ot),
_ => Err(SourmashError::MismatchSignatureType.into()),
},
}
}

/// Forwards `seq` and `force` to the wrapped sketch's `add_sequence`.
///
/// # Panics
///
/// Panics via `unimplemented!()` when called on a `Sketch::UKHS`.
fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> {
    match self {
        Sketch::MinHash(inner) => inner.add_sequence(seq, force),
        Sketch::LargeMinHash(inner) => inner.add_sequence(seq, force),
        Sketch::UKHS(_) => unimplemented!(),
    }
}

/// Forwards `seq` to the wrapped sketch's `add_protein`.
///
/// # Panics
///
/// Panics via `unimplemented!()` when called on a `Sketch::UKHS`.
fn add_protein(&mut self, seq: &[u8]) -> Result<(), Error> {
    match self {
        Sketch::MinHash(inner) => inner.add_protein(seq),
        Sketch::LargeMinHash(inner) => inner.add_protein(seq),
        Sketch::UKHS(_) => unimplemented!(),
    }
}
Expand Down Expand Up @@ -183,6 +192,7 @@ impl Signature {
if self.signatures.len() == 1 {
match &self.signatures[0] {
Sketch::MinHash(mh) => mh.md5sum(),
Sketch::LargeMinHash(mh) => mh.md5sum(),
Sketch::UKHS(hs) => hs.md5sum(),
}
} else {
Expand Down Expand Up @@ -267,6 +277,22 @@ impl Signature {
None => return true, // TODO: match previous behavior
};
}
Sketch::LargeMinHash(mh) => {
if let Some(k) = ksize {
if k != mh.ksize() as usize {
return false;
}
};

match moltype {
Some(x) => {
if mh.hash_function() == x {
return true;
}
}
None => return true, // TODO: match previous behavior
};
}
Sketch::UKHS(hs) => {
if let Some(k) = ksize {
if k != hs.ksize() as usize {
Expand Down
Loading