Skip to content

Commit

Permalink
MRG: Add ANI output to pairwise, multisearch (#236)
Browse files Browse the repository at this point in the history
Adds ANI columns to pairwise and multisearch output, building off of @mr-eyes's ANI translations (#188) which got streamlined + added into sourmash core in sourmash-bio/sourmash#2943

Split from #234 to make things more concise/simpler.

## benchmark summary

## 12k ICTV viral genomes, scaled=200

79.8m comparisons

| version | experiment | time |
| -------- | -------- | -------- |
| PR | no ANI     | 21s     | 
| PR | with ANI     | 20s     |
| v0.9.0 | no ANI | 21s |


## 12k ICTV viral genomes, scaled=10

79.8m comparisons

| version | experiment | time |
| -------- | -------- | -------- |
| PR | no ANI     | 14m 0s     | 
| PR | with ANI     | 14m 6s     | 
| main branch (~v0.9.0) | no ANI | 14m 47s |
  • Loading branch information
bluegenes committed Feb 26, 2024
1 parent 43caeba commit 6bc49df
Show file tree
Hide file tree
Showing 7 changed files with 475 additions and 157 deletions.
8 changes: 6 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ fn do_multisearch(
ksize: u8,
scaled: usize,
moltype: String,
estimate_ani: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
Expand All @@ -217,8 +218,9 @@ fn do_multisearch(
siglist_path,
threshold,
&selection,
output_path,
allow_failed_sigpaths,
estimate_ani,
output_path,
) {
Ok(_) => Ok(0),
Err(e) => {
Expand All @@ -235,6 +237,7 @@ fn do_pairwise(
ksize: u8,
scaled: usize,
moltype: String,
estimate_ani: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
Expand All @@ -243,8 +246,9 @@ fn do_pairwise(
siglist_path,
threshold,
&selection,
output_path,
allow_failed_sigpaths,
estimate_ani,
output_path,
) {
Ok(_) => Ok(0),
Err(e) => {
Expand Down
31 changes: 27 additions & 4 deletions src/multisearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::sync::atomic::AtomicUsize;
use crate::utils::{
csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType,
};
use sourmash::ani_utils::ani_from_containment;

/// Search many queries against a list of signatures.
///
Expand All @@ -20,8 +21,9 @@ pub fn multisearch(
against_filepath: String,
threshold: f64,
selection: &Selection,
output: Option<String>,
allow_failed_sigpaths: bool,
estimate_ani: bool,
output: Option<String>,
) -> Result<(), Box<dyn std::error::Error>> {
// Load all queries into memory at once.

Expand Down Expand Up @@ -56,6 +58,7 @@ pub fn multisearch(
//

let processed_cmp = AtomicUsize::new(0);
let ksize = selection.ksize().unwrap() as f64;

let send = against
.par_iter()
Expand All @@ -74,11 +77,27 @@ pub fn multisearch(
let target_size = against.minhash.size() as f64;

let containment_query_in_target = overlap / query_size;
let containment_in_target = overlap / target_size;
let max_containment = containment_query_in_target.max(containment_in_target);
let jaccard = overlap / (target_size + query_size - overlap);

if containment_query_in_target > threshold {
let containment_target_in_query = overlap / target_size;
let max_containment =
containment_query_in_target.max(containment_target_in_query);
let jaccard = overlap / (target_size + query_size - overlap);
let mut query_ani = None;
let mut match_ani = None;
let mut average_containment_ani = None;
let mut max_containment_ani = None;

// estimate ANI values
if estimate_ani {
let qani = ani_from_containment(containment_query_in_target, ksize) * 100.0;
let mani = ani_from_containment(containment_target_in_query, ksize) * 100.0;
query_ani = Some(qani);
match_ani = Some(mani);
average_containment_ani = Some((qani + mani) / 2.);
max_containment_ani = Some(f64::max(qani, mani));
}

results.push(MultiSearchResult {
query_name: query.name.clone(),
query_md5: query.md5sum.clone(),
Expand All @@ -88,6 +107,10 @@ pub fn multisearch(
max_containment,
jaccard,
intersect_hashes: overlap,
query_ani,
match_ani,
average_containment_ani,
max_containment_ani,
})
}
}
Expand Down
27 changes: 24 additions & 3 deletions src/pairwise.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::sync::atomic::AtomicUsize;
use crate::utils::{
csvwriter_thread, load_collection, load_sketches, MultiSearchResult, ReportType,
};
use sourmash::ani_utils::ani_from_containment;
use sourmash::selection::Selection;
use sourmash::signature::SigsTrait;

Expand All @@ -18,8 +19,9 @@ pub fn pairwise(
siglist: String,
threshold: f64,
selection: &Selection,
output: Option<String>,
allow_failed_sigpaths: bool,
estimate_ani: bool,
output: Option<String>,
) -> Result<(), Box<dyn std::error::Error>> {
// Load all sigs into memory at once.
let collection = load_collection(
Expand Down Expand Up @@ -49,6 +51,7 @@ pub fn pairwise(
// Results written to the writer thread above.

let processed_cmp = AtomicUsize::new(0);
let ksize = selection.ksize().unwrap() as f64;

sketches.par_iter().enumerate().for_each(|(idx, query)| {
for against in sketches.iter().skip(idx + 1) {
Expand All @@ -58,10 +61,24 @@ pub fn pairwise(

let containment_q1_in_q2 = overlap / query1_size;
let containment_q2_in_q1 = overlap / query2_size;
let max_containment = containment_q1_in_q2.max(containment_q2_in_q1);
let jaccard = overlap / (query1_size + query2_size - overlap);

if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold {
let max_containment = containment_q1_in_q2.max(containment_q2_in_q1);
let jaccard = overlap / (query1_size + query2_size - overlap);
let mut query_ani = None;
let mut match_ani = None;
let mut average_containment_ani = None;
let mut max_containment_ani = None;

// estimate ANI values
if estimate_ani {
let qani = ani_from_containment(containment_q1_in_q2, ksize) * 100.0;
let mani = ani_from_containment(containment_q2_in_q1, ksize) * 100.0;
query_ani = Some(qani);
match_ani = Some(mani);
average_containment_ani = Some((qani + mani) / 2.);
max_containment_ani = Some(f64::max(qani, mani));
}
send.send(MultiSearchResult {
query_name: query.name.clone(),
query_md5: query.md5sum.clone(),
Expand All @@ -71,6 +88,10 @@ pub fn pairwise(
max_containment,
jaccard,
intersect_hashes: overlap,
query_ani,
match_ani,
average_containment_ani,
max_containment_ani,
})
.unwrap();
}
Expand Down
6 changes: 6 additions & 0 deletions src/python/sourmash_plugin_branchwater/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ def __init__(self, p):
help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)')
p.add_argument('-c', '--cores', default=0, type=int,
help='number of cores to use (default is all available)')
p.add_argument('-a', '--ani', action='store_true',
help='estimate ANI from containment')

def main(self, args):
print_version()
Expand All @@ -269,6 +271,7 @@ def main(self, args):
args.ksize,
args.scaled,
args.moltype,
args.ani,
args.output)
if status == 0:
notify(f"...multisearch is done! results in '{args.output}'")
Expand All @@ -294,6 +297,8 @@ def __init__(self, p):
help = 'molecule type (DNA, protein, dayhoff, or hp; default DNA)')
p.add_argument('-c', '--cores', default=0, type=int,
help='number of cores to use (default is all available)')
p.add_argument('-a', '--ani', action='store_true',
help='estimate ANI from containment')

def main(self, args):
print_version()
Expand All @@ -310,6 +315,7 @@ def main(self, args):
args.ksize,
args.scaled,
args.moltype,
args.ani,
args.output)
if status == 0:
notify(f"...pairwise is done! results in '{args.output}'")
Expand Down
Loading

0 comments on commit 6bc49df

Please sign in to comment.