Skip to content

Commit

Permalink
Merge branch 'main' into sigcat
Browse files: browse the repository at this point in the history
  • Loading branch information
bluegenes committed Sep 18, 2024
2 parents 4b97cfc + cf5b757 commit be71f5e
Show file tree
Hide file tree
Showing 24 changed files with 822 additions and 623 deletions.
350 changes: 91 additions & 259 deletions Cargo.lock

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash_plugin_branchwater"
version = "0.9.6"
version = "0.9.8-dev"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand All @@ -9,30 +9,30 @@ name = "sourmash_plugin_branchwater"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.22.1", features = ["extension-module", "anyhow"] }
pyo3 = { version = "0.22.3", features = ["extension-module", "anyhow"] }
rayon = "1.10.0"
serde = { version = "1.0.204", features = ["derive"] }
sourmash = { version = "0.14.1", features = ["branchwater"] }
serde_json = "1.0.120"
serde = { version = "1.0.210", features = ["derive"] }
sourmash = { version = "0.15.1", features = ["branchwater"] }
serde_json = "1.0.128"
niffler = "2.4.0"
log = "0.4.22"
env_logger = { version = "0.11.3", optional = true }
env_logger = { version = "0.11.5", optional = true }
simple-error = "0.3.1"
anyhow = "1.0.86"
anyhow = "1.0.89"
zip = { version = "2.0", default-features = false }
tempfile = "3.10"
tempfile = "3.12"
needletail = "0.5.1"
csv = "1.3.0"
camino = "1.1.7"
camino = "1.1.9"
glob = "0.3.1"
rustworkx-core = "0.15.1"
streaming-stats = "0.2.3"

[dev-dependencies]
assert_cmd = "2.0.14"
assert_cmd = "2.0.16"
assert_matches = "1.5.0"
predicates = "3.1.0"
tempfile = "3.10.0"
predicates = "3.1.2"
tempfile = "3.12.0"

[profile.release]
#target-cpu=native
Expand Down
5 changes: 2 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.PHONY: all install test wheel sdist upload_dist

PYTHON ?= python

all:
Expand All @@ -6,9 +8,6 @@ all:
install:
$(PYTHON) -m pip install -e .

clean:
$(PYTHON) -m pip uninstall .

test:
$(PYTHON) -m pytest

Expand Down
235 changes: 198 additions & 37 deletions doc/README.md

Large diffs are not rendered by default.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ sigcat = "sourmash_plugin_branchwater:Branchwater_SigCat"

[project.optional-dependencies]
test = [
"pytest>=6.2.4,<8.3.0",
"pytest>=6.2.4,<8.4.0",
"pytest-cov>=2.12,<6.0",
"pytest-xdist",
"pandas",
Expand All @@ -44,8 +44,5 @@ test = [
[tool.maturin]
python-source = "src/python"

[tool.maturin.target.x86_64-apple-darwin]
macos-deployment-target = "10.14"

[metadata]
license = { text = "GNU Affero General Public License v3" }
67 changes: 66 additions & 1 deletion src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
use anyhow::Result;
use rayon::prelude::*;

use sourmash::selection::Selection;
use sourmash::prelude::ToWriter;
use sourmash::{selection::Selection, signature::SigsTrait};

use std::sync::atomic;
use std::sync::atomic::AtomicUsize;
Expand All @@ -11,6 +12,13 @@ use std::collections::BinaryHeap;

use camino::Utf8Path as PathBuf;

use std::collections::HashSet;
use std::fs::File;

use sourmash::signature::Signature;
use sourmash::sketch::minhash::KmerMinHash;
use sourmash::sketch::Sketch;

use crate::utils::{
consume_query_by_gather, load_collection, load_sketches, write_prefetch, PrefetchResult,
ReportType,
Expand All @@ -23,6 +31,8 @@ pub fn fastmultigather(
scaled: usize,
selection: &Selection,
allow_failed_sigpaths: bool,
save_matches: bool,
create_empty_results: bool,
) -> Result<()> {
let allow_empty_collection = false;
// load query collection
Expand Down Expand Up @@ -73,12 +83,23 @@ pub fn fastmultigather(
let prefix = name.split(' ').next().unwrap_or_default().to_string();
let location = PathBuf::new(&prefix).file_name().unwrap();
if let Some(query_mh) = query_sig.minhash() {
let mut matching_hashes = if save_matches { Some(Vec::new()) } else { None };
let matchlist: BinaryHeap<PrefetchResult> = against
.iter()
.filter_map(|against| {
let mut mm: Option<PrefetchResult> = None;
if let Ok(overlap) = against.minhash.count_common(query_mh, false) {
if overlap >= threshold_hashes {
if save_matches {
if let Ok(intersection) =
against.minhash.intersection(query_mh)
{
matching_hashes
.as_mut()
.unwrap()
.extend(intersection.0);
}
}
let result = PrefetchResult {
name: against.name.clone(),
md5sum: against.md5sum.clone(),
Expand Down Expand Up @@ -108,8 +129,52 @@ pub fn fastmultigather(
Some(gather_output),
)
.ok();

// Save matching hashes to .sig file if save_matches is true
if save_matches {
if let Some(hashes) = matching_hashes {
let sig_filename = format!("{}.matches.sig", name);
if let Ok(mut file) = File::create(&sig_filename) {
let unique_hashes: HashSet<u64> = hashes.into_iter().collect();
let mut new_mh = KmerMinHash::new(
query_mh.scaled().try_into().unwrap(),
query_mh.ksize().try_into().unwrap(),
query_mh.hash_function().clone(),
query_mh.seed(),
false,
query_mh.num(),
);
new_mh
.add_many(&unique_hashes.into_iter().collect::<Vec<_>>())
.ok();
let mut signature = Signature::default();
signature.push(Sketch::MinHash(new_mh));
signature.set_filename(&name);
if let Err(e) = signature.to_writer(&mut file) {
eprintln!("Error writing signature file: {}", e);
}
} else {
eprintln!("Error creating signature file: {}", sig_filename);
}
}
}
} else {
println!("No matches to '{}'", location);
if create_empty_results {
let prefetch_output = format!("{}.prefetch.csv", location);
let gather_output = format!("{}.gather.csv", location);
// touch output files
match std::fs::File::create(&prefetch_output) {
Ok(_) => {}
Err(e) => {
eprintln!("Failed to create empty prefetch output: {}", e)
}
}
match std::fs::File::create(&gather_output) {
Ok(_) => {}
Err(e) => eprintln!("Failed to create empty gather output: {}", e),
}
}
}
} else {
// different warning here? Could not load sig from record??
Expand Down
8 changes: 7 additions & 1 deletion src/index.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use sourmash::index::revindex::RevIndex;
use sourmash::index::revindex::RevIndexOps;
use sourmash::prelude::*;
use std::path::Path;

Expand All @@ -10,6 +11,7 @@ pub fn index<P: AsRef<Path>>(
output: P,
colors: bool,
allow_failed_sigpaths: bool,
use_internal_storage: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading siglist");
let allow_empty_collection = false;
Expand All @@ -22,11 +24,15 @@ pub fn index<P: AsRef<Path>>(
allow_empty_collection,
)?;

RevIndex::create(
let mut index = RevIndex::create(
output.as_ref(),
collection.select(selection)?.try_into()?,
colors,
)?;

if use_internal_storage {
index.internalize_storage()?;
}

Ok(())
}
20 changes: 19 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ mod sigcat;
use camino::Utf8PathBuf as PathBuf;

#[pyfunction]
#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None))]
fn do_manysearch(
querylist_path: String,
siglist_path: String,
Expand Down Expand Up @@ -73,6 +74,7 @@ fn do_manysearch(

#[pyfunction]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (query_filename, siglist_path, threshold_bp, ksize, scaled, moltype, output_path_prefetch=None, output_path_gather=None))]
fn do_fastgather(
query_filename: String,
siglist_path: String,
Expand Down Expand Up @@ -105,6 +107,7 @@ fn do_fastgather(
}

#[pyfunction]
#[pyo3(signature = (query_filenames, siglist_path, threshold_bp, ksize, scaled, moltype, output_path=None, save_matches=false, create_empty_results=false))]
fn do_fastmultigather(
query_filenames: String,
siglist_path: String,
Expand All @@ -113,6 +116,8 @@ fn do_fastmultigather(
scaled: usize,
moltype: String,
output_path: Option<String>,
save_matches: bool,
create_empty_results: bool,
) -> anyhow::Result<u8> {
let againstfile_path: camino::Utf8PathBuf = siglist_path.clone().into();
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
Expand Down Expand Up @@ -145,6 +150,8 @@ fn do_fastmultigather(
scaled,
&selection,
allow_failed_sigpaths,
save_matches,
create_empty_results,
) {
Ok(_) => Ok(0),
Err(e) => {
Expand Down Expand Up @@ -180,10 +187,18 @@ fn do_index(
moltype: String,
output: String,
colors: bool,
use_internal_storage: bool,
) -> anyhow::Result<u8> {
let selection = build_selection(Some(ksize), Some(scaled), Some(&moltype));
let allow_failed_sigpaths = false;
match index::index(siglist, &selection, output, colors, allow_failed_sigpaths) {
match index::index(
siglist,
&selection,
output,
colors,
allow_failed_sigpaths,
use_internal_storage,
) {
Ok(_) => Ok(0),
Err(e) => {
eprintln!("Error: {e}");
Expand All @@ -205,6 +220,7 @@ fn do_check(index: String, quick: bool) -> anyhow::Result<u8> {
}

#[pyfunction]
#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, estimate_ani, output_path=None))]
#[allow(clippy::too_many_arguments)]
fn do_multisearch(
querylist_path: String,
Expand Down Expand Up @@ -238,6 +254,7 @@ fn do_multisearch(

#[pyfunction]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (siglist_path, threshold, ksize, scaled, moltype, estimate_ani, write_all, output_path=None))]
fn do_pairwise(
siglist_path: String,
threshold: f64,
Expand Down Expand Up @@ -285,6 +302,7 @@ fn do_manysketch(
}

#[pyfunction]
#[pyo3(signature = (pairwise_csv, output_clusters, similarity_column, similarity_threshold, cluster_sizes=None))]
fn do_cluster(
pairwise_csv: String,
output_clusters: String,
Expand Down
4 changes: 2 additions & 2 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ pub fn manysearch(
let average_abund = sum_weighted_overlap as f64 / abunds.len() as f64;
let median_abund = median(abunds.iter().cloned()).unwrap();
let std_abund = stddev(abunds.iter().cloned());
(Some(sum_all_abunds), Some(sum_weighted_overlap as usize), average_abund, median_abund, std_abund)
(Some(sum_all_abunds), Some(sum_weighted_overlap as usize), Some(average_abund), Some(median_abund), Some(std_abund))
}
Err(e) => {
eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e);
continue;
}
}
} else {
(None, None, 1.0, 1.0, 0.0)
(None, None, None, None, None)
};

results.push(SearchResult {
Expand Down
6 changes: 3 additions & 3 deletions src/mastiff_manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ pub fn mastiff_manysearch(
jaccard: None,
max_containment: None,
// can't calculate from here -- need to get these from w/in sourmash
average_abund: 1.0,
median_abund: 1.0,
std_abund: 0.0,
average_abund: None,
median_abund: None,
std_abund: None,
query_containment_ani,
match_containment_ani: None,
average_containment_ani: None,
Expand Down
Loading

0 comments on commit be71f5e

Please sign in to comment.