diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..832e891af1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,14 @@ +version: 2 +updates: +- package-ecosystem: pip + directory: "/" + schedule: + interval: daily + time: "13:00" + open-pull-requests-limit: 10 +- package-ecosystem: cargo + directory: "/" + schedule: + interval: daily + time: "13:00" + open-pull-requests-limit: 10 diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 7992d7fe30..3fb131cb3d 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -125,8 +125,8 @@ def prepare_query(query_mh, subj_mh): if search_fn.passes(score): # note: here we yield the original signature, not the # downsampled minhash. - search_fn.collect(score) - yield subj, score + if search_fn.collect(score, subj): + yield subj, score def search_abund(self, query, *, threshold=None, **kwargs): """Return set of matches with angular similarity above 'threshold'. diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 0a5fd8a57b..4af77b5a5b 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -462,9 +462,14 @@ def find(self, search_fn, query, **kwargs): score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) + + # note to self: even with JaccardSearchBestOnly, this will + # still iterate over & score all signatures. We should come + # up with a protocol by which the JaccardSearch object can + # signal that it is done, or something. if search_fn.passes(score): - search_fn.collect(score) - yield subj, score + if search_fn.collect(score, subj): + yield subj, score @cached_property def lid_to_idx(self): diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index fed0bb7a62..af9617235e 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -436,9 +436,12 @@ def node_search(node, *args, **kwargs): if search_fn.passes(score): if is_leaf: # terminal node? keep. - results[node.data] = score - search_fn.collect(score) - return True + if search_fn.collect(score, node.data): + results[node.data] = score + return True + else: # it's a good internal node, keep. + return True + return False # & execute! diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 461f0e7d88..0106e8de95 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -69,7 +69,8 @@ def make_gather_query(query_mh, threshold_bp): if threshold > 1.0: return None - search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, threshold=threshold) + search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, + threshold=threshold) return search_obj @@ -111,14 +112,20 @@ def check_is_compatible(self, sig): raise TypeError("this search cannot be done with an abund signature") def passes(self, score): - "Return True if this score meets or exceeds the threshold." + """Return True if this score meets or exceeds the threshold. + + Note: this can be used whenever a score or estimate is available + (e.g. internal nodes on an SBT). `collect(...)`, below, decides + whether a particular signature should be collected, and/or can + update the threshold (used for BestOnly behavior). + """ if score and score >= self.threshold: return True return False - def collect(self, score): - "Is this a potential match?" - pass + def collect(self, score, match_sig): + "Return True if this match should be collected." + return True def score_jaccard(self, query_size, shared_size, subject_size, total_size): "Calculate Jaccard similarity." @@ -142,9 +149,10 @@ def score_max_containment(self, query_size, shared_size, subject_size, class JaccardSearchBestOnly(JaccardSearch): "A subclass of JaccardSearch that implements best-only." - def collect(self, score): + def collect(self, score, match): "Raise the threshold to the best match found so far." self.threshold = max(self.threshold, score) + return True # generic SearchResult tuple. diff --git a/tests/conftest.py b/tests/conftest.py index 91a12dd0a9..f4badac793 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,14 +1,24 @@ import os -import matplotlib.pyplot as plt -plt.rcParams.update({'figure.max_open_warning': 0}) - from hypothesis import settings, Verbosity import pytest import matplotlib.pyplot as plt plt.rcParams.update({'figure.max_open_warning': 0}) +from sourmash_tst_utils import TempDirectory, RunnerContext + + +@pytest.fixture +def runtmp(): + with TempDirectory() as location: + yield RunnerContext(location) + + +@pytest.fixture +def run(): + yield RunnerContext(os.getcwd()) + @pytest.fixture(params=[True, False]) def track_abundance(request): diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 3c84ce1f59..2ab0175e55 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -1,5 +1,4 @@ "Various utilities used by sourmash tests." - import sys import os import tempfile @@ -12,10 +11,7 @@ from pkg_resources import Requirement, resource_filename, ResolutionError import traceback from io import open # pylint: disable=redefined-builtin -try: - from StringIO import StringIO -except ImportError: - from io import StringIO +from io import StringIO SIG_FILES = [os.path.join('demo', f) for f in ( @@ -193,6 +189,7 @@ def run_sourmash(self, *args, **kwargs): raise ValueError(self) return self.last_result + sourmash = run_sourmash def run(self, scriptname, *args, **kwargs): "Run a script with the given arguments." diff --git a/tests/test_index.py b/tests/test_index.py index 01cadb6cec..2227010eaa 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -14,6 +14,7 @@ from sourmash.sbt import SBT, GraphFactory, Leaf from sourmash.sbtmh import SigLeaf from sourmash import sourmash_args +from sourmash.search import JaccardSearch, SearchType import sourmash_tst_utils as utils @@ -1081,3 +1082,128 @@ def test_multi_index_load_from_pathlist_3_zipfile(c): mi = MultiIndex.load_from_pathlist(file_list) assert len(mi) == 7 + +## +## test a slightly outre version of JaccardSearch - this is a test of the +## JaccardSearch 'collect' protocol, in particular... +## + +class JaccardSearchBestOnly_ButIgnore(JaccardSearch): + "A class that ignores certain results, but still does all the pruning." + def __init__(self, ignore_list): + super().__init__(SearchType.JACCARD, threshold=0.1) + self.ignore_list = ignore_list + + # a collect function that _ignores_ things in the ignore_list + def collect(self, score, match): + print('in collect; current threshold:', self.threshold) + for q in self.ignore_list: + print('ZZZ', match, match.similarity(q)) + if match.similarity(q) == 1.0: + print('yes, found.') + return False + + # update threshold if not perfect match, which could help prune. + self.threshold = score + return True + + +def test_linear_index_gather_ignore(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47, ksize=31) + ss63 = sourmash.load_one_signature(sig63, ksize=31) + + # construct an index... + lidx = LinearIndex([ss2, ss47, ss63]) + + # ...now search with something that should ignore sig47, the exact match. + search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) + + results = list(lidx.find(search_fn, ss47)) + results = [ ss for (ss, score) in results ] + + def is_found(ss, xx): + for q in xx: + print(ss, ss.similarity(q)) + if ss.similarity(q) == 1.0: + return True + return False + + assert not is_found(ss47, results) + assert not is_found(ss2, results) + assert is_found(ss63, results) + + +def test_lca_index_gather_ignore(): + from sourmash.lca import LCA_Database + + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47, ksize=31) + ss63 = sourmash.load_one_signature(sig63, ksize=31) + + # construct an index... + db = LCA_Database(ksize=31, scaled=1000) + db.insert(ss2) + db.insert(ss47) + db.insert(ss63) + + # ...now search with something that should ignore sig47, the exact match. + search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) + + results = list(db.find(search_fn, ss47)) + results = [ ss for (ss, score) in results ] + + def is_found(ss, xx): + for q in xx: + print(ss, ss.similarity(q)) + if ss.similarity(q) == 1.0: + return True + return False + + assert not is_found(ss47, results) + assert not is_found(ss2, results) + assert is_found(ss63, results) + + +def test_sbt_index_gather_ignore(): + sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + ss2 = sourmash.load_one_signature(sig2, ksize=31) + ss47 = sourmash.load_one_signature(sig47, ksize=31) + ss63 = sourmash.load_one_signature(sig63, ksize=31) + + # construct an index... + factory = GraphFactory(5, 100, 3) + db = SBT(factory, d=2) + + db.insert(ss2) + db.insert(ss47) + db.insert(ss63) + + # ...now search with something that should ignore sig47, the exact match. + print(f'\n** trying to ignore {ss47}') + search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) + + results = list(db.find(search_fn, ss47)) + results = [ ss for (ss, score) in results ] + + def is_found(ss, xx): + for q in xx: + print('is found?', ss, ss.similarity(q)) + if ss.similarity(q) == 1.0: + return True + return False + + assert not is_found(ss47, results) + assert not is_found(ss2, results) + assert is_found(ss63, results) diff --git a/tests/test_search.py b/tests/test_search.py index efe61ea809..d52582b0cc 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -118,13 +118,13 @@ def test_score_jaccard_max_containment_zero_query_size(): def test_collect(): search_obj = make_jaccard_search_query(threshold=0) - search_obj.collect(1.0) + search_obj.collect(1.0, None) assert search_obj.threshold == 0 def test_collect_best_only(): search_obj = make_jaccard_search_query(threshold=0, best_only=True) - search_obj.collect(1.0) + search_obj.collect(1.0, None) assert search_obj.threshold == 1.0 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 8d6009c0f2..135efae65b 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -859,15 +859,14 @@ def test_gather_query_db_md5_ambiguous(c): assert "Error! Multiple signatures start with md5 '1'" in err -@utils.in_tempdir -def test_gather_lca_db(c): +def test_gather_lca_db(runtmp): # can we do a 'sourmash gather' on an LCA database? query = utils.get_test_data('47+63.fa.sig') lca_db = utils.get_test_data('lca/47+63.lca.json') - c.run_sourmash('gather', query, lca_db) - print(c) - assert 'NC_009665.1 Shewanella baltica OS185' in str(c.last_result.out) + runtmp.sourmash('gather', query, lca_db) + print(runtmp) + assert 'NC_009665.1 Shewanella baltica OS185' in str(runtmp.last_result.out) @utils.in_tempdir @@ -1443,19 +1442,18 @@ def test_search_containment_s10(): assert '16.7%' in out -@utils.in_thisdir -def test_search_containment_s10_no_max(c): +def test_search_containment_s10_no_max(run): # check --containment for s10/s10-small q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig') with pytest.raises(ValueError) as exc: - c.run_sourmash('search', q1, q2, '--containment', + run.run_sourmash('search', q1, q2, '--containment', '--max-containment') - print(c.last_result.out) - print(c.last_result.err) - assert "ERROR: cannot specify both --containment and --max-containment!" in c.last_result.err + print(run.last_result.out) + print(run.last_result.err) + assert "ERROR: cannot specify both --containment and --max-containment!" in run.last_result.err def test_search_max_containment_s10_pairwise(): @@ -4118,12 +4116,11 @@ def test_gather_abund_10_1_ignore_abundance(c): some_results = False for row in r: some_results = True - assert row['average_abund'] is '' - assert row['median_abund'] is '' - assert row['std_abund'] is '' + assert row['average_abund'] == '' + assert row['median_abund'] == '' + assert row['std_abund'] == '' assert some_results - @utils.in_tempdir diff --git a/tox.ini b/tox.ini index 7ab78f4187..228f689f07 100644 --- a/tox.ini +++ b/tox.ini @@ -46,6 +46,7 @@ commands = pytest \ --cov "{envsitepackagesdir}/sourmash" \ --cov . \ --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ --junitxml {toxworkdir}/junit.{envname}.xml \ {posargs:.} @@ -59,6 +60,7 @@ commands = pytest \ --cov "{envsitepackagesdir}/sourmash" \ --cov . \ --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ --junitxml {toxworkdir}/junit.{envname}.xml \ --run-hypothesis \ --hypothesis-show-statistics \ @@ -72,6 +74,7 @@ commands = pytest \ --cov "{envsitepackagesdir}/sourmash" \ --cov . \ --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ --junitxml {toxworkdir}/junit.{envname}.xml \ -k test_nodegraph \ {posargs:.} @@ -83,6 +86,7 @@ commands = pytest \ --cov "{envsitepackagesdir}/sourmash" \ --cov . \ --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ --junitxml {toxworkdir}/junit.{envname}.xml \ -k test_nodegraph \ {posargs:.}