sourmash-bio · luizirber · Apr 1, 2021 · Mar 28, 2021 · Mar 28, 2021 · Mar 29, 2021
diff --git a/src/sourmash/index.py b/src/sourmash/index.py
@@ -7,6 +7,8 @@
 
 
 class Index(ABC):
+    is_database = False
+
     @abstractmethod
     def signatures(self):
         "Return an iterator over all signatures in the Index object."
@@ -124,8 +126,55 @@ def gather(self, query, *args, **kwargs):
         return results
 
     @abstractmethod
-    def select(self, ksize=None, moltype=None):
-        ""
+    def select(self, ksize=None, moltype=None, scaled=None, num=None,
+               abund=None, containment=None):
+        """Return Index containing only signatures that match requirements.
+
+        Current arguments can be any or all of:
+        * ksize
+        * moltype
+        * scaled
+        * num
+        * abund
+        * containment
+
+        'select' will raise ValueError if the requirements are incompatible
+        with the Index subclass.
+
+        'select' may return an empty object or None if no matches are found.
+        """
+
+
+def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0,
+                     containment=False):
+    """Return True if signature 'ss' meets every requested requirement.
+
+    Requirements left as None, zero, or False are not checked. Raises
+    ValueError if 'containment' is requested without 'scaled'.
+    """
+    if ksize and ksize != ss.minhash.ksize:
+        return False
+    if moltype and moltype != ss.minhash.moltype:
+        return False
+
+    # containment requires 'scaled'; similarity does not.
+    if containment:
+        if not scaled:
+            raise ValueError("'containment' requires 'scaled' in Index.select'")
+        if not ss.minhash.scaled:
+            return False
+
+    # a 'scaled' request rejects 'num' sketches.
+    if scaled and ss.minhash.num:
+        return False
+
+    # a 'num' request rejects 'scaled' sketches; for now 'num' must be
+    # identical (this can be changed later).
+    if num and (ss.minhash.scaled or num != ss.minhash.num):
+        return False
+
+    return True
+
 
 class LinearIndex(Index):
     "An Index for a collection of signatures. Can load from a .sig file."
@@ -157,18 +206,17 @@ def load(cls, location):
         lidx = LinearIndex(si, filename=location)
         return lidx
 
-    def select(self, ksize=None, moltype=None):
-        def select_sigs(ss, ksize=ksize, moltype=moltype):
-            if (ksize is None or ss.minhash.ksize == ksize) and \
-               (moltype is None or ss.minhash.moltype == moltype):
-               return True
+    def select(self, **kwargs):
+        """Return new LinearIndex containing only signatures that match req's.
 
-        return self.filter(select_sigs)
+        May return an empty Index; 'containment' w/o 'scaled' raises ValueError.
+        """
+        # drop None/zero-valued requirements so they count as "not requested"
+        kw = { k : v for (k, v) in kwargs.items() if v }
 
-    def filter(self, filter_fn):
         siglist = []
         for ss in self._signatures:
-            if filter_fn(ss):
+            if select_signature(ss, **kw):
                 siglist.append(ss)
 
         return LinearIndex(siglist, self.filename)
@@ -260,11 +308,12 @@ def load_from_file_list(cls, filename):
     def save(self, *args):
         raise NotImplementedError
 
-    def select(self, ksize=None, moltype=None):
+    def select(self, **kwargs):
+        "Run 'select' on all indices within this MultiIndex."
         new_idx_list = []
         new_src_list = []
         for idx, src in zip(self.index_list, self.source_list):
-            idx = idx.select(ksize=ksize, moltype=moltype)
+            idx = idx.select(**kwargs)
             new_idx_list.append(idx)
             new_src_list.append(src)
 

diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py
@@ -55,6 +55,8 @@ class LCA_Database(Index):
     `hashval_to_idx` is a dictionary from individual hash values to sets of
     `idx`.
     """
+    is_database = True
+
     def __init__(self, ksize, scaled, moltype='DNA'):
         self.ksize = int(ksize)
         self.scaled = int(scaled)
@@ -169,18 +171,29 @@ def signatures(self):
         for v in self._signatures.values():
             yield v
 
-    def select(self, ksize=None, moltype=None):
-        "Selector interface - make sure this database matches requirements."
-        ok = True
+    def select(self, ksize=None, moltype=None, num=0, scaled=0,
+               containment=False):
+        """Make sure this database matches the requested requirements.
+
+        As with SBTs, queries with higher scaled values than the database
+        can still be used for containment search, but not for similarity
+        search. See SBT.select(...) for details, and _find_signatures for
+        implementation.
+
+        Will always raise ValueError if a requirement cannot be met.
+        """
+        if num:
+            raise ValueError("cannot use 'num' MinHashes to search LCA database")
+        # 'scaled' may be None or 0 when unset; only compare a real value.
+        if scaled and scaled > self.scaled and not containment:
+            raise ValueError(f"cannot use scaled={scaled} on this database (scaled={self.scaled})")
+
         if ksize is not None and self.ksize != ksize:
-            ok = False
+            raise ValueError(f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}")
         if moltype is not None and moltype != self.moltype:
-            ok = False
+            raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}")
 
-        if ok:
-            return self
-
-        raise ValueError("cannot select LCA on ksize {} / moltype {}".format(ksize, moltype))
+        return self
 
     @classmethod
     def load(cls, db_name):
@@ -467,12 +480,16 @@ def _find_signatures(self, minhash, threshold, containment=False,
         This is essentially a fast implementation of find that collects all
         the signatures with overlapping hash values. Note that similarity
         searches (containment=False) will not be returned in sorted order.
+
+        As with SBTs, queries with higher scaled values than the database
+        can still be used for containment search, but not for similarity
+        search. See SBT.select(...) for details.
         """
         # make sure we're looking at the same scaled value as database
         if self.scaled > minhash.scaled:
             minhash = minhash.downsample(scaled=self.scaled)
         elif self.scaled < minhash.scaled and not ignore_scaled:
-            # note that containment cannot be calculated w/o matching scaled.
+            # note that similarity cannot be calculated w/o matching scaled.
             raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))
 
         query_mins = set(minhash.hashes)

diff --git a/src/sourmash/logging.py b/src/sourmash/logging.py
@@ -41,6 +41,17 @@ def debug(s, *args, **kwargs):
         sys.stderr.flush()
 
 
+def debug_literal(s, *args, **kwargs):
+    "Print 's' to stderr verbatim; a no-op unless debugging and not quiet."
+    # '*args' is accepted but unused: 's' is never formatted here.
+    if not _quiet and _debug:
+        # return to column 0 and erase the current line before printing.
+        print(u'\r\033[K', end=u'', file=sys.stderr)
+        print(s, end=kwargs.get('end', u'\n'), file=sys.stderr)
+        if kwargs.get('flush'):
+            sys.stderr.flush()
+
+
 def error(s, *args, **kwargs):
     "A simple error logging function => stderr."
     print(u'\r\033[K', end=u'', file=sys.stderr)

diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py
@@ -171,6 +171,7 @@ class SBT(Index):
     We use two dicts to store the tree structure: One for the internal nodes,
     and another for the leaves (datasets).
     """
+    is_database = True
 
     def __init__(self, factory, *, d=2, storage=None, cache_size=None):
         self.factory = factory
@@ -189,19 +190,60 @@ def signatures(self):
         for k in self.leaves():
             yield k.data
 
-    def select(self, ksize=None, moltype=None):
-        first_sig = next(iter(self.signatures()))
+    def select(self, ksize=None, moltype=None, num=0, scaled=0,
+               containment=False):
+        """Make sure this database matches the requested requirements.
 
-        ok = True
-        if ksize is not None and first_sig.minhash.ksize != ksize:
-            ok = False
-        if moltype is not None and first_sig.minhash.moltype != moltype:
-            ok = False
+        Will always raise ValueError if a requirement cannot be met.
 
-        if ok:
-            return self
+        The only tricky bit here is around downsampling: if the scaled
+        value being requested is higher than the signatures in the
+        SBT, we can use the SBT for containment but not for
+        similarity. This is because:
 
-        raise ValueError("cannot select SBT on ksize {} / moltype {}".format(ksize, moltype))
+        * if we are doing containment searches, the intermediate nodes
+          can still be used for calculating containment of signatures
+          with higher scaled values. This is because only hashes that match
+          in the higher range are used for containment scores.
+        * however, for similarity, _all_ hashes are used, and we cannot
+          implicitly downsample or necessarily estimate similarity if
+          the scaled values differ.
+        """
+        # the first signature stands in for the whole SBT in every check below.
+        first_sig = next(iter(self.signatures()))
+        db_mh = first_sig.minhash
+
+        # check ksize.
+        if ksize is not None and db_mh.ksize != ksize:
+            raise ValueError(f"search ksize {ksize} is different from database ksize {db_mh.ksize}")
+
+        # check moltype.
+        if moltype is not None and db_mh.moltype != moltype:
+            raise ValueError(f"search moltype {moltype} is different from database moltype {db_mh.moltype}")
+
+        # containment requires 'scaled'.
+        if containment:
+            if not scaled:
+                raise ValueError("'containment' requires 'scaled' in SBT.select'")
+            if not db_mh.scaled:
+                raise ValueError("cannot search this SBT for containment; signatures are not calculated with scaled")
+
+        # 'num' and 'scaled' do not mix.
+        if num:
+            if not db_mh.num:
+                raise ValueError("this database was created with 'scaled' MinHash sketches, not 'num'")
+            if num != db_mh.num:
+                raise ValueError(f"num mismatch for SBT: num={num}, {db_mh.num}")
+
+        if scaled:
+            if not db_mh.scaled:
+                raise ValueError("this database was created with 'num' MinHash sketches, not 'scaled'")
+
+            # a higher search scaled is only usable for containment operations.
+            if scaled > db_mh.scaled and not containment:
+                raise ValueError(f"search scaled value {scaled} is greater than database scaled value of {db_mh.scaled}")
+
+        return self
 
     def new_node_pos(self, node):
         if not self._nodes: