Skip to content

Commit

Permalink
add a flexible progressive signature output class
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Apr 27, 2021
1 parent 18d72c4 commit b1d54df
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 33 deletions.
50 changes: 17 additions & 33 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os.path
import sys
import copy
import gzip

import screed
from .compare import (compare_all_pairs, compare_serial_containment,
Expand Down Expand Up @@ -1075,24 +1076,15 @@ def prefetch(args):
csvout_w = csv.DictWriter(csvout_fp, fieldnames=fieldnames)
csvout_w.writeheader()

# save matches to a directory?
matches_outdir = None
if args.save_matches and args.save_matches.endswith('/'):
matches_outdir = args.save_matches
try:
os.mkdir(matches_outdir)
except FileExistsError:
pass
except:
notify("ERROR: cannot create --save-matches directory '{}'",
args.save_matches)
sys.exit(-1)
notify("saving all matching database signatures to files under '{}'",
matches_outdir)
# track & maybe save matches progressively
from .sourmash_args import SaveMatchingSignatures
matches_out = SaveMatchingSignatures(args.save_matches)
if args.save_matches:
notify("saving all matching database signatures to '{}'",
args.save_matches)

# iterate over signatures in db one at a time, for each db;
# find those with any kind of containment.
keep = []
# find those with sufficient overlap
noident_mh = copy.copy(query_mh)
did_a_search = False # track whether we did _any_ search at all!
for dbfilename in args.databases:
Expand All @@ -1111,33 +1103,30 @@ def prefetch(args):
try:
for result in prefetch_database(query, db, args.threshold_bp):
match = result.match
# @CTB TODO: don't keep all matches in memory.
keep.append(match)

# track remaining "untouched" hashes.
noident_mh.remove_many(match.minhash.hashes)

# output matches as we go
# output match info as we go
if csvout_fp:
d = dict(result._asdict())
del d['match'] # actual signatures not in CSV.
del d['query']
csvout_w.writerow(d)

if matches_outdir:
md5 = result.match_md5
outname = os.path.join(matches_outdir, f"{md5}.sig")
with open(outname, "wt") as fp:
sig.save_signatures([match], fp)
# output match signatures as we go (maybe)
matches_out.add(match)

if len(keep) % 10 == 0:
notify(f"total of {len(keep)} matching signatures so far.",
if matches_out.count % 10 == 0:
notify(f"total of {matches_out.count} matching signatures so far.",
end="\r")
except ValueError as exc:
notify("ERROR in prefetch_databases:")
notify(str(exc))
sys.exit(-1)
# @CTB should we continue? or only continue if -f?
finally:
matches_out.close()

did_a_search = True

Expand All @@ -1152,22 +1141,17 @@ def prefetch(args):
notify("ERROR in prefetch: no compatible signatures in any databases?!")
sys.exit(-1)

notify(f"total of {len(keep)} matching signatures.")
notify(f"total of {matches_out.count} matching signatures.")

if csvout_fp:
notify(f"saved {len(keep)} matches to CSV file '{args.output}'")
notify(f"saved {matches_out.count} matches to CSV file '{args.output}'")
csvout_fp.close()

matched_query_mh = copy.copy(query_mh)
matched_query_mh.remove_many(noident_mh.hashes)
notify(f"of {len(query_mh)} distinct query hashes, {len(matched_query_mh)} were found in matches above threshold.")
notify(f"a total of {len(noident_mh)} query hashes remain unmatched.")

if args.save_matches and not matches_outdir:
notify("saving all matching database signatures to '{}'", args.save_matches)
with sourmash_args.FileOutput(args.save_matches, "wt") as fp:
sig.save_signatures(keep, fp)

if args.save_matching_hashes:
filename = args.save_matching_hashes
notify(f"saving {len(matched_query_mh)} matched hashes to '{filename}'")
Expand Down
103 changes: 103 additions & 0 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import itertools
from enum import Enum
import traceback
import gzip
import zipfile

import screed

Expand Down Expand Up @@ -535,3 +537,104 @@ def start_file(self, filename, loader):
self.n_sig += n_this

self.short_notify("loaded {} sigs from '{}'", n_this, filename)


#
# enum and class for saving signatures progressively
#

class SigFileSaveType(Enum):
SIGFILE = 1
SIGFILE_GZ = 2
DIRECTORY = 3
ZIPFILE = 4
NO_OUTPUT = 5


class SaveMatchingSignatures:
# @CTB filename or fp?
# @CTB stdout?
# @CTB context manager?
# @CTB use elsewhere?
def __init__(self, filename, force_type=None):
save_type = None
if not force_type:
if filename is None:
save_type = SigFileSaveType.NO_OUTPUT
elif filename.endswith('/'):
save_type = SigFileSaveType.DIRECTORY
elif filename.endswith('.gz'):
save_type = SigFileSaveType.SIGFILE_GZ
elif filename.endswith('.zip'):
save_type = SigFileSaveType.ZIPFILE
else:
save_type = SigFileSaveType.SIGFILE
else:
save_type = force_type

self.filename = filename
self.save_type = save_type
self.count = 0

self.open()

def open(self):
if self.save_type == SigFileSaveType.NO_OUTPUT:
pass
elif self.save_type == SigFileSaveType.DIRECTORY:
try:
os.mkdir(self.filename)
except FileExistsError:
pass
except:
notify("ERROR: cannot create signature output directory '{}'",
self.filename)
sys.exit(-1)
elif self.save_type == SigFileSaveType.SIGFILE:
self.keep = []
elif self.save_type == SigFileSaveType.SIGFILE_GZ:
self.keep = []
elif self.save_type == SigFileSaveType.ZIPFILE:
self.zf = zipfile.ZipFile(self.filename, 'w',
zipfile.ZIP_DEFLATED,
compresslevel=9)
else:
assert 0

def close(self):
if self.save_type == SigFileSaveType.NO_OUTPUT:
pass
elif self.save_type == SigFileSaveType.DIRECTORY:
pass
elif self.save_type == SigFileSaveType.SIGFILE:
with open(self.filename, "wt") as fp:
sourmash.save_signatures(self.keep, fp)
elif self.save_type == SigFileSaveType.SIGFILE_GZ:
with gzip.open(self.filename, "wt") as fp:
sourmash.save_signatures(self.keep, fp)
elif self.save_type == SigFileSaveType.ZIPFILE:
self.zf.close()
else:
assert 0

def add(self, ss):
if self.save_type == SigFileSaveType.NO_OUTPUT:
pass
elif self.save_type == SigFileSaveType.DIRECTORY:
md5 = ss.md5sum()[:8]
outname = os.path.join(self.filename, f"{md5}.sig.gz")
with gzip.open(outname, "wt") as fp:
sig.save_signatures([ss], fp)
elif self.save_type == SigFileSaveType.SIGFILE:
self.keep.append(ss)
elif self.save_type == SigFileSaveType.SIGFILE_GZ:
self.keep.append(ss)
elif self.save_type == SigFileSaveType.ZIPFILE:
md5 = ss.md5sum()[:8]
outname = f"signatures/{md5}.sig.gz"
json_str = sourmash.save_signatures([ss])
self.zf.writestr(outname, json_str)
else:
assert 0

self.count += 1
1 change: 1 addition & 0 deletions tests/test_prefetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def test_prefetch_matches_to_dir(c):

assert c.last_result.status == 0
assert os.path.exists(matches_out)
assert os.path.isdir(matches_out)

sigs = sourmash.load_file_as_signatures(matches_out)

Expand Down

0 comments on commit b1d54df

Please sign in to comment.