-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* upgrade 'manifest' documentation, cli help
* alias fileinfo to summarize
* flakes cleanup
* rescue shadowed tests
* rescue shadowed tests
* rescue shadowed tests
* add 'sig grep' command
* add some basic tests
* fix get manifest stuff
* fail on no manifest
* check manifest req't
* test various combinations of zip, -v, -i
* update with CSV output/manifest
* added -c/--count
* adjust output
* test fail extract
* comment tests better
* add test for count
* update docs
* remove warnings
* cleanup; create CollectionManifest.filter_rows
* create CollectionManifest.filter_on_columns
* minor cleanup
* Update src/sourmash/cli/sig/grep.py
  Co-authored-by: Tessa Pierce Ward <[email protected]>
* Add a straight up picklist test
  Co-authored-by: Tessa Pierce Ward <[email protected]>
- Loading branch information
Showing
8 changed files
with
650 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
"""extract one or more signatures by substr/regex match""" | ||
|
||
# Help text for 'sourmash sig grep'; passed as `usage=` to add_parser() in subparser(). | ||
usage=""" | ||
sourmash sig grep <pattern> <filename> [... <filenames>] | ||
This will search for the provided pattern in the files or databases, | ||
using the signature metadata, and output matching signatures. | ||
Currently 'grep' searches the 'name', 'filename', and 'md5' fields as | ||
displayed by `sig describe`. | ||
'pattern' can be a string or a regular expression. | ||
'sig grep' uses the built-in Python regexp module, 're', to implement | ||
regexp searching. See https://docs.python.org/3/howto/regex.html and | ||
https://docs.python.org/3/library/re.html for details. | ||
The '-v' (exclude), '-i' (case-insensitive), and `-c` (count) options of 'grep' are | ||
supported. | ||
'-o/--output' can be used to output matching signatures to a specific | ||
location. | ||
By default, 'sig grep' requires a pre-existing manifest for collections; | ||
this prevents potentially slow manifest rebuilding. You | ||
can turn this check off with '--no-require-manifest'. | ||
""" | ||
|
||
from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, | ||
add_picklist_args) | ||
|
||
|
||
def subparser(subparsers):
    """Register the 'grep' subcommand and all of its options.

    `subparsers` must provide `add_parser`; the new command parser is
    created from it with the module-level `usage` text. Nothing is
    returned.
    """
    grep_parser = subparsers.add_parser('grep', usage=usage)

    def add_flag(*names, help_text):
        # All boolean switches of this command are plain store_true options.
        grep_parser.add_argument(*names, action='store_true', help=help_text)

    # Positional arguments: the pattern, then zero or more inputs.
    grep_parser.add_argument('pattern', help='search pattern (string/regex)')
    grep_parser.add_argument('signatures', nargs='*')

    # Logging verbosity.
    add_flag('-q', '--quiet', help_text='suppress non-error output')
    add_flag('-d', '--debug', help_text='output debug information')

    # Output location and input loading.
    grep_parser.add_argument(
        '-o', '--output',
        metavar='FILE',
        default='-',
        help='output matching signatures to this file (default stdout)',
    )
    add_flag(
        '-f', '--force',
        help_text='try to load all files as signatures, independent of filename',
    )
    grep_parser.add_argument(
        '--from-file',
        help='a text file containing a list of files to load signatures from',
    )

    # grep-style matching switches.
    add_flag('-v', '--invert-match', help_text="select non-matching signatures")
    add_flag(
        '-i', '--ignore-case',
        help_text="ignore case distinctions (search lower and upper case both)",
    )

    # Manifest handling and reporting.
    add_flag(
        '--no-require-manifest',
        help_text='do not require a manifest; generate dynamically if needed',
    )
    grep_parser.add_argument(
        '--csv',
        help='save CSV file containing signature data in manifest format',
    )
    add_flag(
        '--silent', '--no-signatures-output',
        help_text="do not output signatures",
    )
    add_flag(
        '-c', '--count',
        help_text="only output a count of discovered signatures; implies --silent",
    )

    # Shared option groups: k-mer size (31 is passed as the second
    # argument), molecule type, and picklists.
    add_ksize_arg(grep_parser, 31)
    add_moltype_args(grep_parser)
    add_picklist_args(grep_parser)
|
||
|
||
def main(args):
    """Run 'sig grep' by delegating to `sourmash.sig.grep.main`.

    The implementation module is imported inside the function, so it is
    only loaded when this command is actually invoked.
    """
    from sourmash.sig import grep as sig_grep

    return sig_grep.main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from .__main__ import main | ||
from . import grep |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
""" | ||
Command-line entry point for 'python -m sourmash.sig grep' | ||
""" | ||
import sys | ||
import re | ||
|
||
from sourmash import logging, sourmash_args | ||
from sourmash.logging import notify, error, debug, print_results | ||
from sourmash.manifest import CollectionManifest | ||
from .__main__ import _extend_signatures_with_from_file | ||
|
||
|
||
def main(args):
    """
    Extract signatures by pattern match.

    Compiles `args.pattern` as a regular expression (case-insensitive
    when `args.ignore_case` is set) and applies it to the 'name',
    'filename' and 'md5' manifest columns of every collection listed in
    `args.signatures`. With `args.invert_match`, non-matching rows are
    selected instead.

    Output is controlled by:
      * `args.output` - where matching signatures are saved;
      * `args.silent` - do not save signatures at all;
      * `args.count`  - only print per-file match counts (forces silent);
      * `args.csv`    - also write the matching manifest rows to a CSV.

    Exits with status -1 when the pattern is not a valid regular
    expression, when a collection has no manifest and
    `args.no_require_manifest` is not set, when a collection cannot be
    restricted with a picklist, or when (in non-silent mode) no
    signature matched.
    """
    # basic argument parsing
    logging.set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)
    picklist = sourmash_args.load_picklist(args)
    _extend_signatures_with_from_file(args)

    # build the search pattern; a malformed user-supplied regex is reported
    # as an ordinary error instead of surfacing as a traceback.
    regex_flags = re.IGNORECASE if args.ignore_case else 0
    try:
        pattern = re.compile(args.pattern, regex_flags)
    except re.error as exc:
        error(f"ERROR: invalid search pattern '{args.pattern}': {exc}")
        sys.exit(-1)

    # require manifests?
    require_manifest = not args.no_require_manifest
    if require_manifest:
        debug("sig grep: manifest required")
    else:
        debug("sig grep: manifest will not be required")

    # are we doing --count? if so, enforce --silent so no sigs are printed.
    if args.count:
        args.silent = True

    # define output type: signatures, or no?
    if args.silent:
        notify("(no signatures will be saved because of --silent/--count).")
        save_sigs = sourmash_args.SaveSignaturesToLocation(None)
    else:
        notify(f"saving matching signatures to '{args.output}'")
        save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
    save_sigs.open()

    # are we outputting a CSV? if so, initialize that, too.
    csv_obj = None
    csv_fp = None
    if args.csv:
        csv_obj = sourmash_args.FileOutputCSV(args.csv)
        csv_fp = csv_obj.open()
        CollectionManifest.write_csv_header(csv_fp)

    # from here on, every exit path (including sys.exit) must close the CSV.
    try:
        # start loading!
        total_rows_examined = 0
        for filename in args.signatures:
            idx = sourmash_args.load_file_as_index(filename,
                                                   yield_all_files=args.force)

            idx = idx.select(ksize=args.ksize,
                             moltype=moltype,
                             picklist=picklist)

            # get (and maybe generate) the manifest.
            manifest = idx.manifest
            if manifest is None:
                if require_manifest:
                    error(f"ERROR on filename '{filename}'.")
                    error("sig grep requires a manifest by default, but no manifest present.")
                    error("specify --no-require-manifest to dynamically generate one.")
                    sys.exit(-1)
                else:
                    manifest = sourmash_args.get_manifest(idx,
                                                          require=False)

            # find all matching rows.
            sub_manifest = manifest.filter_on_columns(pattern.search,
                                                      ["name", "filename", "md5"],
                                                      invert=args.invert_match)
            total_rows_examined += len(manifest)

            # write out to CSV, if desired.
            if csv_obj is not None:
                sub_manifest.write_to_csv(csv_fp)

            # just print out number of matches?
            if args.count:
                print_results(f"{len(sub_manifest)} matches: {filename}")
            elif not args.silent:
                # nope - do output signatures. convert manifest to picklist,
                # apply.
                sub_picklist = sub_manifest.to_picklist()

                try:
                    idx = idx.select(picklist=sub_picklist)
                except ValueError:
                    error("** This input collection doesn't support 'grep' with picklists.")
                    error("** EXITING.")
                    error("**")
                    error("** You can use 'sourmash sig cat' with a picklist,")
                    error("** and then pipe the output to 'sourmash sig grep <pattern> -'")
                    sys.exit(-1)

                # save!
                for ss in idx.signatures():
                    save_sigs.add(ss)
        # done with the big loop over all indexes!

        # the match/no-match report only applies when signatures were
        # actually being collected; --silent/--count never add any.
        if not args.silent:
            notify(f"loaded {total_rows_examined} total that matched ksize & molecule type")

            if save_sigs:
                notify(f"extracted {len(save_sigs)} signatures from {len(args.signatures)} file(s)")
                save_sigs.close()
            else:
                error("no matching signatures found!")
                sys.exit(-1)

        if csv_obj is not None:
            notify(f"wrote manifest containing all matches to CSV file '{args.csv}'")
    finally:
        if csv_obj is not None:
            csv_obj.close()

    if picklist:
        sourmash_args.report_picklist(args, picklist)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.