Skip to content

Commit

Permalink
[MRG] Explore addition of a sketch command. (#1159)
Browse files Browse the repository at this point in the history
* add sourmash sketch command

* it's alive! sourmash sketch dna works!

* refactor make_minhashes code to allow more flexibility

* refactor internal compute code to better reflect reality

* update sketch dna as well

* pluralize signature factory and handling to get ready for sketch

* initial implementation with parameter strings

* add a variety of tests

* add sketch protein and sketch translate

* add --hp and --dayhoff to translate and protein

* add aa|prot and rna aliases for sketch command

* hacked and slashed ksize*3 :)

* refactor and simplify sketch code

* clean up command_sketch a bit

* provide better defaults for protein, translate

* allow param strings to set moltype and use moltype defaults

* add some tests for default parameters, etc

* refactoring of names

* check bad param string

* fix scaled/num checking and (some) associated tests

* do tests for multiple output sigs

* update protein/dayhoff/hp defaults

* add invalid parameter checking on dna/override protein, etc

* remove 10x tests, rename compute -> sketch for remaining

* add tests for incompatible sketch commands

* multiple ksizes within one param string

* cleanup of comments
  • Loading branch information
ctb authored Aug 14, 2020
1 parent 90db4c4 commit d921fc3
Show file tree
Hide file tree
Showing 9 changed files with 1,407 additions and 55 deletions.
2 changes: 2 additions & 0 deletions sourmash/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from . import lca
from . import sig
from . import sig as signature
from . import sketch
from . import storage


Expand Down Expand Up @@ -92,6 +93,7 @@ def parse_args(self, args=None, namespace=None):
def get_parser():
module_descs = {
'lca': 'Taxonomic operations',
'sketch': 'Create signatures',
'sig': 'Manipulate signature files',
'storage': 'Operations on storage',
}
Expand Down
35 changes: 35 additions & 0 deletions sourmash/cli/sketch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Define the command line interface for sourmash sketch
The top level CLI is defined in ../__init__.py. This module defines the CLI for
`sourmash sketch` operations.
"""

from . import dna
from . import dna as rna
from . import protein
from . import protein as aa
from . import protein as prot
from . import translate
from ..utils import command_list
from argparse import SUPPRESS, RawDescriptionHelpFormatter
import os
import sys


def subparser(subparsers):
    """Attach the ``sketch`` command and its subcommands to *subparsers*.

    Subcommand names come from ``command_list`` applied to this package's
    directory. Each name is looked up as an attribute of this module: its
    ``__doc__`` supplies the one-line summary in the overview text, and its
    own ``subparser`` function is called to register the subcommand.
    """
    this_module = sys.modules[__name__]
    sketch_parser = subparsers.add_parser(
        'sketch', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS
    )

    ops = command_list(os.path.dirname(__file__))

    # One overview line per subcommand: the help invocation, then its summary.
    overview = ['Operations\n']
    for name in ops:
        summary = getattr(this_module, name).__doc__
        hint = 'sourmash sketch {op:s} --help'.format(op=name)
        overview.append(' {hs:33s} {ds:s}\n'.format(hs=hint, ds=summary))

    commands = sketch_parser.add_subparsers(
        title='Create signatures', dest='subcmd', metavar='subcmd',
        help=SUPPRESS, description=''.join(overview),
    )
    for name in ops:
        getattr(this_module, name).subparser(commands)

    # Reverse the order of the parser's action groups and retitle the
    # optionals group.
    sketch_parser._action_groups.reverse()
    sketch_parser._optionals.title = 'Options'
61 changes: 61 additions & 0 deletions sourmash/cli/sketch/dna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""create DNA signatures"""

import csv

import sourmash
from sourmash.logging import notify, print_results, error


def subparser(subparsers):
    """Register the ``dna`` subcommand (alias ``rna``) on *subparsers*.

    Arguments are declared as ``(flags, keyword-arguments)`` pairs and added
    in order: first the general options and the positional ``filenames``,
    then the options of the 'File handling options' group.
    """
    dna_parser = subparsers.add_parser('dna', aliases=['rna'])

    general = [
        (('--license',), {
            'default': 'CC0', 'type': str,
            'help': 'signature license. Currently only CC0 is supported.',
        }),
        (('--check-sequence',), {
            'action': 'store_true',
            'help': 'complain if input sequence is invalid',
        }),
        (('-p', '--param-string'), {
            'default': [], 'action': 'append',
            'help': 'signature parameters to use.',
        }),
        (('filenames',), {
            'nargs': '+', 'help': 'file(s) of sequences',
        }),
    ]
    for flags, options in general:
        dna_parser.add_argument(*flags, **options)

    file_group = dna_parser.add_argument_group('File handling options')
    file_handling = [
        (('-f', '--force'), {
            'action': 'store_true',
            'help': 'recompute signatures even if the file exists',
        }),
        (('-o', '--output'), {
            'help': 'output computed signatures to this file',
        }),
        (('--merge', '--name'), {
            'type': str, 'default': '', 'metavar': "FILE",
            'help': 'merge all input files into one signature file with the '
                    'specified name',
        }),
        (('--outdir',), {
            'help': 'output computed signatures to this directory',
        }),
        (('--singleton',), {
            'action': 'store_true',
            'help': 'compute a signature for each sequence record individually',
        }),
        (('--name-from-first',), {
            'action': 'store_true',
            'help': 'name the signature generated from each file after the first '
                    'record in the file',
        }),
        (('--randomize',), {
            'action': 'store_true',
            'help': 'shuffle the list of input filenames randomly',
        }),
    ]
    for flags, options in file_handling:
        file_group.add_argument(*flags, **options)


def main(args):
    """Run ``sketch dna`` by delegating to ``sourmash.command_sketch.dna``.

    The implementation module is imported at call time rather than when this
    CLI module is loaded.
    """
    from sourmash import command_sketch
    return command_sketch.dna(args)
69 changes: 69 additions & 0 deletions sourmash/cli/sketch/protein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""create protein signatures"""

import csv

import sourmash
from sourmash.logging import notify, print_results, error


def subparser(subparsers):
    """Register the ``protein`` subcommand (aliases ``aa``, ``prot``).

    Adds the general options (``--license``, ``--check-sequence``,
    ``-p/--param-string``), the positional ``filenames``, and a
    'File handling options' group that also carries the ``--dayhoff`` and
    ``--hp`` switches.

    Args:
        subparsers: the object returned by ``ArgumentParser.add_subparsers``
            on which the new subcommand parser is created.
    """
    parser = subparsers.add_parser('protein', aliases=['aa', 'prot'])
    parser.add_argument(
        '--license', default='CC0', type=str,
        help='signature license. Currently only CC0 is supported.'
    )
    parser.add_argument(
        '--check-sequence', action='store_true',
        help='complain if input sequence is invalid'
    )
    # May be given several times; each occurrence is appended to the list.
    parser.add_argument(
        '-p', '--param-string', default=[],
        help='signature parameters to use.', action='append',
    )

    parser.add_argument(
        'filenames', nargs='+', help='file(s) of sequences'
    )

    file_args = parser.add_argument_group('File handling options')
    file_args.add_argument(
        '-f', '--force', action='store_true',
        help='recompute signatures even if the file exists'
    )
    file_args.add_argument(
        '-o', '--output',
        help='output computed signatures to this file'
    )
    # '--name' is a second spelling of '--merge'; both store into args.merge.
    file_args.add_argument(
        '--merge', '--name', type=str, default='', metavar="FILE",
        help='merge all input files into one signature file with the '
             'specified name'
    )
    file_args.add_argument(
        '--outdir', help='output computed signatures to this directory'
    )
    file_args.add_argument(
        '--singleton', action='store_true',
        help='compute a signature for each sequence record individually'
    )
    file_args.add_argument(
        '--name-from-first', action='store_true',
        help='name the signature generated from each file after the first '
             'record in the file'
    )
    file_args.add_argument(
        '--randomize', action='store_true',
        help='shuffle the list of input filenames randomly'
    )
    file_args.add_argument(
        '--dayhoff', action='store_true',
        help='compute sketches using the dayhoff alphabet instead'
    )
    # Fixed: this help text previously repeated the --dayhoff description.
    file_args.add_argument(
        '--hp', action='store_true',
        help='compute sketches using the hp alphabet instead'
    )


def main(args):
    """Run ``sketch protein`` by delegating to ``sourmash.command_sketch.protein``.

    The implementation module is imported at call time rather than when this
    CLI module is loaded.
    """
    from sourmash import command_sketch
    return command_sketch.protein(args)
69 changes: 69 additions & 0 deletions sourmash/cli/sketch/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""create protein signature from DNA/RNA sequence"""

import csv

import sourmash
from sourmash.logging import notify, print_results, error


def subparser(subparsers):
    """Register the ``translate`` subcommand and its arguments.

    Adds the general options (``--license``, ``--check-sequence``,
    ``-p/--param-string``), the positional ``filenames``, and a
    'File handling options' group that also carries the ``--dayhoff`` and
    ``--hp`` switches.

    Args:
        subparsers: the object returned by ``ArgumentParser.add_subparsers``
            on which the new subcommand parser is created.
    """
    parser = subparsers.add_parser('translate')
    parser.add_argument(
        '--license', default='CC0', type=str,
        help='signature license. Currently only CC0 is supported.'
    )
    parser.add_argument(
        '--check-sequence', action='store_true',
        help='complain if input sequence is invalid'
    )
    # May be given several times; each occurrence is appended to the list.
    parser.add_argument(
        '-p', '--param-string', default=[],
        help='signature parameters to use.', action='append',
    )

    parser.add_argument(
        'filenames', nargs='+', help='file(s) of sequences'
    )

    file_args = parser.add_argument_group('File handling options')
    file_args.add_argument(
        '-f', '--force', action='store_true',
        help='recompute signatures even if the file exists'
    )
    file_args.add_argument(
        '-o', '--output',
        help='output computed signatures to this file'
    )
    # '--name' is a second spelling of '--merge'; both store into args.merge.
    file_args.add_argument(
        '--merge', '--name', type=str, default='', metavar="FILE",
        help='merge all input files into one signature file with the '
             'specified name'
    )
    file_args.add_argument(
        '--outdir', help='output computed signatures to this directory'
    )
    file_args.add_argument(
        '--singleton', action='store_true',
        help='compute a signature for each sequence record individually'
    )
    file_args.add_argument(
        '--name-from-first', action='store_true',
        help='name the signature generated from each file after the first '
             'record in the file'
    )
    file_args.add_argument(
        '--randomize', action='store_true',
        help='shuffle the list of input filenames randomly'
    )
    file_args.add_argument(
        '--dayhoff', action='store_true',
        help='compute sketches using the dayhoff alphabet instead'
    )
    # Fixed: this help text previously repeated the --dayhoff description.
    file_args.add_argument(
        '--hp', action='store_true',
        help='compute sketches using the hp alphabet instead'
    )


def main(args):
    """Run ``sketch translate`` by delegating to ``sourmash.command_sketch.translate``.

    The implementation module is imported at call time rather than when this
    CLI module is loaded.
    """
    from sourmash import command_sketch
    return command_sketch.translate(args)
Loading

0 comments on commit d921fc3

Please sign in to comment.