Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] add ksize selectors back into sourmash sig functions #1105

Merged
merged 9 commits into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def merge(args):
notify('loading signatures from {}...', sigfile, end='\r')
this_n = 0
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress):
Expand Down Expand Up @@ -411,12 +412,20 @@ def intersect(args):

for sigfile in args.signatures:
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress):
if first_sig is None:
first_sig = sigobj
mins = set(sigobj.minhash.get_mins())
else:
# check signature compatibility --
try:
sigobj.minhash.count_common(first_sig.minhash)
except ValueError:
error('incompatible minhashes; specify -k and/or molecule type.')
sys.exit(-1)

mins.intersection_update(sigobj.minhash.get_mins())
total_loaded += 1
Expand Down Expand Up @@ -481,9 +490,15 @@ def subtract(args):
total_loaded = 0
for sigfile in args.subtraction_sigs:
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress):
try:
sigobj.minhash.count_common(from_mh)
except ValueError:
error('incompatible minhashes; specify -k and/or molecule type.')
sys.exit(-1)

if sigobj.minhash.track_abundance and not args.flatten:
error('Cannot use subtract on signatures with abundance tracking, sorry!')
Expand Down Expand Up @@ -523,6 +538,7 @@ def rename(args):
for filename in args.sigfiles:
debug('loading {}', filename)
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress)
Expand Down Expand Up @@ -550,6 +566,7 @@ def extract(args):
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress)
Expand Down Expand Up @@ -591,6 +608,7 @@ def filter(args):
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress)
Expand Down Expand Up @@ -648,6 +666,7 @@ def flatten(args):
total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
traverse=True,
progress=progress)
Expand Down
1 change: 1 addition & 0 deletions tests/test-data/2+63.fa.sig

Large diffs are not rendered by default.

192 changes: 164 additions & 28 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,15 @@ def test_sig_merge_1_multisig(c):
@utils.in_tempdir
def test_sig_merge_1_ksize_moltype(c):
# check ksize, moltype args
sig47 = utils.get_test_data('47.fa.sig')
sig2 = utils.get_test_data('2.fa.sig')
sig63 = utils.get_test_data('63.fa.sig')
sig47and63 = utils.get_test_data('47+63.fa.sig')
c.run_sourmash('sig', 'merge', sig47, sig63, '--dna', '-k', '31')
sig2and63 = utils.get_test_data('2+63.fa.sig')
c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31')

# stdout should be new signature
out = c.last_result.out

test_merge_sig = sourmash.load_one_signature(sig47and63)
test_merge_sig = sourmash.load_one_signature(sig2and63)
actual_merge_sig = sourmash.load_one_signature(out)

print(test_merge_sig.minhash)
Expand All @@ -113,6 +113,17 @@ def test_sig_merge_1_ksize_moltype(c):
assert actual_merge_sig.minhash == test_merge_sig.minhash


@utils.in_tempdir
def test_sig_merge_1_ksize_moltype_fail(c):
    # merging 2.fa.sig with 63.fa.sig *without* -k / moltype selection is
    # expected to fail: run_sourmash should raise ValueError.
    sig2 = utils.get_test_data('2.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')

    with pytest.raises(ValueError):
        c.run_sourmash('sig', 'merge', sig2, sig63)


@utils.in_tempdir
def test_sig_merge_2(c):
# merge of 47 with nothing should be 47
Expand Down Expand Up @@ -227,6 +238,25 @@ def test_sig_filter_3(c):
assert filtered_sig.minhash.get_mins(True) == abunds


@utils.in_tempdir
def test_sig_filter_3_ksize_select(c):
    # test abundance filtering combined with ksize selection
    sigfile = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
    c.run_sourmash('sig', 'filter', '-m', '2', sigfile, '-k', '31')

    # the filtered signature is written to stdout
    stdout = c.last_result.out

    filtered = sourmash.load_one_signature(stdout)
    reference = sourmash.load_one_signature(sigfile, ksize=31)

    # build the expected result by hand: keep hashes with abundance >= 2
    expected = {}
    for hashval, abund in reference.minhash.get_mins(True).items():
        if abund >= 2:
            expected[hashval] = abund
    assert expected

    assert filtered.minhash.get_mins(True) == expected


@utils.in_tempdir
def test_sig_merge_flatten(c):
# merge of 47 without abund, with 63 with, will succeed with --flatten
Expand Down Expand Up @@ -389,6 +419,62 @@ def test_sig_intersect_5(c):
c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63)


@utils.in_tempdir
def test_sig_intersect_6_ksize_fail(c):
    # 2.fa.sig contains multiple ksizes, so intersecting it with 47.fa.sig
    # with no ksize given on the command line is expected to fail.
    multi_k_path = utils.get_test_data('2.fa.sig')
    path47 = utils.get_test_data('47.fa.sig')

    with pytest.raises(ValueError):
        c.run_sourmash('sig', 'intersect', multi_k_path, path47)


@utils.in_tempdir
def test_sig_intersect_6_ksize_succeed(c):
    # 2.fa.sig contains multiple ksizes; passing -k 31 lets it be
    # intersected with 47.fa.sig.
    paths = [utils.get_test_data(name) for name in ('2.fa.sig', '47.fa.sig')]

    c.run_sourmash('sig', 'intersect', '-k', '31', *paths)

    stderr = c.last_result.err
    assert 'loaded and intersected 2 signatures' in stderr


@utils.in_tempdir
def test_sig_intersect_7(c):
    # intersecting a single signature with nothing else should give it back
    path47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'intersect', path47)

    # the resulting signature is written to stdout
    stdout = c.last_result.out

    expected = sourmash.load_one_signature(path47)
    actual = sourmash.load_one_signature(stdout)

    # debugging output
    for item in (expected.minhash, actual.minhash, stdout):
        print(item)

    assert actual.minhash == expected.minhash


@utils.in_tempdir
def test_sig_intersect_8_multisig(c):
    # intersecting all of the signatures in the multisig file should
    # leave nothing behind
    multisig = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'intersect', multisig)

    # the resulting signature is written to stdout
    result = sourmash.load_one_signature(c.last_result.out)

    assert len(result.minhash) == 0


@utils.in_tempdir
def test_sig_subtract_1(c):
# subtract of 63 from 47
Expand Down Expand Up @@ -445,36 +531,23 @@ def test_sig_subtract_3(c):


@utils.in_tempdir
def test_sig_intersect_2(c):
# intersect of 47 and nothing should be self
def test_sig_subtract_4_ksize_fail(c):
# subtract of 2 from 47 should fail without -k specified
sig47 = utils.get_test_data('47.fa.sig')
c.run_sourmash('sig', 'intersect', sig47)

# stdout should be new signature
out = c.last_result.out

test_intersect_sig = sourmash.load_one_signature(sig47)
actual_intersect_sig = sourmash.load_one_signature(out)

print(test_intersect_sig.minhash)
print(actual_intersect_sig.minhash)
print(out)
sig2 = utils.get_test_data('2.fa.sig')

assert actual_intersect_sig.minhash == test_intersect_sig.minhash
with pytest.raises(ValueError):
c.run_sourmash('sig', 'subtract', sig47, sig2)


@utils.in_tempdir
def test_sig_intersect_2_multisig(c):
# intersect of all the multisig stuff should be nothing
sig47 = utils.get_test_data('47+63-multisig.sig')
c.run_sourmash('sig', 'intersect', sig47)

# stdout should be new signature
out = c.last_result.out

actual_intersect_sig = sourmash.load_one_signature(out)
def test_sig_subtract_4_ksize_succeed(c):
# subtract of 2 from 47 should succeed with -k specified
sig47 = utils.get_test_data('47.fa.sig')
sig2 = utils.get_test_data('2.fa.sig')

assert not len(actual_intersect_sig.minhash)
c.run_sourmash('sig', 'subtract', sig47, sig2, '-k', '31')
assert 'loaded and subtracted 1 signatures' in c.last_result.err


@utils.in_tempdir
Expand Down Expand Up @@ -515,6 +588,24 @@ def test_sig_rename_1_multisig(c):
assert n == 9, n


@utils.in_tempdir
def test_sig_rename_1_multisig_ksize(c):
    # rename signatures from several files at once, restricted to k=31
    multisig = utils.get_test_data('47+63-multisig.sig')
    sig2 = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'rename', multisig, sig2, 'fiz bar', '-k', '31')

    # the renamed signatures are written to stdout
    renamed = sourmash.load_signatures(c.last_result.out)

    # every emitted signature must carry the new name; count them as we go
    count = 0
    for count, sig in enumerate(renamed, start=1):
        assert sig.name() == 'fiz bar'

    assert count == 7, count


@utils.in_tempdir
def test_sig_rename_2_output_to_same(c):
# change name of signature "in place", same output file
Expand Down Expand Up @@ -875,6 +966,36 @@ def test_sig_extract_6(c):
assert len(siglist) == 2


@utils.in_tempdir
def test_sig_extract_7(c):
    # extracting from 2.fa.sig with -k 31 should yield a single signature
    path2 = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'extract', path2, '-k', '31')

    # the extracted signatures are written to stdout
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 1


@utils.in_tempdir
def test_sig_extract_7_no_ksize(c):
    # without -k, extract should pass through all three signatures
    path2 = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'extract', path2)

    # the extracted signatures are written to stdout
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 3


@utils.in_tempdir
def test_sig_flatten_1(c):
# extract matches to several names from among several signatures & flatten
Expand All @@ -894,6 +1015,21 @@ def test_sig_flatten_1(c):
assert test_flattened.minhash == siglist[0].minhash


@utils.in_tempdir
def test_sig_flatten_2_ksize(c):
    # with -k 31, flatten should emit only the one selected signature
    sigfile = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
    c.run_sourmash('sig', 'flatten', sigfile, '-k', '31')

    # the flattened signatures are written to stdout
    flattened = list(sourmash.load_signatures(c.last_result.out))

    assert len(flattened) == 1


@utils.in_tempdir
def test_sig_downsample_1_scaled(c):
# downsample a scaled signature
Expand Down