Skip to content

Commit

Permalink
Merge pull request #343 from dib-lab/plot/random
Browse files Browse the repository at this point in the history
[MRG] plot a random subsample with 'sourmash plot --subsample'.
  • Loading branch information
ctb committed Sep 29, 2017
2 parents e76fcdf + c951907 commit d9ce80f
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 10 deletions.
2 changes: 2 additions & 0 deletions doc/command-line.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ Options::
--indices -- turn off index display on the plot.
--vmax -- maximum value (default 1.0) for heatmap.
--vmin -- minimum value (deafult 0.0) for heatmap.
--subsample=<N> -- plot a maximum of <N> samples, randomly chosen.
--subsample-seed=<seed> -- seed for pseudorandom number generator.

Example figures:

Expand Down
55 changes: 45 additions & 10 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ def compare(args):

# load in the various signatures
siglist = []
ksizes = set()
moltypes = set()
for filename in args.signatures:
notify('loading {}', filename, end='\r')
loaded = sig.load_signatures(filename, select_ksize=args.ksize,
Expand All @@ -319,26 +321,39 @@ def compare(args):
notify('\nwarning: no signatures loaded at given ksize/molecule type from {}', filename)
siglist.extend(loaded)

notify(' '*79, end='\r')
notify('loaded {} signatures total.'.format(len(siglist)))
# track ksizes/moltypes
for s in loaded:
ksizes.add(s.minhash.ksize)
moltypes.add(sourmash_args.get_moltype(s))

# error out while loading if we have more than one ksize/moltype
if len(ksizes) > 1 or len(moltypes) > 1:
break

# check ksizes and type
ksizes = set([s.minhash.ksize for s in siglist])
if len(ksizes) > 1:
error('multiple k-mer sizes loaded; please specify one with -k.')
ksizes = sorted(ksizes)
error('(saw k-mer sizes {})'.format(', '.join(map(str, ksizes))))
sys.exit(-1)

moltypes = set([sourmash_args.get_moltype(x) for x in siglist])
if len(moltypes) > 1:
error('multiple molecule types loaded; please specify --dna, --protein')
sys.exit(-1)

notify(' '*79, end='\r')
notify('loaded {} signatures total.'.format(len(siglist)))

# check to make sure they're potentially compatible - either using
# max_hash/scaled, or not.
scaled_sigs = [s.minhash.max_hash for s in siglist]
is_scaled = all(scaled_sigs)
is_scaled_2 = any(scaled_sigs)

# complain if it's not all one or the other
if is_scaled != is_scaled_2:
error('cannot mix scaled signatures with bounded signatures')
sys.exit(-1)

# if using --scaled, downsample appropriately
if is_scaled:
Expand Down Expand Up @@ -410,13 +425,20 @@ def plot(args):
# set up cmd line arguments
parser = argparse.ArgumentParser()
parser.add_argument('distances', help="output from 'sourmash compare'")
parser.add_argument('--pdf', action='store_true')
parser.add_argument('--labels', action='store_true')
parser.add_argument('--indices', action='store_false')
parser.add_argument('--pdf', action='store_true',
help='output PDF, not PNG.')
parser.add_argument('--labels', action='store_true',
help='show sample labels on dendrogram/matrix')
parser.add_argument('--indices', action='store_false',
help='show sample indices but not labels')
parser.add_argument('--vmax', default=1.0, type=float,
help='(default: %(default)f)')
help='upper limit of heatmap scale; (default: %(default)f)')
parser.add_argument('--vmin', default=0.0, type=float,
help='(default: %(default)f)')
help='lower limit of heatmap scale; (default: %(default)f)')
parser.add_argument("--subsample", type=int,
help="randomly downsample to this many samples, max.")
parser.add_argument("--subsample-seed", type=int, default=1,
help="random seed for --subsample; default=1")
args = parser.parse_args(args)

# load files
Expand Down Expand Up @@ -445,7 +467,20 @@ def plot(args):
ax1.set_xticks([])
ax1.set_yticks([])

Y = sch.linkage(D, method='single') # cluster!
# subsample?
if args.subsample:
numpy.random.seed(args.subsample_seed)

sample_idx = list(range(len(labeltext)))
numpy.random.shuffle(sample_idx)
sample_idx = sample_idx[:args.subsample]

np_idx = numpy.array(sample_idx)
D = D[numpy.ix_(np_idx, np_idx)]
labeltext = [ labeltext[idx] for idx in sample_idx ]

### do clustering
Y = sch.linkage(D, method='single')
Z1 = sch.dendrogram(Y, orientation='right', labels=labeltext)
fig.savefig(dendrogram_out)
notify('wrote dendrogram to: {}', dendrogram_out)
Expand Down
54 changes: 54 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,60 @@ def test_do_plot_comparison_3():
assert os.path.exists(os.path.join(location, "cmp.matrix.png"))


def test_plot_subsample_1():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
testdata4 = utils.get_test_data('genome-s10+s11.sig')
inp_sigs = [testdata1, testdata2, testdata3, testdata4]

status, out, err = utils.runscript('sourmash',
['compare'] + inp_sigs + \
['-o', 'cmp', '-k', '21', '--dna'],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['plot', 'cmp',
'--subsample', '3'],
in_directory=location)

print(out)

expected = """\
0\ts10+s11
1\tgenome-s12.fa.gz
2\tgenome-s10.fa.gz"""
assert expected in out


def test_plot_subsample_2():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
testdata3 = utils.get_test_data('genome-s12.fa.gz.sig')
testdata4 = utils.get_test_data('genome-s10+s11.sig')
inp_sigs = [testdata1, testdata2, testdata3, testdata4]

status, out, err = utils.runscript('sourmash',
['compare'] + inp_sigs + \
['-o', 'cmp', '-k', '21', '--dna'],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['plot', 'cmp',
'--subsample', '3',
'--subsample-seed=2'],
in_directory=location)

print(out)
expected = """\
0\tgenome-s12.fa.gz
1\ts10+s11
2\tgenome-s11.fa.gz"""
assert expected in out


def test_search_sig_does_not_exist():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down

0 comments on commit d9ce80f

Please sign in to comment.