diff --git a/requirements.txt b/requirements.txt index 18a36ed08b..5d4c9724fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ sphinx alabaster recommonmark sphinxcontrib-napoleon +pathos +bamnostic>=0.9.2 \ No newline at end of file diff --git a/sourmash/commands.py b/sourmash/commands.py index 34e52b239b..e5ebc32c46 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -2,6 +2,9 @@ import argparse import csv +import itertools +import multiprocessing +import pathos.multiprocessing as mp import os import os.path import sys @@ -13,6 +16,7 @@ from . import sourmash_args from .logging import notify, error, print_results, set_quiet from .sbtmh import SearchMinHashesFindBest, SigLeaf +from .tenx import read_10x_folder from .sourmash_args import DEFAULT_LOAD_K DEFAULT_COMPUTE_K = '21,31,51' @@ -21,6 +25,8 @@ WATERMARK_SIZE = 10000 + + def info(args): "Report sourmash version + version of installed dependencies." parser = argparse.ArgumentParser() @@ -43,7 +49,6 @@ def info(args): notify('screed version {}', screed.__version__) notify('- loaded from path: {}', os.path.dirname(screed.__file__)) - def compute(args): """Compute the signature for one or more files. @@ -85,6 +90,10 @@ def compute(args): help="merge all input files into one signature named this") parser.add_argument('--name-from-first', action='store_true', help="name the signature generated from each file after the first record in the file (default: False)") + parser.add_argument('--input-is-10x', action='store_true', + help="Input is 10x single cell output folder (default: False)") + parser.add_argument('-p', '--processes', default=2, type=int, + help='Number of processes to use for reading 10x bam file') parser.add_argument('--track-abundance', action='store_true', help='track k-mer abundances in the generated signature (default: False)') parser.add_argument('--scaled', type=float, default=0, @@ -211,6 +220,26 @@ def save_siglist(siglist, output_fp, filename=None): sig.save_signatures(siglist, fp) notify('saved {} signature(s). Note: signature license is CC0.'.format(len(siglist))) + def maybe_add_barcode(barcode, cell_seqs): + if barcode not in cell_seqs: + cell_seqs[barcode] = make_minhashes() + + def maybe_add_alignment(alignment, cell_seqs, args, barcodes): + high_quality_mapping = alignment.mapq == 255 + good_barcode = 'CB' in alignment.tags and \ + alignment.get_tag('CB') in barcodes + good_umi = 'UB' in alignment.tags + + pass_qc = high_quality_mapping and good_barcode and \ + good_umi + if pass_qc: + barcode = alignment.get_tag('CB') + # if this isn't marked a duplicate, count it as a UMI + if not alignment.is_duplicate: + maybe_add_barcode(barcode, cell_seqs) + add_seq(cell_seqs[barcode], alignment.seq, + args.input_is_protein, args.check_sequence) + if args.track_abundance: notify('Tracking abundance of input k-mers.') @@ -237,6 +266,31 @@ def save_siglist(siglist, output_fp, filename=None): notify('calculated {} signatures for {} sequences in {}', len(siglist), n + 1, filename) + elif args.input_is_10x: + barcodes, bam_file = read_10x_folder(filename) + manager = multiprocessing.Manager() + + cell_seqs = manager.dict() + + notify('... reading sequences from {}', filename) + + pool = mp.Pool(processes=args.processes) + pool.map(lambda x: maybe_add_alignment(x, cell_seqs, args, barcodes), bam_file) + # for n, alignment in enumerate(bam_file): + # if n % 10000 == 0: + # if n: + # notify('\r...{} {}', filename, n, end='') + # maybe_add_alignment(alignment, cell_seqs) + + cell_signatures = [ + build_siglist(seqs, filename=filename, name=barcode) + for barcode, seqs in cell_seqs.items()] + sigs = list(itertools.chain(*cell_signatures)) + if args.output: + siglist += sigs + else: + siglist = sigs + else: # make minhashes for the whole file Elist = make_minhashes() diff --git a/sourmash/tenx.py b/sourmash/tenx.py new file mode 100644 index 0000000000..9fde7374f8 --- /dev/null +++ b/sourmash/tenx.py @@ -0,0 +1,19 @@ +import os +import bamnostic as bs + + +def read_single_column(filename): + """Read single-column barcodes.tsv and genes.tsv files from 10x""" + with open(filename) as f: + lines = set(line.strip() for line in f) + return lines + + +def read_10x_folder(folder): + """Get QC-pass barcodes, genes, and bam file from a 10x folder""" + barcodes = read_single_column(os.path.join(folder, 'barcodes.tsv')) + + bam_file = bs.AlignmentFile( + os.path.join(folder, 'possorted_genome_bam.bam'), mode='rb') + + return barcodes, bam_file diff --git a/tests/test-data/10x-example/barcodes.tsv b/tests/test-data/10x-example/barcodes.tsv new file mode 100644 index 0000000000..e1df0e133d --- /dev/null +++ b/tests/test-data/10x-example/barcodes.tsv @@ -0,0 +1,625 @@ +AAACGGGAGGATATAC-1 +AAACGGGTCTCGTATT-1 +AAAGATGCAGATCTGT-1 +AAATGCCAGATAGTCA-1 +AAATGCCCAAACTGCT-1 +AAATGCCGTGAACCTT-1 +AACACGTAGTGTACCT-1 +AACACGTGTGGCTCCA-1 +AACCATGAGTTGTCGT-1 +AACCATGCACGTCAGC-1 +AACCATGGTAATAGCA-1 +AACCATGTCTGTTGAG-1 +AACCGCGGTTACGGAG-1 +AACCGCGTCCTTGCCA-1 +AACGTTGGTAGCTCCG-1 +AACGTTGTCAACACAC-1 +AACTCAGCATTTCACT-1 +AACTCCCTCTTATCTG-1 +AACTCTTGTTCCAACA-1 +AACTCTTTCTTGTTTG-1 +AACTGGTGTTCAGTAC-1 +AACTTTCAGAGCTGCA-1 +AAGCCGCTCAGTTCGA-1 +AAGGAGCCACTAGTAC-1 +AAGGAGCCAGGAATCG-1 +AAGGCAGTCTGCTTGC-1 +AAGGTTCTCAATCACG-1 +AATCCAGGTACCGGCT-1 +AATCGGTGTCGCATAT-1 +ACACCGGCAAACCTAC-1 +ACACCGGCATCCTAGA-1 +ACACTGAAGACCTAGG-1 +ACAGCCGGTAATTGGA-1 +ACAGCCGTCACATAGC-1 +ACAGCCGTCGACAGCC-1 +ACAGCTACAAAGAATC-1 +ACAGCTAGTTCATGGT-1 +ACAGCTAGTTGGAGGT-1 +ACAGCTAGTTTAGCTG-1 +ACATACGAGAGTTGGC-1 +ACATACGCAGCATGAG-1 +ACATCAGGTTTCGCTC-1 +ACCAGTATCATCGGAT-1 +ACCAGTATCCCGACTT-1 +ACCCACTGTCTAAACC-1 +ACGAGGACAAGCCCAC-1 +ACGATGTGTAGGCATG-1 +ACGCAGCCAGGGTACA-1 +ACGCCAGAGGCTCATT-1 +ACGCCAGTCGATGAGG-1 +ACGCCGACAATAAGCA-1 +ACGGAGAAGTTAGCGG-1 +ACGGAGAGTGAGTGAC-1 +ACGGCCAAGACAAGCC-1 +ACGGCCACAATGGTCT-1 +ACGGGCTCAGTCGTGC-1 +ACGGGTCAGCTAAACA-1 +ACGGGTCGTCGAACAG-1 +ACGGGTCTCGTAGATC-1 +ACGTCAAAGGGCATGT-1 +ACGTCAAGTTCAACCA-1 +ACTATCTGTGTAATGA-1 +ACTATCTTCCCGGATG-1 +ACTGAACAGCCCGAAA-1 +ACTGAACGTTCCCGAG-1 +ACTGAACTCGTGTAGT-1 +ACTGATGAGTGGCACA-1 +ACTGATGCACGGCTAC-1 +ACTGATGTCATAAAGG-1 +ACTGCTCCAAGTACCT-1 +ACTGCTCCACAAGCCC-1 +ACTGTCCAGTGTACGG-1 +ACTTACTCAGCTCGAC-1 +ACTTGTTCATATGGTC-1 +ACTTGTTGTAAACCTC-1 +ACTTTCAGTTGACGTT-1 +ACTTTCAGTTTGACTG-1 +AGAATAGTCCTAGAAC-1 +AGACGTTGTCCGAGTC-1 +AGAGCGACAAGCGTAG-1 +AGAGCGATCTGCCAGG-1 +AGAGTGGAGATCCCGC-1 +AGAGTGGAGTAGTGCG-1 +AGAGTGGTCATGTCCC-1 +AGAGTGGTCGAACTGT-1 +AGAGTGGTCTGATTCT-1 +AGATCTGGTCTTCGTC-1 +AGATCTGTCACGCATA-1 +AGATTGCAGAGCAATT-1 +AGCATACCAAGAGTCG-1 +AGCCTAAAGAAGGGTA-1 +AGCGGTCCATGGTCAT-1 +AGCTCCTAGCAGGCTA-1 +AGCTCCTTCCGCGTTT-1 +AGCTCCTTCCTTAATC-1 +AGCTCTCTCGATGAGG-1 +AGCTTGAAGCCGATTT-1 +AGGGAGTCAACTGCGC-1 +AGGGAGTTCCGCGCAA-1 +AGGGATGGTCTAACGT-1 +AGGGATGTCCCGGATG-1 +AGGTCATAGCGTCTAT-1 +AGGTCATTCACGCGGT-1 +AGGTCCGCAACAACCT-1 +AGGTCCGCACTTAAGC-1 +AGGTCCGCATCGGACC-1 +AGGTCCGTCATCGGAT-1 +AGTCTTTCAATGAATG-1 +AGTCTTTGTCTCTCGT-1 +AGTCTTTTCATGTGGT-1 +AGTGAGGGTTACCGAT-1 +AGTGTCATCTTCAACT-1 +AGTTGGTGTAGAGTGC-1 +ATAACGCTCCCATTTA-1 +ATAAGAGAGCACACAG-1 +ATAAGAGCAGCTATTG-1 +ATAAGAGGTCCGTTAA-1 +ATAAGAGTCGCCAGCA-1 +ATCATCTGTTCGGCAC-1 +ATCATCTTCCTTAATC-1 +ATCCACCGTGTATGGG-1 +ATCCGAAAGCCTTGAT-1 +ATCCGAAGTGAAATCA-1 +ATCGAGTGTACCGCTG-1 +ATCTGCCTCCCTAACC-1 +ATGCGATCACATTAGC-1 +ATGGGAGAGCAAATCA-1 +ATTACTCGTATGCTTG-1 +ATTATCCAGGAATCGC-1 +ATTATCCAGGCTCTTA-1 +ATTATCCGTCGTTGTA-1 +ATTCTACAGATCACGG-1 +ATTGGACGTCATATGC-1 +ATTTCTGCAGCCTTTC-1 +ATTTCTGGTAGTGAAT-1 +CAACCAATCATGTCCC-1 +CAACTAGCAGCCTGTG-1 +CAAGAAATCAACGAAA-1 +CAAGAAATCAGTTCGA-1 +CAAGATCAGCAATATG-1 +CAAGTTGCAGCTTCGG-1 +CAAGTTGGTCTACCTC-1 +CACAAACGTGCGATAG-1 +CACACAAAGGATGTAT-1 +CACACCTCAGTCAGCC-1 +CACACCTTCCAAAGTC-1 +CACAGGCCACACTGCG-1 +CACATAGCAATAGCGG-1 +CACATAGTCGTCCAGG-1 +CACCTTGAGTTCCACA-1 +CACCTTGGTGTCAATC-1 +CACCTTGTCAGTTTGG-1 +CAGAATCAGGCAGTCA-1 +CAGAGAGCATCGGACC-1 +CAGCAGCAGGGTGTGT-1 +CAGCAGCGTCCTCTTG-1 +CAGCATAGTGCATCTA-1 +CAGCGACTCATTTGGG-1 +CAGTAACAGATAGTCA-1 +CAGTAACAGGATCGCA-1 +CAGTCCTTCCAAAGTC-1 +CAGTCCTTCTAACTGG-1 +CATATGGAGTTTCCTT-1 +CATATGGCACAGGTTT-1 +CATATGGGTGTTTGTG-1 +CATATGGTCGTGGACC-1 +CATATTCGTCGAATCT-1 +CATCCACAGATGGCGT-1 +CATCCACTCGGTGTCG-1 +CATCGGGCAGATGAGC-1 +CATCGGGTCAGCTGGC-1 +CATCGGGTCTCGCATC-1 +CATGACAGTATAAACG-1 +CATGGCGGTGCGATAG-1 +CCAATCCAGCAAATCA-1 +CCACCTAAGGCCCGTT-1 +CCACGGAGTAGAAGGA-1 +CCACGGATCAGCTCGG-1 +CCACTACTCTTGTTTG-1 +CCATGTCCACGGTTTA-1 +CCATGTCGTTCACGGC-1 +CCCATACAGCTTATCG-1 +CCGGTAGCAGTCAGAG-1 +CCGTACTCACTCGACG-1 +CCGTACTCATAGGATA-1 +CCGTACTGTCAGATAA-1 +CCGTACTGTCCCGACA-1 +CCGTACTTCACGGTTA-1 +CCGTGGAAGACGCACA-1 +CCGTGGATCGCGCCAA-1 +CCGTTCAAGTACGACG-1 +CCGTTCACATAGTAAG-1 +CCTAAAGAGGGAAACA-1 +CCTAAAGTCGCTGATA-1 +CCTACACCAACCGCCA-1 +CCTAGCTAGCACCGCT-1 +CCTAGCTGTGACTCAT-1 +CCTAGCTGTTCTGGTA-1 +CCTAGCTTCTTGTTTG-1 +CCTATTACAGCCACCA-1 +CCTCAGTTCTTAGAGC-1 +CCTCTGAAGCCACCTG-1 +CCTTCCCAGGACCACA-1 +CCTTCGAAGCCGTCGT-1 +CCTTTCTAGAGCCTAG-1 +CGAATGTGTCCGTGAC-1 +CGACCTTAGATAGGAG-1 +CGACCTTCACTGTGTA-1 +CGACCTTCAGACGCTC-1 +CGACCTTGTTCCCGAG-1 +CGAGAAGCAGGCAGTA-1 +CGAGAAGCATGGTAGG-1 +CGAGCCACATTGCGGC-1 +CGATCGGAGTGGGCTA-1 +CGATGGCGTAAGGATT-1 +CGATGGCGTAATCACC-1 +CGATGTACATATGCTG-1 +CGATTGATCTGGTTCC-1 +CGCGGTAAGAGGACGG-1 +CGCGTTTGTCCAGTAT-1 +CGCTATCAGAGTAATC-1 +CGCTGGACATGATCCA-1 +CGGACACAGCGCTTAT-1 +CGGACACAGTGTTTGC-1 +CGGACACCAGCTCGAC-1 +CGGACGTAGGCTCATT-1 +CGGAGTCCATGAGCGA-1 +CGGAGTCTCCCTTGTG-1 +CGGCTAGGTATCTGCA-1 +CGGCTAGTCCACGCAG-1 +CGGCTAGTCGCTGATA-1 +CGGGTCACACGCTTTC-1 +CGGGTCATCCCAAGTA-1 +CGTAGCGGTAGCCTAT-1 +CGTAGGCCAAGCGATG-1 +CGTCACTGTTCGTTGA-1 +CGTCAGGGTATCTGCA-1 +CGTCCATCACTAAGTC-1 +CGTCTACAGGGCACTA-1 +CGTGAGCGTACACCGC-1 +CGTGTAACAGGCAGTA-1 +CGTGTAAGTTTGGGCC-1 +CGTGTCTCAGTATCTG-1 +CGTTGGGCAAAGAATC-1 +CGTTGGGCACCGAAAG-1 +CGTTGGGGTAGCTTGT-1 +CGTTGGGGTCATCGGC-1 +CTAACTTAGTTGTAGA-1 +CTAACTTGTTGCGTTA-1 +CTAATGGTCGGCGCAT-1 +CTACACCTCCGTCAAA-1 +CTACACCTCTGCGGCA-1 +CTACCCAAGTATGACA-1 +CTACCCAGTAAAGTCA-1 +CTACCCATCAAGGTAA-1 +CTACCCATCCTGTAGA-1 +CTACCCATCTGGAGCC-1 +CTACGTCTCACTTCAT-1 +CTAGAGTTCACTCTTA-1 +CTAGCCTAGTCTTGCA-1 +CTAGCCTCAAGCGTAG-1 +CTAGTGAGTCGAATCT-1 +CTAGTGAGTTCCTCCA-1 +CTCACACAGCGTTGCC-1 +CTCACACCATATACCG-1 +CTCAGAAAGTTTCCTT-1 +CTCATTACAATGGTCT-1 +CTCATTATCCACGAAT-1 +CTCATTATCGTTTATC-1 +CTCCTAGAGAGGACGG-1 +CTCGAGGAGACAGACC-1 +CTCGAGGAGCTCCCAG-1 +CTCGAGGAGCTGCAAG-1 +CTCGGAGAGCAGCCTC-1 +CTCGGGAAGCTGATAA-1 +CTCGGGAGTATTAGCC-1 +CTCGTCAAGTCATGCT-1 +CTCGTCACAAATTGCC-1 +CTCGTCACAGGGTACA-1 +CTCGTCAGTTTGGGCC-1 +CTCGTCATCAGAGGTG-1 +CTCTAATCAGCTATTG-1 +CTCTAATTCGTGGGAA-1 +CTCTACGCATGCATGT-1 +CTCTGGTAGACCTAGG-1 +CTCTGGTGTTTGGGCC-1 +CTGATAGAGCGTCAAG-1 +CTGATAGCAGTGAGTG-1 +CTGATAGGTTGCTCCT-1 +CTGATAGTCCTGCCAT-1 +CTGATCCGTTGCGCAC-1 +CTGCCTAGTGTGCGTC-1 +CTGCCTATCGGGAGTA-1 +CTGGTCTCACAGACTT-1 +CTGGTCTGTCATCGGC-1 +CTGTTTACAAATACAG-1 +CTGTTTAGTATTACCG-1 +CTGTTTAGTGTTAAGA-1 +CTGTTTAGTTTAGCTG-1 +CTTAACTGTTTGTTTC-1 +CTTAACTTCTCAAGTG-1 +CTTACCGAGAAGCCCA-1 +CTTACCGCAAGCGTAG-1 +CTTACCGGTTTACTCT-1 +CTTCTCTCAGCTCGAC-1 +CTTGGCTTCCTCGCAT-1 +CTTTGCGAGCAATATG-1 +CTTTGCGCACTATCTT-1 +GAAACTCCAACGATCT-1 +GAAACTCCACTGTGTA-1 +GAAACTCTCTTTAGGG-1 +GAAATGACATTACCTT-1 +GAAATGAGTCGAAAGC-1 +GAACATCGTAGAAGGA-1 +GAACATCTCTTGAGGT-1 +GAACGGAGTTTGACAC-1 +GAAGCAGAGTACCGGA-1 +GAATAAGGTTATCGGT-1 +GAATGAAGTGCCTTGG-1 +GACAGAGAGAAAGTGG-1 +GACCAATAGTACGACG-1 +GACCAATTCTGCCCTA-1 +GACCTGGAGCTTCGCG-1 +GACCTGGAGTGATCGG-1 +GACGGCTAGACGCACA-1 +GACGGCTCAGATCCAT-1 +GACGGCTGTGGCTCCA-1 +GACGTGCGTAAGGGCT-1 +GACGTGCTCAGCACAT-1 +GACGTTACATCGATGT-1 +GACTAACAGCCACGTC-1 +GACTGCGAGGTCATCT-1 +GAGGTGAAGGGCTTGA-1 +GAGGTGAAGTAGCGGT-1 +GAGGTGAGTAGCGTGA-1 +GAGTCCGTCAGCTCTC-1 +GAGTCCGTCGGAGCAA-1 +GATCAGTCACACAGAG-1 +GATCAGTGTATTAGCC-1 +GATCGATAGACAAGCC-1 +GATCGATAGTGTACTC-1 +GATCGATCAATCGGTT-1 +GATCGATTCTGCTTGC-1 +GATCGCGGTAGCACGA-1 +GATCTAGTCGAGAACG-1 +GATGAAAGTGGAAAGA-1 +GATGAAATCGCCGTGA-1 +GATGAGGCATTCTTAC-1 +GATTCAGGTTCGGGCT-1 +GCAAACTGTACGAAAT-1 +GCAAACTTCGACAGCC-1 +GCAATCAAGATTACCC-1 +GCACATAAGGACCACA-1 +GCACTCTAGACAGACC-1 +GCACTCTTCCAAATGC-1 +GCAGTTATCTTGTTTG-1 +GCATACAAGACTTTCG-1 +GCATACAGTCACAAGG-1 +GCATGATGTCGGCACT-1 +GCATGCGAGCGTAGTG-1 +GCATGCGGTGGCCCTA-1 +GCATGTAAGTGCAAGC-1 +GCATGTACATAAGACA-1 +GCCAAATTCCTGCTTG-1 +GCCTCTACACATCCAA-1 +GCCTCTACATAGTAAG-1 +GCCTCTAGTTCCTCCA-1 +GCGACCAAGCACACAG-1 +GCGAGAACACATCTTT-1 +GCGAGAATCGGCATCG-1 +GCGCAACAGACTGGGT-1 +GCGCAACTCAAGATCC-1 +GCGCAGTCAACTGCGC-1 +GCGCAGTCACATCCAA-1 +GCGCCAAAGGCTATCT-1 +GCGGGTTTCAGGATCT-1 +GCTCTGTTCCACTCCA-1 +GCTGCAGTCCAAGCCG-1 +GCTGCGACAGCTTCGG-1 +GCTGCGACAGTAAGCG-1 +GCTGCTTAGCCAACAG-1 +GCTGCTTCAAACGTGG-1 +GCTGCTTTCATGCTCC-1 +GCTGGGTGTCGTCTTC-1 +GCTGGGTGTTCGCTAA-1 +GCTGGGTTCATATCGG-1 +GCTTCCAGTTCCGGCA-1 +GCTTGAAAGTTAGCGG-1 +GCTTGAAGTACGAAAT-1 +GCTTGAATCATAAAGG-1 +GGAAAGCGTGGCTCCA-1 +GGAATAATCCCGACTT-1 +GGACAAGGTATAGGTA-1 +GGACAGACATCGGGTC-1 +GGACAGATCCTTAATC-1 +GGATGTTAGGGAACGG-1 +GGATGTTTCTACCAGA-1 +GGATTACTCCAGAGGA-1 +GGATTACTCTGCTTGC-1 +GGCAATTGTAAAGTCA-1 +GGCAATTTCTTCTGGC-1 +GGCAATTTCTTGACGA-1 +GGCGACTGTCATCCCT-1 +GGCGACTTCCACGTTC-1 +GGCTCGAAGGTGATTA-1 +GGCTCGAGTTAGGGTG-1 +GGCTCGATCTGCGTAA-1 +GGCTGGTTCGCACTCT-1 +GGGAATGCAAGGTTTC-1 +GGGAATGCATACGCTA-1 +GGGAATGGTACCGTTA-1 +GGGAATGTCATATCGG-1 +GGGAATGTCGTTACGA-1 +GGGAGATGTGTGACGA-1 +GGGAGATTCGTCACGG-1 +GGGATGACAGCCAATT-1 +GGGATGAGTCTCCCTA-1 +GGGCACTTCGAGGTAG-1 +GGGTCTGCACGTAAGG-1 +GGGTTGCTCTCGTATT-1 +GGGTTGCTCTCTAAGG-1 +GGTATTGAGCCGGTAA-1 +GGTATTGCAATAGCAA-1 +GGTATTGGTCTGATTG-1 +GGTGTTAAGGACATTA-1 +GTAACGTAGATAGGAG-1 +GTAACTGAGCGAGAAA-1 +GTAACTGGTGTGTGCC-1 +GTAACTGTCAAACCGT-1 +GTAACTGTCGGAGGTA-1 +GTACGTAGTTAAGATG-1 +GTACTCCAGACCACGA-1 +GTACTCCAGCAGACTG-1 +GTACTCCCAGTACACT-1 +GTACTTTAGAGTGACC-1 +GTAGGCCAGAATCTCC-1 +GTAGGCCCATCCCATC-1 +GTATTCTCACCAACCG-1 +GTATTCTGTCCAGTTA-1 +GTATTCTGTTATGTGC-1 +GTATTCTGTTGTTTGG-1 +GTATTCTTCTTCTGGC-1 +GTCAAGTAGCTAAACA-1 +GTCAAGTAGTTTCCTT-1 +GTCAAGTTCGAATGCT-1 +GTCACAACACGTCAGC-1 +GTCACAATCTTGAGGT-1 +GTCATTTTCTGGCGTG-1 +GTCGGGTAGAGACTAT-1 +GTCGGGTCACACATGT-1 +GTCGTAAAGCTAGGCA-1 +GTCGTAACACACAGAG-1 +GTCGTAAGTCTAACGT-1 +GTCGTAAGTGAGGGTT-1 +GTCTCGTAGTCAAGCG-1 +GTCTCGTAGTGCTGCC-1 +GTCTCGTCATGTCGAT-1 +GTCTTCGGTCAAAGCG-1 +GTGCAGCCAAGTCTGT-1 +GTGCATAAGGGAAACA-1 +GTGCATAAGTGTCCAT-1 +GTGCATACACTTCGAA-1 +GTGCATATCAAAGTAG-1 +GTGCATATCGTATCAG-1 +GTGCTTCAGGGCTTCC-1 +GTGCTTCCACAGGCCT-1 +GTGCTTCTCAGGATCT-1 +GTGGGTCCATATGCTG-1 +GTGGGTCTCGTGGACC-1 +GTGTTAGAGTGGAGAA-1 +GTTAAGCTCGTTACGA-1 +GTTACAGCACATGGGA-1 +GTTACAGGTAGCACGA-1 +GTTCGGGCATTATCTC-1 +GTTCGGGCATTTGCCC-1 +GTTCGGGTCCGTTGCT-1 +GTTCTCGAGGATGCGT-1 +GTTTCTAAGTGCAAGC-1 +GTTTCTACAGTAAGAT-1 +GTTTCTATCGTCTGCT-1 +TAAACCGTCTCTGTCG-1 +TAAGAGACATTATCTC-1 +TAAGCGTGTAAACGCG-1 +TAAGCGTGTGAAATCA-1 +TAAGTGCGTATAGGGC-1 +TACACGAAGAGCAATT-1 +TACACGACAGATTGCT-1 +TACACGAGTTCGAATC-1 +TACAGTGGTGTTAAGA-1 +TACCTATGTTAAAGTG-1 +TACGGATCAGCCACCA-1 +TACGGGCCACTCGACG-1 +TACGGGCGTCTTGTCC-1 +TACGGGCGTGCAGACA-1 +TACGGTATCTGTTGAG-1 +TACTCATTCAAGGCTT-1 +TACTTACAGCCCAACC-1 +TACTTACAGCGAGAAA-1 +TACTTACTCTTACCGC-1 +TAGACCAGTGCTTCTC-1 +TAGACCAGTTGCTCCT-1 +TAGCCGGCAACGATCT-1 +TAGTGGTAGACTTTCG-1 +TAGTGGTGTTGATTGC-1 +TATCTCAGTGATGTGG-1 +TATGCCCAGGAGCGTT-1 +TATGCCCTCTTCGGTC-1 +TCAACGAAGGGATCTG-1 +TCAACGATCCACGTGG-1 +TCAATCTTCGGGAGTA-1 +TCAATCTTCTAACGGT-1 +TCACAAGAGTCCGTAT-1 +TCACAAGTCTATCGCC-1 +TCACGAAAGGGATCTG-1 +TCAGATGTCTTGTACT-1 +TCAGCAATCACCGTAA-1 +TCAGCTCAGATGTGGC-1 +TCAGGATCAAGCCGTC-1 +TCAGGATTCGTTTATC-1 +TCAGGTAGTGGTCCGT-1 +TCATTACCATGACGGA-1 +TCATTTGCAGTAAGCG-1 +TCATTTGTCAAACGGG-1 +TCCACACAGACAAGCC-1 +TCGAGGCTCCCTCTTT-1 +TCGGGACAGTCGTTTG-1 +TCGGGACTCGTGGACC-1 +TCGTACCAGTAGATGT-1 +TCGTACCCAGCTGCAC-1 +TCTATTGCATAAAGGT-1 +TCTCATACAAATACAG-1 +TCTCTAAGTGTGCCTG-1 +TCTGAGAAGTGTCCCG-1 +TCTGGAAGTCTTCAAG-1 +TCTGGAAGTTGGAGGT-1 +TCTTCGGTCGTCACGG-1 +TCTTTCCAGCTAAGAT-1 +TCTTTCCCAGACAGGT-1 +TCTTTCCGTGCAGACA-1 +TGACAACTCAGCACAT-1 +TGACGGCTCGGTGTTA-1 +TGAGAGGCAACACGCC-1 +TGAGAGGGTGTCAATC-1 +TGAGCATGTTCTCATT-1 +TGAGCCGAGAGATGAG-1 +TGAGCCGCACCCAGTG-1 +TGAGGGAGTACGAAAT-1 +TGAGGGAGTTGGGACA-1 +TGCACCTTCAAACCGT-1 +TGCCAAAAGGGCACTA-1 +TGCCAAAAGGGCTCTC-1 +TGCCAAAGTCTCCATC-1 +TGCCCATAGAGGGATA-1 +TGCCCATAGATTACCC-1 +TGCCCATCACTTAACG-1 +TGCCCATGTTATGCGT-1 +TGCGCAGCAGCTCCGA-1 +TGCGCAGTCTTAACCT-1 +TGCGGGTAGTGCGTGA-1 +TGCGGGTCAAACGCGA-1 +TGCGGGTGTGTCTGAT-1 +TGCGTGGAGGGTTTCT-1 +TGCTACCGTCACCTAA-1 +TGCTGCTCACTGTCGG-1 +TGCTGCTCATCACGTA-1 +TGGACGCTCTCCTATA-1 +TGGCCAGCATAGGATA-1 +TGGCCAGGTTAAGTAG-1 +TGGCCAGTCCGGGTGT-1 +TGGCTGGGTTGGTGGA-1 +TGGGAAGGTGAGGGTT-1 +TGGGAAGTCAACCAAC-1 +TGGGCGTCACCAGGTC-1 +TGGGCGTCAGATGGGT-1 +TGGGCGTCATTTGCTT-1 +TGGGCGTTCTGAGTGT-1 +TGGTTCCAGACGCTTT-1 +TGGTTCCAGTCAAGGC-1 +TGGTTCCGTTCGTCTC-1 +TGTATTCTCGGTCCGA-1 +TGTCCCACATATGGTC-1 +TGTCCCATCGAGAGCA-1 +TGTGGTAAGCCGATTT-1 +TGTGGTAGTTCCGGCA-1 +TGTGGTATCCAAGTAC-1 +TGTGGTATCGTACCGG-1 +TGTGTTTAGGACCACA-1 +TGTTCCGCACAGACAG-1 +TTAGGACCAATGGAGC-1 +TTAGGACCACTGAAGG-1 +TTAGGCAAGCCCAACC-1 +TTAGGCAAGCTGCCCA-1 +TTAGGCAAGGTGCACA-1 +TTAGGCATCCGCAGTG-1 +TTAGTTCAGGTGCTAG-1 +TTAGTTCGTTCAGCGC-1 +TTAGTTCTCCTCAACC-1 +TTATGCTCACAGACAG-1 +TTATGCTCACGGTGTC-1 +TTATGCTTCTGGCGAC-1 +TTCCCAGAGGGTGTTG-1 +TTCCCAGTCCAGTAGT-1 +TTCGAAGGTCTAAACC-1 +TTCGGTCTCATATCGG-1 +TTCTCAAGTCTCCCTA-1 +TTCTCCTAGGTAGCCA-1 +TTCTCCTCACAACGTT-1 +TTCTCCTCATACGCCG-1 +TTCTCCTGTGATAAGT-1 +TTCTTAGGTTCTGGTA-1 +TTGACTTTCCGCATCT-1 +TTGACTTTCTCTGTCG-1 +TTGCCGTTCCAAATGC-1 +TTGGAACAGTGTCCAT-1 +TTGGAACGTACAGCAG-1 +TTGGCAAAGTAGGCCA-1 +TTGTAGGAGGCCCTTG-1 +TTGTAGGAGTCTTGCA-1 +TTTACTGCAATGTAAG-1 +TTTATGCCAGACTCGC-1 +TTTGCGCGTAAGAGGA-1 +TTTGCGCTCAGTCAGT-1 +TTTGGTTAGTTGTCGT-1 +TTTGGTTGTAGTGAAT-1 +TTTGTCACAGTATCTG-1 diff --git a/tests/test-data/10x-example/possorted_genome_bam.bam b/tests/test-data/10x-example/possorted_genome_bam.bam new file mode 100644 index 0000000000..f6d3a4b91c Binary files /dev/null and b/tests/test-data/10x-example/possorted_genome_bam.bam differ diff --git a/tests/test-data/10x-example/possorted_genome_bam.bam.bai b/tests/test-data/10x-example/possorted_genome_bam.bam.bai new file mode 100644 index 0000000000..e97fe597ef Binary files /dev/null and b/tests/test-data/10x-example/possorted_genome_bam.bam.bai differ diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b2532e6f10..5779ea2b45 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -10,6 +10,7 @@ import glob import json import csv +import pytest from . import sourmash_tst_utils as utils import sourmash_lib @@ -165,6 +166,32 @@ def test_do_sourmash_compute_singleton(): assert sig.name().endswith('shortName') +def test_do_sourmash_compute_10x(): + bamnostic = pytest.importorskip('bamnostic') + + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('10x-example') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '31', + '--input-is-10x', + testdata1], + in_directory=location) + + sigfile = os.path.join(location, '10x-example.sig') + assert os.path.exists(sigfile) + + with open(sigfile) as f: + data = json.load(f) + + barcode_signatures = [sig['name'] for sig in data] + + with open(utils.get_test_data('10x-example/barcodes.tsv')) as f: + true_barcodes = set(x.strip() for x in f.readlines()) + + assert all(bc in true_barcodes for bc in barcode_signatures) + + + def test_do_sourmash_compute_name(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') diff --git a/tox.ini b/tox.ini index 8b5f3f9b55..7331de4ed2 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ deps= codecov ipfsapi redis + pysam commands= pip install -r requirements.txt pip install -e .[test]