Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate signatures from 10x single cell bam file #539

Merged
merged 24 commits into from
Oct 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ca577ab
Add bam files for testing
olgabot Sep 1, 2018
0c31a4a
Add reading 10x bam files
olgabot Sep 1, 2018
07789c1
Add pysam to requirements
olgabot Sep 1, 2018
b890942
Add sourmash compute test
olgabot Sep 1, 2018
76f2d23
Move test data to actually be like the 10x structure
olgabot Sep 2, 2018
0e53935
tests pass!
olgabot Sep 2, 2018
687c155
avoid direct dep on pysam, fix tests and argparse in commands
luizirber Sep 2, 2018
3d6e769
Add updated test data location
olgabot Sep 2, 2018
3a22ab3
Skip 10x test if pysam isn't installed
olgabot Sep 2, 2018
cbfaed8
Merge pull request #1 from luizirber/fix_tests
olgabot Sep 2, 2018
ef25bcf
Add "import pytest"
olgabot Sep 4, 2018
836b66b
Move testing of alignment into function
olgabot Sep 6, 2018
15af5e0
Move import to top of file
olgabot Sep 6, 2018
3f03839
Try doing some multiprocessing
olgabot Sep 7, 2018
b6f47ca
Add threads to arguments
olgabot Sep 14, 2018
e4aa521
Add pathos to requirements
olgabot Sep 14, 2018
a857e76
Remove pysam, add bamnostic
olgabot Sep 14, 2018
f7f7dd3
add type for threads argument
olgabot Sep 14, 2018
4c9564f
lung_ptprc --> 10x-example
olgabot Sep 14, 2018
e1f8719
Specify version of bamnostic
olgabot Sep 17, 2018
c1d2dfa
multiprocessing.Pool isn't a context manager in Python2.7
olgabot Sep 17, 2018
02f1010
Skip 10x test only if bamnostic not installed
olgabot Sep 17, 2018
69b7eed
Increase bamnostic version
olgabot Sep 18, 2018
f0cf1b1
Merge branch 'master' into olgabot/10x-singlecell-bam
olgabot Sep 19, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ sphinx
alabaster
recommonmark
sphinxcontrib-napoleon
pathos
bamnostic>=0.9.2
56 changes: 55 additions & 1 deletion sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import argparse
import csv
import itertools
import multiprocessing
import pathos.multiprocessing as mp
import os
import os.path
import sys
Expand All @@ -13,6 +16,7 @@
from . import sourmash_args
from .logging import notify, error, print_results, set_quiet
from .sbtmh import SearchMinHashesFindBest, SigLeaf
from .tenx import read_10x_folder

from .sourmash_args import DEFAULT_LOAD_K
DEFAULT_COMPUTE_K = '21,31,51'
Expand All @@ -21,6 +25,8 @@
WATERMARK_SIZE = 10000




def info(args):
"Report sourmash version + version of installed dependencies."
parser = argparse.ArgumentParser()
Expand All @@ -43,7 +49,6 @@ def info(args):
notify('screed version {}', screed.__version__)
notify('- loaded from path: {}', os.path.dirname(screed.__file__))


def compute(args):
"""Compute the signature for one or more files.

Expand Down Expand Up @@ -85,6 +90,10 @@ def compute(args):
help="merge all input files into one signature named this")
parser.add_argument('--name-from-first', action='store_true',
help="name the signature generated from each file after the first record in the file (default: False)")
parser.add_argument('--input-is-10x', action='store_true',
help="Input is 10x single cell output folder (default: False)")
parser.add_argument('-p', '--processes', default=2, type=int,
help='Number of processes to use for reading 10x bam file')
parser.add_argument('--track-abundance', action='store_true',
help='track k-mer abundances in the generated signature (default: False)')
parser.add_argument('--scaled', type=float, default=0,
Expand Down Expand Up @@ -211,6 +220,26 @@ def save_siglist(siglist, output_fp, filename=None):
sig.save_signatures(siglist, fp)
notify('saved {} signature(s). Note: signature license is CC0.'.format(len(siglist)))

def maybe_add_barcode(barcode, cell_seqs):
if barcode not in cell_seqs:
cell_seqs[barcode] = make_minhashes()

def maybe_add_alignment(alignment, cell_seqs, args, barcodes):
high_quality_mapping = alignment.mapq == 255
good_barcode = 'CB' in alignment.tags and \
alignment.get_tag('CB') in barcodes
good_umi = 'UB' in alignment.tags

pass_qc = high_quality_mapping and good_barcode and \
good_umi
if pass_qc:
barcode = alignment.get_tag('CB')
# if this isn't marked a duplicate, count it as a UMI
if not alignment.is_duplicate:
maybe_add_barcode(barcode, cell_seqs)
add_seq(cell_seqs[barcode], alignment.seq,
args.input_is_protein, args.check_sequence)

if args.track_abundance:
notify('Tracking abundance of input k-mers.')

Expand All @@ -237,6 +266,31 @@ def save_siglist(siglist, output_fp, filename=None):

notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
elif args.input_is_10x:
barcodes, bam_file = read_10x_folder(filename)
manager = multiprocessing.Manager()

cell_seqs = manager.dict()

notify('... reading sequences from {}', filename)

pool = mp.Pool(processes=args.processes)
pool.map(lambda x: maybe_add_alignment(x, cell_seqs, args, barcodes), bam_file)
# for n, alignment in enumerate(bam_file):
# if n % 10000 == 0:
# if n:
# notify('\r...{} {}', filename, n, end='')
# maybe_add_alignment(alignment, cell_seqs)

cell_signatures = [
build_siglist(seqs, filename=filename, name=barcode)
for barcode, seqs in cell_seqs.items()]
sigs = list(itertools.chain(*cell_signatures))
if args.output:
siglist += sigs
else:
siglist = sigs

else:
# make minhashes for the whole file
Elist = make_minhashes()
Expand Down
19 changes: 19 additions & 0 deletions sourmash/tenx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os
import bamnostic as bs


def read_single_column(filename):
"""Read single-column barcodes.tsv and genes.tsv files from 10x"""
with open(filename) as f:
lines = set(line.strip() for line in f)
return lines


def read_10x_folder(folder):
"""Get QC-pass barcodes, genes, and bam file from a 10x folder"""
barcodes = read_single_column(os.path.join(folder, 'barcodes.tsv'))

bam_file = bs.AlignmentFile(
os.path.join(folder, 'possorted_genome_bam.bam'), mode='rb')

return barcodes, bam_file
Loading