-
Notifications
You must be signed in to change notification settings - Fork 15
/
barcode_split_trim.py
executable file
·263 lines (228 loc) · 11.3 KB
/
barcode_split_trim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python
from __future__ import division
import argparse
import fileinput
import re
import os
import numpy as np
import sys
from collections import Counter
from pandas import DataFrame
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.lib import ggplot2
from rpy2.robjects.packages import importr
from tabulate import tabulate
grdevices = importr('grDevices')
pandas2ri.activate()
current_version = "v1.5.0"
# Just a quick usage example:
# ./barcode_split_trim.py --id demo --list -b sample_files/barcode.list --out demo-output sample_files/sequences.fq
# TODO: Reformat per python standards
# TODO: Add NoPlot option (and put imports in the plot function?)
# TODO: Have auto-pre/suffix append to, rather than overwrite pre/suffix
def main():
args = get_options()
barcode_table = get_barcodes(args.list, args.barcode, args.id)
barcode_length = validate_barcodes(barcode_table)
directory, fq_name, barcode_name = parse_filenames(args.fastq, args.outdir, args.barcode)
if not os.path.exists(directory):
os.makedirs(directory)
if not args.stats:
unmatched_fh = open_fq_files(barcode_table, directory, fq_name, barcode_name, args.prefix, args.suffix, args.autoprefix, args.autosuffix)
total_matched, total_unmatched, barcodes_obs = split_trim_barcodes(args.fastq, barcode_table, barcode_length, args.notrim, args.stats, unmatched_fh)
if not args.stats:
close_fq_files(barcode_table, unmatched_fh)
total_count = total_matched + total_unmatched
summarize_observed_barcodes(barcode_table, barcodes_obs, total_count, directory, fq_name, barcode_name)
summarize_counts(barcode_table, args.fastq, total_count, total_matched, total_unmatched, directory, fq_name, barcode_name)
plot_summary(barcodes_obs, barcode_table, directory, args.id)
def get_options():
description = "Extracts fastq reads for specified barcode(s) from one or multiple FASTQ files."
epilog = """
An output file in fastq format is written for each barcode to the directory
containing IN.FASTQ, unless an output directory is specified.
The default name of the output file is ID.fq. The output names can be
customized using the Naming Options.
"""
parser = argparse.ArgumentParser(description=description, epilog=epilog)
parser.add_argument('--version', action='version', version="%(prog)s {0}".format(current_version))
parser.add_argument('-i', '--id', help='Sample or Experiment ID', required=True)
parser.add_argument('-b', '--barcode', help='Barcode or file w/ list of barcodes to extract', required=True)
parser.add_argument('-l', '--list', help='Indicate --barcode is a list of barcodes in a file', action="store_true")
parser.add_argument('-n', '--notrim', help='Split without trimming barcodes', action="store_true")
parser.add_argument('-s', '--stats', help='Output summary stats only (w/o creating fastq files)', action="store_true")
parser.add_argument('-o', '--outdir', help='Output file is saved in the specified directory (or same directory as IN.FASTQ, if --outdir is not used)')
parser.add_argument("fastq", help="FASTQ file(s). Use wildcards ('*') to match multiple input FASTQ files.", nargs="+")
naming=parser.add_argument_group('optional naming arguments')
naming.add_argument('--autoprefix', help='Append FASTQ file name onto output', action="store_true")
naming.add_argument('--autosuffix', help='Append barcode onto output', action="store_true")
naming.add_argument('--prefix', help='Add custom prefix to output')
naming.add_argument('--suffix', help='Add custom suffix to output')
return parser.parse_args()
def get_barcodes(list, barcode, id):
barcode_table = {}
if list == True:
with open(barcode, 'r') as barcode_list:
for line in barcode_list:
seq, sample_id = line.split()
barcode_table[seq] = {'id': sample_id, 'count': 0}
else:
barcode_table[barcode] = [id, 0]
return barcode_table
def validate_barcodes(barcode_table):
seqs = dict.keys(barcode_table)
bad_seq = re.compile('[^ACGT]', re.IGNORECASE)
if any(re.match(bad_seq, s) for s in seqs):
sys.exit('Invalid barcode found!')
min_length = len(min(seqs, key=len))
max_length = len(max(seqs, key=len))
if min_length != max_length:
sys.exit("Unexpected variation in barcode length (min={0}, max={1})".format(min_length, max_length))
return max_length
def parse_filenames(fastq, outdir, barcode):
if outdir:
directory = outdir
filename = os.path.basename(fastq[0])
else:
directory, filename = os.path.split(fastq[0])
if len(fastq) == 1:
fq_name, fq_suffix = os.path.splitext(filename)
else:
fq_name = "multi_fq"
barcode_name = os.path.basename(barcode)
return directory, fq_name, barcode_name
def open_fq_files(barcode_table, directory, fq_name, barcode_name, prefix, suffix, autoprefix, autosuffix):
if autoprefix:
prefix = fq_name + "."
elif prefix:
prefix += "."
else:
prefix = ""
if suffix:
suffix = "." + suffix
else:
suffix = ""
unmatched_fq = "{0}/unmatched.{1}fq_{2}.bar_{3}{4}.fq".format(directory, prefix, fq_name, barcode_name, suffix)
unmatched_fh = open(unmatched_fq, 'w')
for seq in dict.keys(barcode_table):
if autosuffix:
suffix = "." + seq
barcode_id = barcode_table[seq]['id']
fq_out = "{0}/{1}{2}{3}.fq".format(directory, prefix, barcode_id, suffix)
barcode_table[seq]['fh'] = open(fq_out, 'w')
return unmatched_fh
def split_trim_barcodes(fastq, barcode_table, barcode_length, notrim, stats, unmatched_fh):
total_matched = 0
total_unmatched = 0
barcodes_obs = Counter()
fq_data = fileinput.input(fastq)
for read_id in fq_data:
line_no = fq_data.filelineno()
seq = fq_data.readline()
qual_id = fq_data.readline()
qual = fq_data.readline()
validate_fq_read(read_id, seq, qual, fq_data, line_no)
cur_barcode = seq[0:5]
barcodes_obs[cur_barcode] += 1
if barcode_table.has_key(cur_barcode):
if not notrim:
seq = seq[(barcode_length + 1):]
qual = qual[(barcode_length + 1):]
if not stats:
barcode_table[cur_barcode]['fh'].write(read_id + seq + qual_id + qual)
barcode_table[cur_barcode]['count'] += 1
total_matched += 1
else:
if not stats:
unmatched_fh.write(read_id + seq + qual_id + qual)
total_unmatched += 1
return total_matched, total_unmatched, barcodes_obs
def validate_fq_read(read_id, seq, qual, fq_data, line_no):
read_id = read_id.rstrip('\n')
fq_file = fq_data.filename()
good_read_id = re.compile('^@')
if not re.match(good_read_id, read_id):
sys.exit("Encountered sequence ID ({0}) that doesn't start with '\@' on line {1} of FASTQ file: '{2}'...\nInvalid or corrupt FASTQ file?\n".format(read_id, line_no, fq_file))
if not len(seq) == len(qual):
sys.exit("Encountered unequal sequence and quality lengths for read ({0}) starting on line {1} of FASTQ file: '{2}'...\nInvalid or corrupt FASTQ file?\n".format(read_id, line_no, fq_file))
good_seq = re.compile('^[ACGTN]+$', re.IGNORECASE)
if not re.match(good_seq, seq):
sys.exit("Encountered sequence ({0}) containing non-nucleotide characters. See read ({1}) starting on line {2} of FASTQ file: '{3}'...\nInvalid or corrupt FASTQ file?\n".format(seq.rstrip('\n'), read_id, line_no, fq_file))
def close_fq_files(barcode_table, unmatched_fh):
unmatched_fh.close()
for seq in dict.keys(barcode_table):
barcode_table[seq]['fh'].close()
def summarize_observed_barcodes(barcode_table, barcodes_obs, total_count, directory, fq_name, barcode_name):
rows = []
for seq, count in barcodes_obs.most_common():
sample = barcode_table.get(seq)['id'] if barcode_table.has_key(seq) else ""
rows.append([seq, commify(count), percent(count, total_count), sample])
table = tabulate(rows, headers=["barcode", "count", "percent", "id"], tablefmt="plain", stralign="right")
summary = "{0}/log_barcodes_observed.fq_{1}.bar_{2}".format(directory, fq_name, barcode_name)
with open(summary, 'w') as f:
f.write(table)
def summarize_counts(barcode_table, fastq, total_count, total_matched, total_unmatched, directory, fq_name, barcode_name):
summary = "{0}/log_barcode_counts.fq_{1}.bar_{2}".format(directory, fq_name, barcode_name)
data = [('matched', total_matched), ('unmatched', total_unmatched)]
table1 = table_id_num_pct(data, total_count)
counts = map(lambda key: barcode_table[key]['count'], barcode_table.keys())
data = [('min', min(counts)), ('max', max(counts)),
('mean', np.mean(counts)), ('median', np.median(counts))]
table2 = table_id_num_pct(data, total_count)
rows = []
for seq in sorted(barcode_table, key=lambda key: barcode_table[key]['id']):
sample = barcode_table[seq]['id']
count = barcode_table[seq]['count']
rows.append([sample, seq, commify(count), percent(count, total_count)])
table3 = tabulate(rows, headers=["id", "barcode", "count", "percent"], tablefmt="plain", stralign="right")
with open(summary, 'w') as f:
f.write("Barcode splitting summary for:\n")
for fq in fastq:
f.write(" {0}\n".format(fq))
f.write("---------------------------\n")
f.write(table1)
f.write("\n---------------------------\n")
f.write("barcodes {0}\n".format(len(barcode_table)))
f.write(table2)
f.write("\n---------------------------\n")
f.write(table3)
def table_id_num_pct(data, total_count):
rows = []
for name, count in data:
rows.append([name, commify(count), percent(count, total_count)])
return tabulate(rows, tablefmt="plain", stralign="right")
def plot_summary(barcodes_obs, barcode_table, directory, expt_id):
barcodes, counts, matches = get_vectors(barcodes_obs, barcode_table)
df = DataFrame({'barcode': barcodes,
'count': counts,
'matched': matches})
p = ggplot2.ggplot(df) + \
ggplot2.aes_string(x='factor(matched)', y='count / 1000000') + \
ggplot2.geom_boxplot(outlier_size = 0) + \
ggplot2.geom_jitter() + \
ggplot2.ggtitle(label = expt_id) + \
ggplot2.ggplot2.xlab(label = "") + \
ggplot2.scale_y_continuous(name = "Count\n(million reads)")
filename = "{0}/{1}.png".format(directory, expt_id)
grdevices.png(filename=filename, width=4, height=5, unit='in', res=300)
p.plot()
grdevices.dev_off()
def get_vectors(barcodes_obs, barcode_table):
barcodes = []
counts = []
matches = []
for barcode in dict.keys(barcodes_obs):
barcodes.append(barcode)
counts.append(barcodes_obs[barcode])
match = "matched" if barcode_table.get(barcode) else "unmatched"
matches.append(match)
return barcodes, counts, matches
def percent(numerator, denominator, decimal_places=1):
if denominator == 0:
sys.exit("Oops, I divided by zero.")
return "{0:.{1}f}%".format(100 * numerator / denominator, decimal_places)
def commify(number, decimal_places=0):
return "{0:,.{1}f}".format(number, decimal_places)
if __name__ == '__main__':
main()