-
Notifications
You must be signed in to change notification settings - Fork 36
/
create_and_run_testing_datasets.py
57 lines (38 loc) · 1.81 KB
/
create_and_run_testing_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from basic import *
import time
## Build small test datasets under testing/ by taking a subset of each
## paired-sequence file in datasets/, then launch run_basic_analysis.py on
## every subset as a background shell job.
##
## isdir, mkdir, system, exists, parse_tsv_file and make_tsv_line are not
## defined in this script; presumably they arrive via 'from basic import *'
## -- TODO confirm.

if not isdir('testing/'):
    mkdir('testing/')

## Remove files left over from a previous run. The exit status of rm is not
## checked, so a failure here does not stop the script.
cmd = 'rm testing/* testing/*web/*'
print(cmd)
system(cmd)

## Each example is [ filetag, max_epitopes, max_subjects, max_tcrs_per_subject ].
## Alternative configurations, kept for reference:
#examples = [ [ 'test_tiny', 1, 1, 15 ],
#             [ 'test_small', 3, 3, 15 ] ]
#examples = [ [ 'test_tiny', 1, 1, 15 ] ]
examples = [ [ 'test_small', 3, 3, 15 ] ]

## Columns written to each test file, in output order. The list does not depend
## on the example, so it is built once instead of once per loop iteration.
fields = 'id epitope subject a_nucseq b_nucseq a_quals b_quals'.split()

for filetag, max_epitopes, max_subjects, max_tcrs_per_subject in examples:

    for organism in ['mouse','human']:

        ## this is the dataset from the paper
        oldfile = './datasets/{}_pairseqs_v1.tsv'.format(organism)
        assert exists( oldfile ), 'missing input dataset: ' + oldfile

        ## The result is used below as a two-level mapping,
        ## all_tcrs[epitope][subject] -> sliceable sequence of rows.
        ## NOTE(review): the meaning of the [] and True arguments is defined
        ## by parse_tsv_file and is not visible here -- confirm.
        all_tcrs = parse_tsv_file( oldfile, ['epitope','subject'], [], True )

        ## Sort before truncating so the same epitopes are chosen on every run.
        epitopes = sorted( all_tcrs.keys() )[:max_epitopes]

        newfile = './testing/{}_{}_pairseqs_v1.tsv'.format(filetag, organism)
        print('making ' + newfile)

        ## 'with' closes the file even if a row fails to format or write, and
        ## guarantees it is flushed before the analysis below starts reading it.
        with open(newfile,'w') as out:
            out.write( '\t'.join( fields )+'\n' )

            for epitope in epitopes:
                epitope_tcrs = all_tcrs[epitope]
                subjects = sorted( epitope_tcrs.keys() )[:max_subjects]
                for subject in subjects:
                    tcrs = epitope_tcrs[subject]
                    for outl in tcrs[:max_tcrs_per_subject]:
                        out.write( make_tsv_line( outl, fields )+'\n' )

        ## use intrasubject_nbrdists here because these mini-repertoires may contain only a single subject
        ## if there's only one subject, then we can't compute a nbrdist if we exclude intra-subject distances
        ## The trailing '&' runs the analysis in the background; its stdout and
        ## stderr are redirected to <newfile>.log and <newfile>.err.
        cmd = 'python run_basic_analysis.py --constant_seed --intrasubject_nbrdists --organism {} --pair_seqs_file {} > {}.log 2> {}.err &'\
            .format( organism, newfile, newfile, newfile )
        print(cmd)
        system(cmd)
        time.sleep(1.0) ## short pause