-
Notifications
You must be signed in to change notification settings - Fork 4
/
get_data.py
73 lines (56 loc) · 2.6 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
import gzip
import urllib.parse
import urllib.request
# Command-line interface: selects which protein dataset(s) to download.
parser = argparse.ArgumentParser(description='Download the protein datasets')
parser.add_argument('--data', type=str, default='data/', help='location of the data ids')
parser.add_argument('--domain', choices=['euk','bac','arc','vir'], default='euk', help='domain of origin: "euk","bac","arc" or "vir"')
parser.add_argument('--complete', choices=['full','frag'], default='full', help='completeness of the protein: "full" or "frag"')
parser.add_argument('--quality', choices=['exp','pred'], default='exp', help='evidence of existence of the protein: "exp" or "pred"')
# BUG FIX: '--all' had no action, so argparse required a value after it
# (e.g. "--all yes") and args.all defaulted to None. store_true makes it a
# proper boolean flag. Also fixes the "downnloads" typo in the help text.
parser.add_argument('--all', action='store_true', help='downloads all the data')
args = parser.parse_args()

# Every value of each dataset axis, iterated when --all is given.
all_domains = ['euk','bac','arc','vir']
all_complete = ['full','frag']
all_quality = ['exp','pred']
def download_set(loc, domain, complete, quality, dataset):
    """Download protein sequences from UniProt for one dataset split.

    Reads accession ids from '<loc>/<domain>_<complete>_<quality>/<dataset>_ids.txt.gz',
    queries the UniProt upload-lists service in batches of 3000 ids, and
    writes the returned sequences — one protein per line, residues
    separated by spaces — to '<loc>/<domain>_<complete>_<quality>/<dataset>.txt'.

    Args:
        loc: root directory holding the id files.
        domain: 'euk', 'bac', 'arc' or 'vir'.
        complete: 'full' or 'frag'.
        quality: 'exp' or 'pred'.
        dataset: split name, e.g. 'train', 'valid' or 'test'.
    """
    # NOTE(review): uniprot.org/uploadlists was retired in mid-2022 in favour
    # of the REST ID-mapping API — confirm this endpoint still responds.
    url = 'https://www.uniprot.org/uploadlists/'
    base = '%s/%s_%s_%s' % (loc, domain, complete, quality)
    # 'with' guarantees the gzip handle is closed even on error
    # (the original left both this and the output file unclosed on failure).
    with gzip.open('%s/%s_ids.txt.gz' % (base, dataset), 'rt') as ids_file:
        query = [line.rstrip('\n') for line in ids_file]
    batch_size = 3000  # ids per request, to keep each POST body bounded
    with open('%s/%s.txt' % (base, dataset), 'w') as out_file:
        for start in range(0, len(query), batch_size):
            params = {
                'from': 'ACC+ID',
                'to': 'ACC',
                'format': 'tab',
                'columns': 'sequence',
                'query': ' '.join(query[start:start + batch_size]),
            }
            data = urllib.parse.urlencode(params).encode('utf-8')
            req = urllib.request.Request(url, data)
            with urllib.request.urlopen(req) as response:
                response.readline()  # skip the tab-separated header row
                for line in response:
                    fields = line.decode('utf-8').strip().split('\t')
                    # Space-separate the amino acids so each residue is a token.
                    out_file.write('%s\n' % ' '.join(fields[0]))
# Dispatch: either sweep every (domain, completeness, quality) combination,
# or download just the single combination named on the command line.
if args.all:
    print('Downloading all the data...')
    for dom in all_domains:
        for comp in all_complete:
            for qual in all_quality:
                download_set(args.data, dom, comp, qual, 'train')
            # Validation/test splits use experimentally verified ('exp')
            # proteins only, so they sit outside the quality loop.
            for split in ('valid', 'test'):
                download_set(args.data, dom, comp, 'exp', split)
else:
    print('Downloading %s %s %s train, valid and test sets...' % (args.domain, args.complete, args.quality))
    download_set(args.data, args.domain, args.complete, args.quality, 'train')
    for split in ('valid', 'test'):
        download_set(args.data, args.domain, args.complete, 'exp', split)
print('Download complete')