forked from npschafer/dca-frustratometer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jackhmmer_web.py
247 lines (222 loc) · 10.2 KB
/
jackhmmer_web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/python
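#Reads the sequences in Seqs.fa and downloads a jackhmmer alignment (<name>_alignment.fasta.gz) for each one
#Appends each job's results URL to URLS.txt as a backup, so jackhmmer can be restarted if the program breaks
#File names and the server URL are hardcoded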
def gotoMainPage(browser, Restart_url, wait=5, max_tries=None):
    """Reload Restart_url until the browser is showing that results page.

    The page counts as reached when the browser title is
    'score results | HMMER' and the current URL equals Restart_url.

    browser     -- object exposing ``title``, ``current_url`` and
                   ``get(url)`` (a Selenium WebDriver in this script).
    Restart_url -- URL of the score-results page to return to.
    wait        -- seconds to sleep after every reload (default 5, the
                   value that used to be hard-coded).
    max_tries   -- maximum number of reloads; None (the default) keeps
                   retrying forever, as before.

    Returns None. Raises RuntimeError if max_tries reloads were not enough.
    """
    import time
    results_title = 'score results | HMMER'
    tries = 0
    # `!=` and single-argument print(...) behave the same on Python 2 and 3.
    while browser.title != results_title or browser.current_url != Restart_url:
        if max_tries is not None and tries >= max_tries:
            raise RuntimeError(
                "Could not reach %s after %i reloads" % (Restart_url, tries))
        print("In %s" % browser.title)
        print("Going to main page")
        browser.get(Restart_url)
        time.sleep(wait)
        tries += 1
# Next (smaller) target database to try when a search returns too many
# sequences. 'rp15' and unknown names have no entry, so the search stops.
_DATABASE_FALLBACK = {
    'Reference Proteomes': 'rp75',
    'rp75': 'rp55',
    'rp55': 'rp35',
    'rp35': 'rp15',
}

# XPath of the link to iteration number %i in the results summary table.
_ITERATION_LINK_XPATH = ".//*[@id='batch_sum']/tbody/tr[%i]/td[2]/a"
# XPath of the "Domain" tab of an iteration page.
_DOMAIN_TAB_XPATH = ".//*[@id='subnav']/ul/li[3]/a"


def _leading_int(text):
    """Return the integer written on the first line of *text*."""
    return int(text.split('\n')[0])


def _submit_jackhmmer_search(browser, name, seq, database):
    """Submit *seq* as a new jackhmmer job, start its second iteration,
    append the results URL to URLS.txt and return that URL."""
    import time
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.support.ui import Select

    print("Starting first iteration")
    print("Going to submission page")
    browser.get('https://www.ebi.ac.uk/Tools/hmmer/search/jackhmmer')
    print("Writing sequence")
    browser.find_element_by_xpath(".//*[@id='seq']").send_keys(seq)
    print("Selecting database %s" % database)
    selector = Select(browser.find_element_by_xpath(
        ".//*[@id='sequencedb']/div[1]/label/select"))
    selector.select_by_visible_text(database)
    print("Submitting")
    browser.find_element_by_xpath(".//*[@id='subbutton']").click()
    time.sleep(2)
    print("Starting second iteration")
    # The "next iteration" button only exists once the first pass is done,
    # so poll for it once per second, up to 20 times.
    for _ in range(20):
        print("Pressing button")
        try:
            browser.find_element_by_xpath(
                ".//*[@id='jackhmmer_nav']/p/input").click()
            break
        except NoSuchElementException:
            pass
        time.sleep(1)
    time.sleep(25)
    print("Saving URL")
    results_url = browser.current_url
    with open('URLS.txt', 'a') as url_log:
        url_log.write('%s %s %s\n' % (name, results_url, database))
    print(results_url)
    return results_url


def HMMER_onlinev2(name, seq, restart=None, database='Reference Proteomes'):
    '''Drive the HMMER web server through Firefox to run jackhmmer on seq
    and download the alignment of the query's exact domain.

    Iterations are run (at most up to number 49) until the number of
    sequences in that domain has not grown on two successive iterations, or
    the server offers no further iteration. The alignment is downloaded from
    the last iteration at which the count reached its maximum.

    If a count above 10000 is seen, the browser is closed and the whole
    search is started again on the next smaller database
    (Reference Proteomes -> rp75 -> rp55 -> rp35 -> rp15). When there is no
    smaller database the function returns without downloading anything.

    name     -- label written to URLS.txt and used for the output file
                "<name>_alignment.fasta.gz".
    seq      -- query sequence typed into the submission form.
    restart  -- URL of the results page of an already submitted job; when
                empty/None a new job is submitted.
    database -- visible text of the target database in the submission form.

    Returns None. Exceptions raised by Selenium or by the download propagate
    to the caller; the browser is shut down in every case.

    NOTE(review): the find_element_by_xpath calls are the pre-Selenium-4
    API; confirm the installed Selenium version still provides them.
    '''
    import time
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    retry_database = None
    download_url = None

    print("Opening browser")
    browser = webdriver.Firefox()
    try:
        browser.implicitly_wait(5)
        if not restart:
            Restart_url = _submit_jackhmmer_search(browser, name, seq, database)
        else:
            print("Restarting previous iteration for %s" % name)
            browser.get(restart)
            Restart_url = restart
        gotoMainPage(browser, Restart_url)

        # The first iteration tells us which domain is the query's own one
        # and how many sequences it starts with.
        print("Looking for the exact domain")
        browser.find_element_by_xpath(_ITERATION_LINK_XPATH % 1).click()
        browser.find_element_by_xpath(_DOMAIN_TAB_XPATH).click()
        Original_domain = browser.find_element_by_xpath(
            ".//*[@id='exact']/p[2]/strong").text
        number_sequences = _leading_int(browser.find_element_by_xpath(
            ".//*[@id='exact']/a/span[1]").text)
        print("Original domain found: %s" % Original_domain)
        print("Number of sequences on original domain: %i" % number_sequences)

        converged = 0       # successive iterations without growth
        best_iteration = 1  # iteration the alignment will be taken from
        # Position of the original domain in the domain list the last time it
        # was seen. Starting at 1 keeps the download step well defined even
        # when no domain list was ever scanned.
        domain_index = 1
        for i in range(2, 50):
            if number_sequences > 10000:
                print("The number of sequences is too large, will change the working database from %s" % database)
                retry_database = _DATABASE_FALLBACK.get(database)
                if retry_database is None:
                    print("No more databases avaible")
                    return
                print("to %s" % retry_database)
                break

            print("On iteration %i" % i)
            gotoMainPage(browser, Restart_url)
            print("Checking if iteration ran already")
            try:
                browser.find_element_by_xpath(_ITERATION_LINK_XPATH % i).click()
            except NoSuchElementException:
                print("Iteration not run yet, will submit new iteration")
                # NOTE(review): if the new row never shows up in 20 tries, the
                # domain scan below runs on whatever page is displayed.
                for _ in range(20):
                    gotoMainPage(browser, Restart_url)
                    print("Submitting next iteration")
                    try:
                        submit_button = browser.find_element_by_xpath(
                            ".//*[@id='next_iteration']/input[3]")
                        if submit_button.is_enabled():
                            submit_button.click()
                        # Give the server a moment before looking for the row.
                        time.sleep(2)
                    except NoSuchElementException:
                        # No submit button at all: nothing left to iterate.
                        print("Seems like jackhmmer has converged")
                        converged = 100
                        break
                    try:
                        browser.find_element_by_xpath(
                            _ITERATION_LINK_XPATH % i).click()
                        print("Yeah!, new iteration")
                        break
                    except NoSuchElementException:
                        print("Iteration has not run yet, trying again")
                        continue
            if converged > 20:
                print("No more iterations will be run")
                break

            print("Looking for convergence of domain")
            browser.find_element_by_xpath(_DOMAIN_TAB_XPATH).click()
            # Scan the first nine listed domains for the original one and
            # compare its sequence count with the best count so far.
            found = False
            for j in range(1, 10):
                print("Reading domain %i" % j)
                try:
                    domain_name = browser.find_element_by_xpath(
                        ".//*[@id='content']/div[5]/ul[2]/li[%i]/p[2]/strong" % j).text
                    print("Domain %i: %s" % (j, domain_name))
                    if domain_name != Original_domain:
                        print("Domain %s different from Original domain %s" % (domain_name, Original_domain))
                        continue
                    print("Domain %s found" % domain_name)
                    found = True
                    domain_index = j
                    n = _leading_int(browser.find_element_by_xpath(
                        ".//*[@id='content']/div[5]/ul[2]/li[%i]/a/span[1]" % j).text)
                    if n > number_sequences:
                        converged = 0
                        print("There are now %i sequences (before %i)" % (n, number_sequences))
                        number_sequences = n
                        best_iteration = i
                    else:
                        converged += 1
                        if n == number_sequences:
                            print("%i sequences found again!" % n)
                            best_iteration = i
                        else:
                            print("Sequences in domain are now only %i sequences (before %i)" % (n, number_sequences))
                    break
                except NoSuchElementException:
                    # Entries without an architecture use a different layout;
                    # skip those, and stop at the end of the list.
                    try:
                        browser.find_element_by_xpath(
                            ".//*[@id='wrapper']/div[2]/div[3]/ul/li[%i]/a/span[1]" % j)
                        print("Domain without architecture found")
                        continue
                    except NoSuchElementException:
                        break
            if not found:
                # Fall back to the best iteration and the last known position.
                print("No, Domain dissapeared!")
                break
            if converged == 1:
                print("Almost Converged...")
            if converged > 1:
                print("Yeah! Converged!")
                break

        if retry_database is None:
            print("Downloading multi-alignment from iteration %i and domain %s" % (best_iteration, Original_domain))
            gotoMainPage(browser, Restart_url)
            browser.find_element_by_xpath(
                _ITERATION_LINK_XPATH % best_iteration).click()
            browser.find_element_by_xpath(_DOMAIN_TAB_XPATH).click()
            browser.find_element_by_xpath(
                ".//*[@id='content']/div[5]/ul[2]/li[%i]/p[1]/a" % domain_index).click()
            time.sleep(10)
            browser.find_element_by_xpath(
                ".//*[@id='subnav']/ul/li[4]/a").click()  # Download section
            download_url = browser.current_url + '?format=afa'
            print(download_url)
    finally:
        # Always shut the browser down, also on errors and early returns.
        browser.quit()

    if retry_database is not None:
        # Start over on the smaller database, after this browser is gone.
        HMMER_onlinev2(name, seq, restart=None, database=retry_database)
        return
    urlretrieve(download_url, "%s_alignment.fasta.gz" % name)
def _parse_restart_lines(lines):
    """Build the restart table from the lines of URLS.txt.

    Each line is "<name> <results url> <database>". Returns a dict mapping
    name to [url, database] when the third field is one of the reduced
    databases (rp75, rp55, rp35, rp15) and to [url] otherwise (which includes
    'Reference Proteomes', whose first word is not in that list). Later lines
    replace earlier ones for the same name; lines with fewer than two fields
    are skipped.
    """
    reduced_databases = ('rp75', 'rp55', 'rp35', 'rp15')
    restart_info = {}
    for line in lines:
        # split() already discards the newline, so nothing is chopped off
        # when the last line of the file has no trailing newline.
        fields = line.split()
        if len(fields) < 2:
            continue
        seq_name, url = fields[0], fields[1]
        if len(fields) > 2 and fields[2] in reduced_databases:
            restart_info[seq_name] = [url, fields[2]]
        else:
            restart_info[seq_name] = [url]
    return restart_info


if __name__ == '__main__':
    from Bio import SeqIO
    import os
    records = list(SeqIO.parse("Seqs.fa", "fasta"))
    # Results pages saved by earlier runs, keyed by sequence name.
    Restart = {}
    if os.path.isfile('URLS.txt'):
        with open('URLS.txt') as URLs:
            Restart = _parse_restart_lines(URLs)
    for record in records:
        # An existing alignment file means this sequence is already done.
        if os.path.isfile("%s_alignment.fasta.gz" % record.name):
            continue
        if record.name in Restart:
            saved = Restart[record.name]
            restart = saved[0]
            database = saved[1] if len(saved) == 2 else 'Reference Proteomes'
            print("Restart page: %s" % restart)
            print("Restart database: %s" % database)
            HMMER_onlinev2(record.name, str(record.seq), restart, database)
        else:
            HMMER_onlinev2(record.name, str(record.seq))