-
Notifications
You must be signed in to change notification settings - Fork 0
/
login_spyder.py
42 lines (35 loc) · 1.32 KB
/
login_spyder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import scrapy
from scrapy.utils.url import escape_ajax
class StackOverflowSpider(scrapy.Spider):
    """Log in to a Google account, then crawl the wca-delegates Google Group.

    The callbacks read ``self.user_name`` and ``self.user_pass`` but this
    class never assigns them, so both must be provided from outside --
    presumably as Scrapy spider arguments, e.g.
    ``scrapy crawl login -a user_name=... -a user_pass=...``.
    """

    name = 'login'
    start_urls = ["https://accounts.google.com/ServiceLogin?hl=pt-BR#identifier"]

    def parse(self, response):
        """Fill in the e-mail field of the login form and submit it.

        The response to the submitted form is passed to ``log_password``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'Email': self.user_name},
            callback=self.log_password)

    def log_password(self, response):
        """Fill in the password field of the form and submit it.

        The response to the submitted form is passed to ``after_login``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={'Passwd': self.user_pass},
            callback=self.after_login)

    def after_login(self, response):
        """Request the forum listing once the login forms have been sent.

        The login response itself is not inspected, so a failed login is
        not detected here.
        """
        return scrapy.Request(
            "https://groups.google.com/forum/?_escaped_fragment_=forum/wca-delegates[1-100]",
            callback=self.parse_forum)

    def parse_forum(self, response):
        """Follow every link found on a forum listing page.

        Links whose absolute URL contains ``"5B"`` are crawled again with
        this method; all other links are requested with ``parse_thread``.
        """
        for href in response.css('a::attr(href)').extract():
            full_url = response.urljoin(href)
            # NOTE(review): "5B" presumably matches the percent-encoded "["
            # (%5B) of a paginated listing URL like the "[1-100]" range
            # requested in after_login -- confirm against the live markup.
            if "5B" in full_url:
                yield scrapy.Request(full_url, callback=self.parse_forum)
            else:
                # Each URL gets exactly one request and one callback, so a
                # listing page is never handed to parse_thread and the
                # crawl does not depend on the duplicate filter.
                yield scrapy.Request(full_url, callback=self.parse_thread)

    def parse_thread(self, response):
        """Yield one item describing the page in ``response``.

        The item holds the first text node under a ``td`` with class
        ``lastPostDate`` (``None`` when absent), the response URL, and the
        list of all text nodes found under ``h2`` elements.
        """
        yield {
            'date': response.xpath('//td[@class="lastPostDate"]//text()').extract_first(),
            'link': response.url,
            'title': response.xpath('//h2//text()').extract()
        }