crawler.py
# -*- coding:utf-8 -*-
"""Crawl recent TechCrunch articles (up to one week old) into MySQL.

Starts from the newest headline on http://techcrunch.com/ and follows each
article's "Next Story" link. Written for Python 2 (mechanize, unicode,
reload()).
"""
from bs4 import BeautifulSoup as BS
import mechanize
import datetime
import sys
import re
import ast
import mysql.connector
import config
# --- Set Encoding (Python 2 workaround) ---
reload(sys)
sys.setdefaultencoding("utf-8")
# --- Connect to MySQL ---
## You should create a "config.py" that defines the connection settings
## (see the example below).
dbcon = mysql.connector.connect(
    database=config.database,
    user=config.user,
    password=config.password,
    host=config.host
)
dbcur = dbcon.cursor()
try:
    dbcur.execute(
        'CREATE TABLE techcrunch ('
        'url VARCHAR(255) UNIQUE, html TEXT, title VARCHAR(255), text TEXT, '
        'date DATETIME, authors VARCHAR(255), channels VARCHAR(255));')
    dbcon.commit()
except mysql.connector.Error:
    print("The table already exists!")
# --- Compile Pattern ---
# Matches the first {...} object inside "var sranalytics = {...};".
pattern = re.compile(r'\{.+?\}')
# --- Get Date Time ---
now = datetime.datetime.now()
### For more precision, we should account for the time difference between
### this machine and the articles' timestamps.
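# A possible sketch of that adjustment (an assumption -- the sranalytics
# "date" field appears to be US Pacific time, but the page does not say so):
#
#     import pytz  # extra dependency, not used elsewhere in this script
#     pacific = pytz.timezone('US/Pacific')
#     now = datetime.datetime.now(pacific).replace(tzinfo=None)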
# --- Set First URL ---
print("This is the crawler for 'TECHCRUNCH'.")
url = 'http://techcrunch.com/'
b = mechanize.Browser()
b.open(url)
# --- Click stream via "mechanize" ---
#
# URL samples : http://techcrunch.com/2015/05/26/tc-cribs-lumositys-brilliant-soma-office/
#               http://techcrunch.com/2015/05/26/workday-falls-5-despite-beating-market-expectations-in-its-fq1/
#
# Each page has a "Next Story" button.
# sample : <a href="http://techcrunch.com/2015/05/27/mapsense/" class="next-link" data-omni-sm="art_nextstory">
#
# On the top page, class="river-block" marks the latest-news items.
#
# var sranalytics = {"version":"0.1.4","pid":"518833fa642b2405ba000008","iframe":"0","title":"Airware Launches Fund To Catalyze The Rest Of The Commercial Drone Equation","url":"http:\/\/techcrunch.com\/2015\/05\/27\/drone-fund\/","date":"2015-05-27 13:01:39","channels":["tc"],"tags":[],"authors":["Josh Constine"]};
#
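# Note: the matched {...} text is JSON that happens to contain no
# true/false/null literals, so ast.literal_eval() below can parse it as a
# Python dict. json.loads() would be a more robust choice (it also
# unescapes the \/ sequences in the "url" value).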
html = b.response().read()
soup = BS(html, 'html.parser')
# Alternative ways to find the latest article:
#
#     for link in soup.find_all('li', {"class": "river-block"}):
#         print(link.get('data-permalink'))
#
#     latest_url = soup.find('li', {"class": "river-block"}).get('data-permalink')
latest_url = soup.find_all('a', {"data-omni-sm": "gbl_river_headline,1"})[0].get('href')
print("Latest News : " + latest_url)
tmpurl = latest_url
while True:
    current_url = tmpurl
    b.open(current_url)
    tmphtml = b.response().read()
    soup = BS(tmphtml, 'html.parser')
    # --- GET SRANALYTICS ---
    sranalytics = soup.find(text=re.compile("var sranalytics"))
    match = pattern.search(sranalytics)
    d = ast.literal_eval(match.group())
    article_datetime = datetime.datetime.strptime(d['date'], '%Y-%m-%d %H:%M:%S')
    if article_datetime < now - datetime.timedelta(days=7):
        print("More than one week old; stopping.")
        break
    # --- URL ---
    print("<URL>")
    print(current_url)
    # --- TITLE ---
    print("<TITLE>")
    print(d['title'])
    # --- MAIN TEXT ---
    print("<MAIN TEXT>")
    # The article body lives in <div class="article-entry text">; keep only
    # its <p> tags, then strip the remaining markup with get_text().
    main_text_asResultSet = soup.find_all('div', {"class": "article-entry text"})
    main_text_asString = u'\n'.join(map(unicode, main_text_asResultSet))
    tmpsoup = BS(main_text_asString, 'html.parser')
    main_text_asResultSet = tmpsoup.find_all('p')
    main_text_asString = u'\n'.join(map(unicode, main_text_asResultSet))
    tmpsoup = BS(main_text_asString, 'html.parser')
    # --- DATE TIME ---
    print("<DATE TIME>")
    print(d['date'])
    # --- AUTHORS ---
    print("<AUTHORS>")
    print(d['authors'])
    # --- CHANNELS ---
    print("<CHANNELS>")
    print(d['channels'])
    # --- STORE (INSERT IGNORE skips URLs already in the table) ---
    INSERT_EXEC = ("INSERT IGNORE INTO techcrunch "
                   "(url, html, title, text, date, authors, channels) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    INPUT_DATA = (current_url, tmphtml, d['title'], tmpsoup.get_text(),
                  d['date'], ','.join(d['authors']), ','.join(d['channels']))
    dbcur.execute(INSERT_EXEC, INPUT_DATA)
    dbcon.commit()
    # --- NEXT ---
    next_link = soup.find('a', {"class": "next-link"})
    if next_link is None:
        print("No next-story link; stopping.")
        break
    next_url = next_link.get('href')
    print("<NEXT>")
    print(next_url)
    tmpurl = next_url
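# Usage (a sketch; assumes a running MySQL server and a valid config.py):
#
#     $ python2 crawler.py
#
# The crawler walks "Next Story" links until it reaches an article older
# than one week, storing one row per article in the "techcrunch" table.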