scraper.py
"""
Script for scraping news article provenance from a news url.

Integrates the Python Newspaper3k library and boilerpipe to extract the actual
article from a web page, and uses StanfordCoreNLP to extract quotations and
their attributed speakers.

Download StanfordCoreNLP from https://stanfordnlp.github.io/CoreNLP/download.html
"""
import argparse
import csv
import hashlib
import subprocess
import json
import os
import re

import nltk
from bs4 import BeautifulSoup
from threading import Timer
from urllib.parse import urlparse
from boilerpipe.extract import Extractor
from newspaper import Article

from utils.stanford_NLP import StanfordNLP
from utils.url_classifier.url_utils import UrlUtils

nltk.download('punkt')

class Author(object):
    def __init__(self, name, link):
        self.name = name
        self.link = link

    def jsonify(self):
        """Return a dictionary representation of the Author object."""
        return {
            'name': self.name,
            'link': self.link
        }

class NewsArticle(object):
    """
    NewsArticle finds and stores the provenance of a single news article.

    Only two methods are meant to be called from outside the class.

    The class static method:
        build_news_article_from_url(url, sNLP)
            returns a NewsArticle object built from the provided url,
            using the provided sNLP (StanfordNLP instance) to extract quotes.

    The instance method:
        jsonify()
            returns a dictionary containing only the article provenance.
    """

    def __init__(self, newspaper_article, sNLP):
        """
        Construct a NewsArticle object from the result returned by the
        Newspaper3k and Boilerpipe libraries.

        Parameters
        ----------
        newspaper_article : Article
            the Article object returned by the Newspaper3k library
            and modified by the Boilerpipe library
        sNLP : StanfordNLP
            NLP wrapper used to extract quotes from the article text
        """
        # some useful private properties
        self.__article = newspaper_article
        self.__fulfilled = False
        self.__sNLP = sNLP

        # news provenance
        self.url = newspaper_article.canonical_link
        self.title = newspaper_article.title
        self.authors = []
        self.publisher = newspaper_article.source_url
        self.publish_date = ''
        self.text = newspaper_article.text
        self.quotes = []
        self.links = {'articles': [], 'unsure': []}
        self.key_words = newspaper_article.keywords

        self.find_all_provenance()

    def find_authors(self):
        print('find author')
        regex = '((For Mailonline)|(.*(Washington Times|Diplomat|Bbc|Abc|Reporter|Correspondent|Editor|Elections|Analyst|Min Read).*))'
        # filter out unexpected words and strip the 'For Mailonline' suffix from Daily Mail author names
        authors_name_segments = [x.replace(' For Mailonline', '') for x in self.__article.authors if not re.match(regex, x)]
        # concatenate 'Jr' to the previous segment to get the full name
        pos = len(authors_name_segments) - 1
        authors_name = []
        while pos >= 0:
            if re.match('(Jr|jr)', authors_name_segments[pos]):
                full_name = authors_name_segments[pos - 1] + ', ' + authors_name_segments[pos]
                authors_name.append(full_name)
                pos -= 2
            else:
                authors_name.append(authors_name_segments[pos])
                pos -= 1
        self.authors = [Author(x, None) for x in authors_name]
        print('find author done')
        return self.authors

    def find_publish_date(self):
        if self.__article.publish_date:
            self.publish_date = self.__article.publish_date.strftime(
                "%Y-%m-%d")
        else:
            self.publish_date = ''
        return self.publish_date

    def find_quotes(self):
        print('find quotes')
        # self.quotes is a list of quote bundles:
        # [text, speaker (if known, blank otherwise), number of words in quote, bigger than one sentence?]
        try:
            self.quotes = self.__sNLP.annotate(self.text)
        except Exception as e:
            print('err in find_quotes, ', e)
            self.quotes = []
        print('find quotes done')

    def find_links(self):
        """
        Find all <a> tags and extract the urls from their href fields,
        then categorize the urls before returning.
        The result does not include the self-reference link.
        """
        print('find links')
        parsed_uri = urlparse(self.url)
        domain_name = parsed_uri.scheme + "://" + parsed_uri.netloc
        html_new3k = self.__article.article_html
        soup_a = BeautifulSoup(html_new3k, features="lxml")
        a_tags_news3k = soup_a.find_all("a", href=True)
        a_tags_all = a_tags_news3k
        print('Newspaper a tag length is : ', len(a_tags_news3k))
        links = {'articles': [], 'gov_pgs': [], 'unsure': []}
        if len(a_tags_all):
            url_utils = UrlUtils()
            # list of all a-tags in article_html, with the domain name added when needed
            a_tags_proc = [url_utils.add_domain(a_tag, domain_name) for a_tag in a_tags_all]
            # filter out author page urls, and store them in their respective Author objects
            a_tags_no_author = [a_tag for a_tag in a_tags_proc if not url_utils.is_profile(self.authors, a_tag)]
            # return_url(a_tag): a_tag is sometimes a string
            urls_no_dup = list(set([url_utils.return_url(a_tag) for a_tag in a_tags_no_author]))
            print(urls_no_dup)
            # should consider switching the order of the 'unsure' and 'articles' checks
            for url in urls_no_dup:
                url = url_utils.return_actual_url(url)
                if not url_utils.is_valid_url(url):
                    continue
                elif url_utils.is_gov_page(url):
                    links['gov_pgs'].append(url)
                elif url_utils.is_news_article(url):
                    links['articles'].append(url)
                elif url_utils.is_reference(url):
                    links['unsure'].append(url)
                else:
                    print('Not identified URL ', url)
        print('gov_pgs : ', links['gov_pgs'])
        print('articles : ', links['articles'])
        print('unsure : ', links['unsure'])
        print('find links done')
        self.links = links

    def find_all_provenance(self):
        if not self.__fulfilled:
            self.find_authors()
            self.find_publish_date()
            self.find_quotes()
            self.find_links()
            self.__fulfilled = True

    def jsonify(self):
        """Return a dictionary containing only the article provenance."""
        authors_dicts = [x.jsonify() for x in self.authors]
        return {
            'url': self.url,
            'title': self.title,
            'authors': authors_dicts,
            'publisher': self.publisher,
            'publish_date': self.publish_date,
            'text': self.text,
            'quotes': self.quotes,
            'links': self.links,
            'key_words': self.key_words
        }

    @staticmethod
    def build_news_article_from_url(source_url, sNLP):
        """Build a NewsArticle object from a source url; return None if the build fails."""
        try:
            print('start to scrape from url: ', source_url)
            # pre-process the news page with the Newspaper3k and Boilerpipe libraries
            article = Article(source_url, keep_article_html=True)
            article.build()
            article.nlp()
            e = Extractor(extractor='DefaultExtractor', html=article.html)
            article.text = e.getText()
            article.article_html = e.getHTML()
            news_article = NewsArticle(article, sNLP)
            print('successfully scraped from url: ', source_url)
            return news_article
        except Exception as e:
            print('failed to scrape from url: ', source_url)
            print('reason:', e)
            return None
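
# A minimal sketch of using NewsArticle programmatically (the url is a
# placeholder; a working StanfordNLP wrapper is assumed):
#
#   sNLP = StanfordNLP()
#   article = NewsArticle.build_news_article_from_url('https://example.com/story', sNLP)
#   if article:
#       provenance = article.jsonify()  # dict of url, title, authors, quotes, links, ...
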

class Scraper(object):
    """
    Scraper builds a list of NewsArticle objects starting from a source url.
    To scrape from multiple source urls, instantiate a separate Scraper per url.
    """

    def __init__(self):
        self.sNLP = StanfordNLP()
        self.visited = []
        self.success = []
        self.failed = []

    def closeNLP(self):
        self.sNLP.close()

    def scrape_news(self, url, depth=0):
        """
        Scrape a news article from url.
        If depth is greater than 0, also scrape related urls that are not
        blacklisted and have not been visited yet.
        """
        def generate_child_url_list(parent_articles_list):
            """
            Generate the next level of child urls from the previous level's
            articles, filtering out urls that have already been visited.
            """
            url_list = [url for article in parent_articles_list for url in article.links['articles']]
            url_list_unvisited = [url for url in url_list if url not in self.visited]
            return url_list_unvisited

        news_article_list = []
        news_article = NewsArticle.build_news_article_from_url(url, self.sNLP)
        if not news_article:
            self.failed.append(url)
            return news_article_list
        news_article_list.append(news_article)
        self.visited.append(url)
        self.success.append(url)

        # Steps for scraping the links found in each article.
        # parent_articles_list holds only the current level; from it, the url
        # list for the next level is generated.
        parent_articles_list = [news_article]
        while depth > 0:
            child_url_list = generate_child_url_list(parent_articles_list)
            child_articles_list = self.scrape_news_list(child_url_list)
            news_article_list += child_articles_list
            parent_articles_list = child_articles_list
            self.visited += child_url_list
            depth -= 1
        return news_article_list
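
    # Depth semantics of scrape_news, illustrated (the url is a placeholder):
    #   scrape_news('https://example.com/story', depth=0) returns only the source article
    #   scrape_news('https://example.com/story', depth=1) also scrapes every url in the
    #   source article's links['articles'] that has not been visited yet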

    def scrape_news_list(self, url_list):
        """Scrape news articles from a list of urls."""
        news_article_list = []
        for url in url_list:
            article = NewsArticle.build_news_article_from_url(url, self.sNLP)
            if article:
                news_article_list.append(article)
                self.success.append(url)
            else:
                self.failed.append(url)
        return news_article_list

def hash_url(url):
    """Return the md5 hex digest of a url, used for default output file names."""
    md5Hash = hashlib.md5()
    md5Hash.update(url.encode())
    return md5Hash.hexdigest()

def handle_one_url(scraper, url, depth, output=None):
    # scrape from url
    print('starting scraping from source url: %s, with depth %d' % (url, depth))
    news_article_list = scraper.scrape_news(url, depth)
    if not news_article_list:
        print('failed scraping from source url: ', url)
        return False
    print('finished scraping urls from source: ', url)
    print('total scraped %d pages:' % (len(scraper.visited)))
    print('total successful %d pages:' % (len(scraper.success)))
    print('success url list :')
    print(*scraper.success, sep='\n')
    if scraper.failed:
        print('failed url list :')
        print(*scraper.failed, sep='\n')

    # build the list of provenance dicts
    output_json_list = []
    for news_article in news_article_list:
        output_json_list.append(news_article.jsonify())

    # write the result to file
    if output is None:
        if not os.path.exists('news_json'):
            os.makedirs('news_json')
        url_hash = hash_url(url)
        output = 'news_json/' + str(url_hash) + '.json'
    # utf-8 encoding since the dump uses ensure_ascii=False
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(output_json_list, f, ensure_ascii=False, indent=4)
    print('write scraping result to ', output)
    return True
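
# The file written by handle_one_url is a JSON array with one object per scraped
# article; each object is the NewsArticle.jsonify() dict (url, title, authors,
# publisher, publish_date, text, quotes, links, key_words).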

def handle_url_list_file(scraper, file_name, depth):
    with open(file_name, 'r') as f:
        csv_reader = csv.DictReader(f)
        line_count = 0
        fail_list = []
        idx = 0
        url_utils = UrlUtils()
        for row in csv_reader:
            # skip rows up to index 1821 (hard-coded)
            if idx <= 1821:
                print(idx)
                idx += 1
                continue
            if line_count == 0:
                header = list(row.keys())
            url = row['url']
            print('url in the dataset ', url)
            print('index is ', idx)
            label = row['label']
            if not url_utils.is_news_article(url) or url_utils.is_gov_page(url) or not url_utils.is_valid_url(url):
                idx += 1
                continue
            rsp = handle_one_url(scraper, url, depth, 'scraped_article' + str(idx) + '_' + label + '.json')
            if not rsp:
                fail_list.append(row)
            line_count += 1
            idx += 1
    if fail_list:
        with open('fail_list.csv', 'w') as fail_csv:
            writer = csv.DictWriter(fail_csv, fieldnames=header)
            writer.writeheader()
            for row in fail_list:
                writer.writerow(row)
    print('finished processing urls in ', file_name)
    print('success on %d urls, failed on %d urls ' % (line_count - len(fail_list), len(fail_list)))
    print('any failed case can be found in fail_list.csv')

def main():
    parser = argparse.ArgumentParser(description='scrape news articles from the web and store the result in a file')
    parser.add_argument('-u', '--url', dest='url', help='source news article web url')
    parser.add_argument('-f', '--file', dest='file', help='file containing a list of urls')
    parser.add_argument('-d', '--depth', type=int, dest='depth', default=2, help='the depth to which related articles are scraped, default is 2')
    parser.add_argument('-o', '--output', dest='output',
                        help='specify the output file for the url fed by the -u option')
    args = parser.parse_args()
    if args.depth < 0:
        print('scraping depth must be greater than or equal to 0')
        return
    if not args.url and not args.file:
        print('must provide at least one url or a file containing urls')
        return
    scraper = Scraper()
    if args.url:
        handle_one_url(scraper, args.url, args.depth, args.output)
    if args.file:
        handle_url_list_file(scraper, args.file, args.depth)
    scraper.closeNLP()


if __name__ == '__main__':
    main()