FrequencySummarizer-Python2.py
# coding: utf-8
# In[6]:
######################################################################################
# This example is pretty much entirely based on this excellent blog post
# http://glowingpython.blogspot.in/2014/09/text-summarization-with-nltk.html
# Thanks to TheGlowingPython, the good soul that wrote this excellent article!
# That blog is really interesting, btw.
######################################################################################
######################################################################################
# nltk - "natural language toolkit" is a python library with support for
# natural language processing. Super-handy.
# Specifically, we will use the following from nltk:
# sent_tokenize: given a group of text, tokenize (split) it into sentences
# word_tokenize: given a group of text, tokenize (split) it into words
# stopwords.words('english') to find and ignore very common words ('I', 'the', ...)
# to use stopwords, you need to have run nltk.download() first - one-off setup
######################################################################################
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
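# A minimal illustrative sketch of these three (assumes the 'punkt' and 'stopwords' data
# have already been fetched via nltk.download()); the outputs shown are indicative:
#   sent_tokenize("I like Python. It is fun.")  -> ['I like Python.', 'It is fun.']
#   word_tokenize("It is fun.")                 -> ['It', 'is', 'fun', '.']
#   'the' in stopwords.words('english')         -> True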
######################################################################################
# We have used dictionaries so far, but now that we have covered classes - this is a good
# time to introduce defaultdict. This is a class that inherits from dictionary, but has
# one additional nice feature: usually, a Python dictionary throws a KeyError if you try
# to get an item with a key that is not currently in the dictionary.
# The defaultdict, in contrast, will simply create any items that you try to access
# (provided of course they do not exist yet). To create such a "default" item, it relies
# on a function that is passed in... more below.
######################################################################################
from collections import defaultdict
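# A quick illustrative sketch of the difference:
#   plain = {}
#   plain['python']            # raises KeyError - 'python' was never added
#   counts = defaultdict(int)  # int() returns 0, so missing keys default to 0
#   counts['python'] += 1      # no KeyError: the key is created with value 0, then incremented
#   counts['python']           # -> 1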
######################################################################################
# punctuation to ignore punctuation symbols
######################################################################################
from string import punctuation
######################################################################################
# heapq.nlargest is a function that, given a collection, easily and quickly returns
# the 'n' largest elements in it. More below
######################################################################################
from heapq import nlargest
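# A small illustrative sketch (made-up numbers), including the 'key' argument we will
# use later to rank the keys of a dictionary by their values:
#   nlargest(2, [5, 1, 9, 3])              -> [9, 5]
#   scores = {'a': 0.2, 'b': 1.5, 'c': 0.7}
#   nlargest(2, scores, key=scores.get)    -> ['b', 'c']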
######################################################################################
# Our first class, named FrequencySummarizer
######################################################################################
class FrequencySummarizer:
# indentation changes - we are now inside the class definition
def __init__(self, min_cut=0.1, max_cut=0.9):
# The constructor named __init__
# This function will be called each time an object of this class is
# instantiated
# btw, note how the special keyword 'self' is passed in as the first
# argument to each method (member function).
self._min_cut = min_cut
self._max_cut = max_cut
# Words that have a term frequency lower than min_cut
# or higher than max_cut will be ignored.
self._stopwords = set(stopwords.words('english') + list(punctuation))
# Punctuation symbols and stopwords (common words like 'an','the' etc) are ignored
#
# Here self._min_cut, self._max_cut and self._stopwords are all member variables
# i.e. each object (instance) of this class will have an independent version of these
# variables.
# Note how this function is used to set up the member variables to their appropriate values
# indentation changes - we are out of the constructor (member function), but we are still inside
# the class.
# One important note: if you are used to programming in Java or C#: if you define a variable here
# i.e. outside a member function but inside the class - it becomes a STATIC member variable
# This is an important difference from Java, C# (where all member variables would be defined here)
# and is a common gotcha to be avoided.
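# A tiny illustrative sketch of that gotcha (a hypothetical class, not part of this program):
#   class Example:
#       shared = []            # defined at class level: ONE list shared by every instance
#       def __init__(self):
#           self.mine = []     # defined on self: a fresh list per instance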
def _compute_frequencies(self, word_sent):
# next method (member function) which takes in self (the special keyword for this same object)
# as well as a list of sentences, and outputs a dictionary, where the keys are words, and
# values are the frequencies of those words in the set of sentences
freq = defaultdict(int)
# defaultdict, which we referred to above - is a class that inherits from dictionary,
# with one difference: Usually, a Python dictionary throws a KeyError if you try
# to get an item with a key that is not currently in the dictionary.
# The defaultdict in contrast will simply create any items that you try to access
# (provided of course they do not exist yet). The 'int' passed in as argument tells
# the defaultdict object to create a default value of 0
for s in word_sent:
# indentation changes - we are inside the for loop, for each sentence
for word in s:
# indentation changes again - this is an inner for loop, run once per word
# in that sentence
if word not in self._stopwords:
# if the word is in the member variable (the set) self._stopwords, then ignore it,
# else increment the frequency. Had freq been a regular dictionary (not a
# defaultdict), we would have had to first check whether this word is in the dict
freq[word] += 1
# Done with the frequency calculation - now go through our frequency list and do 2 things:
# normalize the frequencies by dividing each by the highest frequency (this allows us to
# always have frequencies between 0 and 1, which makes comparing them easy), and
# filter out frequencies that are too high or too low. A trick that yields better results.
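# A worked example with made-up counts (and the default min_cut=0.1, max_cut=0.9):
#   raw counts:  {'economy': 10, 'growth': 4, 'said': 10, 'quark': 1}
#   normalized:  {'economy': 1.0, 'growth': 0.4, 'said': 1.0, 'quark': 0.1}
#   'economy' and 'said' (1.0 >= max_cut) and 'quark' (0.1 <= min_cut) get deleted,
#   so only 'growth' survives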
m = float(max(freq.values()))
# get the highest frequency of any word in the list of words
for w in freq.keys():
# indentation changes - we are inside the for loop
freq[w] = freq[w]/m
# divide each frequency by that max value, so it is now between 0 and 1
if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
# indentation changes - we are inside the if statement - if we are here the word is either
# really common or really uncommon. In either case - delete it from our dictionary
del freq[w]
# remember that del can be used to remove a key-value pair from the dictionary
return freq
# return the frequency dictionary
def summarize(self, text, n):
# next method (member function) which takes in self (the special keyword for this same object)
# as well as the raw text, and the number of sentences we wish the summary to contain. Return the
# summary
sents = sent_tokenize(text)
# split the text into sentences
assert n <= len(sents)
# assert is a way of making sure a condition holds true, else an exception is thrown. Used to do
# sanity checks like making sure the summary is shorter than the original article.
word_sent = [word_tokenize(s.lower()) for s in sents]
# This 1 line does a lot: it converts each sentence to lower-case, then
# splits each sentence into words, giving one list of word tokens per sentence
# (i.e. a list of lists, which is exactly what _compute_frequencies expects)
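# Illustrative sketch: for the text "Dogs bark. Cats meow." word_sent would be
#   [['dogs', 'bark', '.'], ['cats', 'meow', '.']]
# i.e. one list of lower-cased word tokens per sentence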
self._freq = self._compute_frequencies(word_sent)
# make a call to the method (member function) _compute_frequencies, and place the result in
# the member variable _freq.
ranking = defaultdict(int)
# create an empty dictionary (of the superior defaultdict variety) to hold the rankings of the
# sentences.
for i,sent in enumerate(word_sent):
# Indentation changes - we are inside the for loop. Oh! and this is a different type of for loop
# A new built-in function, enumerate(), will make certain loops a bit clearer. enumerate(sequence),
# will return (0, thing[0]), (1, thing[1]), (2, thing[2]), and so forth.
# A common idiom to change every element of a list looks like this:
# for i in range(len(L)):
# item = L[i]
# ... compute some result based on item ...
# L[i] = result
# This can be rewritten using enumerate() as:
# for i, item in enumerate(L):
# ... compute some result based on item ...
# L[i] = result
for w in sent:
# for each word in this sentence
if w in self._freq:
# if this is not a stopword (common word), add the frequency of that word
# to the weightage assigned to that sentence
ranking[i] += self._freq[w]
# OK - we are outside the for loop and now have rankings for all the sentences
sents_idx = nlargest(n, ranking, key=ranking.get)
# we want to return the first n sentences with highest ranking, use the nlargest function to do so
# this function needs to know how to get the list of values to rank, so give it a function - simply the
# get method of the dictionary
return [sents[j] for j in sents_idx]
# return these sentences as a list
# Indentation changes - we are done with our FrequencySummarizer class!
######################################################################################
# Now to get a URL and summarize
######################################################################################
import urllib2
from bs4 import BeautifulSoup
######################################################################################
# Introducing Beautiful Soup: " Beautiful Soup is a Python library for pulling data out of
# HTML and XML files. It works with your favorite parser to provide idiomatic ways of
# navigating, searching, and modifying the parse tree. It commonly saves programmers hours
# or days of work."
######################################################################################
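# A minimal illustrative sketch of BeautifulSoup on a toy snippet of HTML:
#   soup = BeautifulSoup("<html><p>Hello</p><p>world</p></html>")
#   soup.find_all('p')                             -> the two <p> tag objects
#   ' '.join(p.text for p in soup.find_all('p'))   -> 'Hello world'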
def get_only_text_washington_post_url(url):
# This function takes in a URL as an argument, and returns only the text of the article in that URL.
page = urllib2.urlopen(url).read().decode('utf8')
# download the URL
soup = BeautifulSoup(page)
# initialise a BeautifulSoup object with the text of that URL
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
# use this code to get everything in that text that lies between a pair of
# <article> and </article> tags. We do this because we know that the URLs we are currently
# interested in - those from the Washington Post - have this nice property
# OK - we got everything between the <article> and </article> tags, but that everything
# includes a bunch of other stuff we don't want
# Now - repeat, but this time we will only take what lies between <p> and </p> tags
# these are HTML tags for "paragraph" i.e. this should give us the actual text of the article
soup2 = BeautifulSoup(text)
if soup2.find_all('p')!=[]:
text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
# use this code to get everything in that text that lies between a pair of
# <p> and </p> tags. We do this because we know that the URLs we are currently
# interested in - those from the WashingtonPost have this nice property
return soup.title.text, text
# Return a pair of values (article title, article body)
# Btw note that BeautifulSoup returns the title without our doing anything special -
# this is why BeautifulSoup works so much better than say regular expressions at parsing HTML
#####################################################################################
# OK! Now let's give this code a spin
#####################################################################################
someUrl = "https://www.washingtonpost.com/news/the-switch/wp/2015/08/06/why-kids-are-meeting-more-strangers-online-than-ever-before/"
# the article we would like to summarize
textOfUrl = get_only_text_washington_post_url(someUrl)
# get the title and text
fs = FrequencySummarizer()
# instantiate our FrequencySummarizer class and get an object of this class
summary = fs.summarize(textOfUrl[1], 3)
# get a summary of this article that is 3 sentences long
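# A small usage sketch (not part of the original walkthrough): print the article title,
# then the summary sentences joined into one string
print textOfUrl[0]
print ' '.join(summary)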
# In[ ]: