#!/usr/bin/python
# real_time_vis.py
# Saito 2015
"""This grabs tweets and visualizes them in real time using params.txt.
You can get the tweets using the streaming API or the REST API. The
rest API requires 5 second pauses between successive calls to the
twitter server. This is the default. Use the --stream or -s flag to
enable the streaming API. The Streaming API gets all tweets that are
geotagged within the bounding box. The geolocation is approximately
converted, by inscribing a bounding box square in the circle around
the geocoordinates. The tweets are also saved in JSON form to
a file called 'tweets.json'.
USAGE:
$ python real_time_vis.py [-h][-d][-f FILENAME][-n NUMBER][-s][-a ADDRESS]
OR for help, try:
$ ./real_time_vis.py -h
OR:
$ python real_time_vis.py
Example using default parameter file 'params.txt', with 20 top words
to display, on a growing chart:
$ ./real_time_vis --number 20
Or using the streaming API with an address:
$ ./real_time_vis -n 20 -s -a "175 5th Avenue NYC"
TO EXIT:
To exit one of these multithreaded programs, use a keyboard interrupt
like CTRL+C.
"""
from __future__ import division

import Queue
import argparse
import sys

import matplotlib.pyplot as plt

import geo_converter
import geosearchclass
import streamer
import utils

global stream  # so that CTRL + C kills stream
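
# The module docstring says the geolocation is converted approximately
# by inscribing a square bounding box in the circle around the
# geocoordinates. A minimal sketch of that idea follows; this is an
# illustrative helper only, not geo_converter's actual implementation
# (its internals are not shown in this file).
import math


def _example_inscribed_bounding_box(lat, lon, radius_km):
    # A square inscribed in a circle of radius r has half-side r/sqrt(2).
    half_side_km = radius_km / math.sqrt(2)
    dlat = half_side_km / 111.0  # roughly 111 km per degree of latitude
    dlon = dlat / math.cos(math.radians(lat))  # longitude degrees shrink with latitude
    return [lon - dlon, lat - dlat, lon + dlon, lat + dlat]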


def update_fdist(fdist, new_words):
    """Adds each word in new_words to the frequency distribution fdist."""
    for word in new_words:
        if word in fdist:
            fdist[word] += 1
        else:
            fdist[word] = 1
    return fdist
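
# For example, update_fdist({'hello': 2}, ['hello', 'world']) returns
# {'hello': 3, 'world': 1}. Note that it mutates and returns the same
# mapping, so it works with any dict-like counter.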


def remove_infrequent_words(samples, fdist):
    """Keeps only those samples whose count in fdist is greater than 2."""
    trimmed_samples = []
    for item in samples:
        if fdist[item] > 2:
            trimmed_samples.append(item)
    return trimmed_samples
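
# For example, remove_infrequent_words(['rt', 'pizza'],
# {'rt': 5, 'pizza': 1}) returns ['rt']: only words seen more than
# twice survive the trim.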


def updating_plot(geosearchclass, number_of_words, grow=True):
    """Uses the REST API to draw, and periodically update, a word
    frequency plot of the search results.
    """
    search_results = geosearchclass.search()
    filtered_words = utils.tokenize_and_filter(search_results)
    fdist = utils.get_freq_dist(filtered_words)

    # set up plot
    samples = [item for item, _ in fdist.most_common(number_of_words)]
    freqs = [fdist[sample] for sample in samples]
    plt.grid(True, color="silver")
    plt.plot(freqs, range(len(freqs)))
    plt.yticks(range(len(samples)), [s for s in samples])
    plt.ylabel("Samples")
    plt.xlabel("Counts")
    plt.title("Top Words Frequency Distribution")
    plt.ion()
    plt.show()

    # set up loop
    old_ids = set([s.id for s in search_results])
    for i in xrange(100):
        plt.pause(5)
        # use mixed above, change to recent here
        geosearchclass.result_type = "recent"
        # perturbation study
        # if i % 2:  # for testing purposes
        #     # change location every odd time to nyc
        #     # geosearchclass.latitude = 40.734073
        #     # geosearchclass.longitude = -73.990663
        #     # perturb latitude
        #     geosearchclass.latitude = geosearchclass.latitude + .001
        # else:
        #     # now back to sf
        #     # geosearchclass.latitude = 37.7821
        #     # geosearchclass.longitude = -122.4093
        #     geosearchclass.longitude = geosearchclass.longitude + .001
        search_results = geosearchclass.search()
        new_search_results = utils.new_tweets(search_results, old_ids)
        if new_search_results:
            filtered_words = utils.tokenize_and_filter(new_search_results)
            fdist = update_fdist(fdist, filtered_words)
            if grow:
                newsamples = [item for item, _
                              in fdist.most_common(number_of_words)]
                s1 = set(newsamples)
                s2 = set(samples)
                s1.difference_update(s2)
                if s1:
                    print "New words: " + str(list(s1))
                    newsamples = list(s1)
                    samples.extend(newsamples)
                    plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            if grow:
                plt.draw()
            print '%d new tweet(s)' % len(new_search_results)
            old_ids.update(set([s.id for s in new_search_results]))
        else:
            print "no updates"

# g = geosearchclass.GeoSearchClass()
# g.set_params_from_file('params.txt')
# search_results = g.search()


def updating_stream_plot(q, number_of_words=30):
    """This plot uses the streaming API to get real time twitter
    information from a given region, determined by a geo-coordinate
    bounding box. The upper left and lower right corners determine
    the bounding box.

    q is a queue instance, which holds tweets.

    number_of_words determines the average number of words in the
    plot. Once the plot reaches 2 x number_of_words, it is shrunk down
    to the new set of words and starts growing again.

    To exit the program early, hit CTRL + Z to stop the python script
    and then CTRL + D twice to kill the terminal process and close the
    window.
    """
    setup = False
    fdist = None
    samples = []
    draw_time = 0.1
    plt.ion()
    plt.grid(True, color="silver")
    for i in range(100000):
        status = q.get()
        search_results = [status]
        while not q.empty():
            print "getting another tweet"
            status = q.get()
            search_results.append(status)
        if not setup:
            print "Gathering enough data to begin plotting"
            while len(samples) < 1:
                status = q.get()
                search_results.append(status)
                filtered_words = utils.tokenize_and_filter(search_results)
                if fdist is None:
                    fdist = utils.get_freq_dist(filtered_words)
                else:
                    fdist = update_fdist(fdist, filtered_words)
                n_words = min(10, len(fdist))
                samples = [item for item, _ in fdist.most_common(n_words)]
                # print "len(samples) = {}".format(len(samples))
                samples = remove_infrequent_words(samples, fdist)
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.yticks(range(len(samples)), [s for s in samples])
            plt.ylabel("Samples")
            plt.xlabel("Counts")
            plt.title("Top Words Frequency Distribution")
            plt.show()
            plt.pause(draw_time)
            setup = True
        else:
            filtered_words = utils.tokenize_and_filter(search_results)
            fdist = update_fdist(fdist, filtered_words)
            newsamples = [item for item, _
                          in fdist.most_common(number_of_words)]
            newsamples = remove_infrequent_words(newsamples, fdist)
            s1 = set(newsamples)
            s2 = set(samples)
            s1.difference_update(s2)
            if s1:
                print "New words: " + str(list(s1))
                newsamples = list(s1)
                samples.extend(newsamples)
            if len(samples) > 2 * number_of_words:
                samples = newsamples
                plt.close()
            plt.yticks(range(len(samples)), [s for s in samples])
            freqs = [fdist[sample] for sample in samples]
            plt.plot(freqs, range(len(freqs)))
            plt.draw()
            plt.pause(draw_time)
    kill_plot()
    return
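
# A minimal way to exercise updating_stream_plot without a live
# Twitter stream, assuming utils.tokenize_and_filter only reads each
# status's text and id attributes (an assumption about utils, not
# confirmed in this file):
#
#     import collections
#     FakeStatus = collections.namedtuple('FakeStatus', ['text', 'id'])
#     q = Queue.Queue()
#     for i in range(10):
#         q.put(FakeStatus(text='hello world hello', id=i))
#     updating_stream_plot(q, number_of_words=10)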


def kill_plot():
    print "turning interactive off"
    plt.ioff()
    print "closing plot"
    plt.close()
    return


def get_parser():
    """Creates a command line parser.

    --doc -d
    --help -h
    --filename -f
    --address -a
    --number -n
    --stream -s
    """
    parser = argparse.ArgumentParser(
        description='Create an updating word frequency distribution chart.')
    parser.add_argument('-d',
                        '--doc',
                        action='store_true',
                        help='print module documentation and exit')
    parser.add_argument(
        '-f',
        '--filename',
        help='''specify a FILENAME to use as the parameter file.
        If not specified, will use 'params.txt'.''')
    parser.add_argument(
        '-a',
        '--address',
        help='give an ADDRESS to get geocoordinates for.')
    # parser.add_argument('-r',
    #                     '--rest',
    #                     action='store_true',
    #                     help='Use the REST API to create a growing chart \
    #                     as new words arrive.')
    parser.add_argument('-n',
                        '--number',
                        help='specify NUMBER of words to display. The \
                        streaming plot will grow to twice this number \
                        before shrinking again')
    parser.add_argument('-s',
                        '--stream',
                        action='store_true',
                        help='Use the streaming API to update a growing plot. \
                        Use an interrupt signal, like CTRL + C, to exit. \
                        This uses the LOCATION and SEARCH_TERM from the \
                        parameter file. The tweets are saved to tweets.json')
    return parser
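
# For example, get_parser().parse_args(['-n', '20', '-s']) yields a
# namespace with number='20' and stream=True; note that --number
# arrives as a string and is converted to an int in main().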


def main():
    parser = get_parser()
    args = parser.parse_args()
    if args.doc:
        print __doc__
        sys.exit(0)
    if args.number:
        number = int(args.number)
    else:
        number = 30
    g = geosearchclass.GeoSearchClass()
    if args.filename:
        print 'Using parameters from ' + str(args.filename)
        g.set_params_from_file(args.filename)
    else:
        print "Using search values from params.txt"
        g.set_params_from_file('params.txt')
    if args.address:
        print "Finding geocoordinates for address:\n{}".format(args.address)
        coords = geo_converter.get_geocoords_from_address(args.address)
        if coords:
            g.latitude = coords[0]
            print "Found this latitude:"
            print g.latitude
            g.longitude = coords[1]
            print "Found this longitude:"
            print g.longitude
        else:
            print "Failed to find coordinates. Exiting."
            sys.exit()
    if args.stream:
        print "using streaming queue"
        q = Queue.Queue()
        bounding_box = geo_converter.get_bounding_box_from(g)
        search_terms = geo_converter.get_search_terms_from(g)
        print "bounding_box = {}".format(bounding_box)
        print "search_terms = {}".format(search_terms)
        global stream
        fn = 'tweets.json'
        stream = streamer.start_stream(q, bounding_box, fn, search_terms)
        updating_stream_plot(q, number)
    else:
        print "using REST API updating plot"
        updating_plot(g, number, True)  # set grow flag to True


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print "Main function interrupted"
        if "stream" in globals():
            streamer.kill_stream(stream)
        kill_plot()
        sys.exit()