-
Notifications
You must be signed in to change notification settings - Fork 1
/
twitter_to_kafka.py
60 lines (49 loc) · 2.21 KB
/
twitter_to_kafka.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
from kafka import SimpleProducer, KafkaClient
import tweepy
import configparser
# Note: Some of the imports are external Python libraries. They are installed on the current machine.
# If you are running a multi-node cluster, make sure that these libraries
# and the correct version of Python are installed on all the worker nodes.
class TweeterStreamListener(tweepy.StreamListener):
    """Listener that reads the Twitter stream and pushes tweet text to Kafka.

    Each status delivered by the live stream is UTF-8 encoded and sent to the
    ``twitterstream`` topic through an asynchronous ``SimpleProducer``
    connected to the broker at ``localhost:9092``.
    """

    def __init__(self, api):
        """Initialise the base listener and create the Kafka producer.

        Args:
            api: The tweepy API object to associate with this listener.
        """
        # Initialise the actual base class. The previous
        # ``super(tweepy.StreamListener, self)`` form began the MRO lookup
        # *after* StreamListener, so its __init__ was silently skipped.
        super().__init__(api)
        self.api = api
        client = KafkaClient("localhost:9092")
        # ``async`` is a reserved word from Python 3.7 on, so ``async=True``
        # no longer parses; kafka-python exposes the option as ``async_send``.
        # The batch settings are passed through unchanged (per kafka-python:
        # flush after this many messages / this many seconds).
        self.producer = SimpleProducer(
            client,
            async_send=True,
            batch_send_every_n=1000,
            batch_send_every_t=10,
        )

    def on_status(self, status):
        """Forward the text of a newly arrived status to Kafka.

        Called whenever new data arrives from the live stream. The message is
        handed to the asynchronous producer rather than sent inline.

        Args:
            status: The received status; only its ``text`` attribute is used.

        Returns:
            True when the message was handed to the producer, False when the
            producer raised (the error is printed first).
        """
        msg = status.text.encode('utf-8')
        try:
            self.producer.send_messages(b'twitterstream', msg)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and signal
            # it to the caller through the return value instead of raising.
            print(e)
            return False
        return True

    def on_error(self, status_code):
        """Report a stream error and keep the stream running.

        Args:
            status_code: The error code supplied by the stream.

        Returns:
            True, so the stream is not killed.
        """
        # Include the code so the failure can actually be diagnosed.
        print("Error received in kafka producer", status_code)
        return True  # Don't kill the stream

    def on_timeout(self):
        """Handle a stream timeout by keeping the stream alive.

        Returns:
            True, so the stream is not killed.
        """
        return True  # Don't kill the stream
if __name__ == '__main__':
    # Load the Twitter credentials from the 'twitter.txt' INI-style file;
    # all four values are looked up in its DEFAULT section.
    config = configparser.ConfigParser()
    config.read('twitter.txt')
    credentials = config['DEFAULT']
    consumer_key = credentials['consumerKey']
    consumer_secret = credentials['consumerSecret']
    access_key = credentials['accessToken']
    access_secret = credentials['accessTokenSecret']

    # Build the OAuth handler and the API object from those credentials.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # Attach the Kafka-forwarding listener to a new stream.
    listener = TweeterStreamListener(api)
    stream = tweepy.Stream(auth, listener=listener)

    # Filter rules pull all matching traffic in real time. A keyword filter
    # is a possible alternative to the location filter used here:
    #   stream.filter(track = ['love', 'hate'], languages = ['en'])
    bounding_box = [-180, -90, 180, 90]
    stream.filter(locations=bounding_box, languages=['en'])