-
Notifications
You must be signed in to change notification settings - Fork 0
/
pheme_reader.py
154 lines (136 loc) · 6.42 KB
/
pheme_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import json
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import http.client
# --- Module-level configuration and setup ---------------------------------
# Root of the PHEME rumour dataset: one sub-directory per event, each holding
# rumour / non-rumour tweet threads.
dataset_directory = "/home/cluster/PycharmProjects/data"
# macOS Finder metadata file that pollutes directory listings.
DS_STORE = '.DS_Store'
directories = os.listdir(dataset_directory)
# Drop non-event entries. Guarded: a plain .remove() raises ValueError when
# the entry is absent, which made importing this module fragile.
for _non_event in ('README', DS_STORE):
    if _non_event in directories:
        directories.remove(_non_event)
es = Elasticsearch(hosts="localhost:9200")
# Raise http.client's header cap — presumably the default limit rejected
# large Elasticsearch responses; TODO confirm this is still needed.
http.client._MAXHEADERS = 5000
def read_data_to_dir(base_dir=None, event_names=None):
    """Read the PHEME dataset into a nested dict.

    Parameters
    ----------
    base_dir : str, optional
        Root directory of the dataset. Defaults to the module-level
        ``dataset_directory``.
    event_names : list[str], optional
        Event sub-directories to read. Defaults to the module-level
        ``directories`` listing.

    Returns
    -------
    dict
        ``{event: {'rumor_tweets': [...], 'non_rumor_tweets': [...]}}`` where
        each item is ``{'source_tweet': <json>, 'reactions': [<json>, ...]}``.
    """
    if base_dir is None:
        base_dir = dataset_directory
    if event_names is None:
        event_names = directories
    data = {}
    print("Reading Data Started!!")
    for event_name in event_names:
        print("Event: " + str(event_name))
        event_dir_path = os.path.join(base_dir, event_name)
        print("event path: " + str(event_dir_path))
        # No os.chdir here: the original permanently changed the process CWD,
        # a side effect callers never asked for. Absolute paths are used instead.
        clean_dirs = [d for d in os.listdir(event_dir_path) if d != '.DS_Store']
        # os.listdir order is arbitrary, so indexing clean_dirs[0]/[1] (as the
        # original did) could silently swap the two categories. Identify the
        # non-rumour directory by name (PHEME uses 'rumours'/'non-rumours');
        # fall back to the original positional order if the convention fails.
        non_rumor_dir = next(
            (d for d in clean_dirs if d.lower().startswith('non')),
            clean_dirs[1],
        )
        rumor_dir = next(d for d in clean_dirs if d != non_rumor_dir)
        data[event_name] = {
            'rumor_tweets': _read_threads(os.path.join(event_dir_path, rumor_dir)),
            'non_rumor_tweets': _read_threads(os.path.join(event_dir_path, non_rumor_dir)),
        }
    print("Reading Data Done!!")
    return data


def _read_threads(category_path):
    """Read every tweet thread under one rumour/non-rumour category directory.

    Each thread directory holds a single JSON file under 'source-tweet' and
    any number of reaction JSON files under 'reactions'. '.DS_Store' entries
    are skipped everywhere (json.load would otherwise crash on them).
    """
    threads = []
    for thread_dir in os.listdir(category_path):
        if thread_dir == '.DS_Store':
            continue
        thread_path = os.path.join(category_path, thread_dir)
        source_dir = os.path.join(thread_path, 'source-tweet')
        source_name = next(
            n for n in os.listdir(source_dir) if n != '.DS_Store'
        )
        with open(os.path.join(source_dir, source_name)) as fh:
            source_tweet = json.load(fh)
        reactions = []
        reactions_dir = os.path.join(thread_path, 'reactions')
        for reaction_name in os.listdir(reactions_dir):
            if reaction_name == '.DS_Store':
                continue
            with open(os.path.join(reactions_dir, reaction_name)) as fh:
                reactions.append(json.load(fh))
        threads.append({'source_tweet': source_tweet, 'reactions': reactions})
    return threads
def save_to_txt():
    """Read the PHEME dataset and bulk-index every tweet into Elasticsearch.

    NOTE(review): despite its name, this function does not write a text
    file; it indexes documents into the 'twitter' index. The name is kept
    unchanged for backward compatibility with existing callers.

    Every source tweet and every reaction is tagged with its event name and
    a ``rumor`` flag (1 = rumour, 0 = non-rumour); reactions additionally
    get the id of their source tweet. Documents are sent to Elasticsearch
    in one bulk request per event, printing the per-event success count.
    """
    pheme_data = read_data_to_dir()
    # BUG FIX: the original popped names off the END of `name_list` while
    # iterating events in FORWARD order, so every event was labelled with
    # the wrong name (charliehebdo data was tagged "sydneysiege", etc.).
    # Iterating the names directly pairs each event with its own label.
    event_names = [
        "charliehebdo",
        "ferguson",
        "germanwings-crash",
        "ottawashooting",
        "sydneysiege",
    ]
    for event_name in event_names:
        event = pheme_data[event_name]
        actions = []
        actions.extend(_serialize_threads(event["rumor_tweets"], event_name, 1))
        actions.extend(_serialize_threads(event["non_rumor_tweets"], event_name, 0))
        success, _ = bulk(es, actions, index="twitter", doc_type="doc",
                          raise_on_error=True)
        print("Successful Doc: " + str(success))


def _serialize_threads(threads, event_name, rumor_flag):
    """Yield one JSON document per source tweet and per reaction in *threads*.

    Mutates the tweet dicts in place (adds ``event_name``, ``rumor``, and
    for reactions ``source_tweet_id``), matching the original behaviour.
    """
    for thread in threads:
        source_tweet = thread["source_tweet"]
        source_tweet["event_name"] = event_name
        source_tweet["rumor"] = rumor_flag
        yield json.dumps(source_tweet)
        for reaction in thread["reactions"]:
            reaction["event_name"] = event_name
            reaction["source_tweet_id"] = source_tweet["id_str"]
            reaction["rumor"] = rumor_flag
            yield json.dumps(reaction)
save_to_txt()