-
Notifications
You must be signed in to change notification settings - Fork 35
/
extract-random-sample.py
69 lines (57 loc) · 1.83 KB
/
extract-random-sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
'''
extract-random-sample.py
Takes a random sample of tweets from one or more large files of tweets
in JSON format (one tweet per line)
@p_barbera
Usage:
### extract random sample of 5 percent or 25000 tweets
python extract-random-sample.py -f 'tweets1.json' 'tweets2.json' -o 'sample.json' \
-p 0.05 -k 25000
Returns at most the lower of: p (proportion of tweets) or k (number of tweets)
'''
import sys
import json
import argparse
import random
# arguments
parser = argparse.ArgumentParser()

# Input: one or more files of tweets.
parser.add_argument(
    '-f', '--file',
    nargs='+',
    required=True,
    help='names of files with tweets in json format',
)

# Output: single file that receives the sampled tweets.
parser.add_argument(
    '-o', '--output',
    required=True,
    help='name of file where sample of tweets will be stored',
)

# Sampling controls: a proportion and an absolute cap, both mandatory.
parser.add_argument(
    '-p',
    type=float,
    required=True,
    help='proportion of tweets to sample',
)
parser.add_argument(
    '-k',
    type=float,
    required=True,
    help='number of tweets to sample',
)

args = parser.parse_args()
# function to extract random sample from files
def parse_files(filenames, p, k):
    """Draw a random sample of tweets from line-delimited JSON files.

    Each line of each file is selected with probability ``p``. A selected
    line is kept only if it decodes to a JSON object containing a 'text'
    key; lines that fail to decode are reported and skipped. The kept
    tweets are shuffled and truncated to the smaller of ``k`` and
    ``int(p * number_of_lines_read)``.

    Args:
        filenames: iterable of paths, each holding one JSON tweet per line.
        p: probability with which every line is selected.
        k: upper bound on the number of tweets returned (truncated to int).

    Returns:
        A list of decoded tweets (dicts) in random order.
    """
    lines_read = 0
    tweets = []
    for filename in filenames:
        print(filename)
        # The context manager closes every input file, even on error.
        with open(filename, 'r') as f:
            for line in f:
                lines_read += 1
                if lines_read % 100000 == 0:
                    print(str(lines_read) + ' tweets processed')
                if not random.random() < p:
                    continue
                try:
                    t = json.loads(line)
                except ValueError:
                    # Malformed JSON (JSONDecodeError is a ValueError).
                    print('Error parsing json')
                    continue
                # Only JSON objects can be tweets; a bare string such as
                # "text" must not pass the membership test below.
                if isinstance(t, dict) and 'text' in t:
                    tweets.append(t)
    # Never return more than the expected sample size for proportion p.
    expected = int(p * lines_read)
    if expected < k:
        k = expected
    random.shuffle(tweets)
    return tweets[:int(k)]
# subsetting tweets
tweets = parse_files(args.file, args.p, args.k)
print(str(len(tweets)) + ' tweets extracted')

# Write one JSON document per line. The context manager flushes and closes
# the output file even if serialising or writing a tweet raises.
with open(args.output, 'w') as out:
    for tweet in tweets:
        out.write(json.dumps(tweet) + '\n')