-
Notifications
You must be signed in to change notification settings - Fork 25
/
csv2vw.py
46 lines (33 loc) · 965 Bytes
/
csv2vw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
'convert from [stackoverflow-specific] CSV to VW format'
import sys, csv, re
# Label written for rows whose status column is '0' (presumably the
# unlabeled test-set rows -- TODO confirm against the data preparation step).
test_label = '1'
def get_label( status ):
    """Return the 1-based integer class label for a post status string.

    The label is the position of `status` in the fixed status list below,
    counted from 1: 'not a real question' -> 1, 'not constructive' -> 2,
    'off topic' -> 3, 'open' -> 4, 'too localized' -> 5.

    Raises ValueError if `status` is not one of the five known statuses.
    """
    # The order of this tuple defines the label numbering; do not reorder.
    statuses = (
        'not a real question',
        'not constructive',
        'off topic',
        'open',
        'too localized',
    )
    return statuses.index( status ) + 1
# Usage: csv2vw.py <input.csv> <output.vw>
input_file = sys.argv[1]
output_file = sys.argv[2]

# Both files are opened in a `with` block so they are closed (and the output
# flushed) even if a malformed row aborts the conversion part-way through.
# NOTE(review): the output is opened as 'wb' and written with str, which is
# Python 2 behaviour; a Python 3 port would need a text-mode output handle.
with open( input_file ) as f_in, open( output_file, 'wb' ) as o:
    reader = csv.reader( f_in )

    # Count rows from 1 so the progress message reports rows processed.
    for counter, line in enumerate( reader, 1 ):

        # Column layout of the input CSV:
        #   0 post id | 1 status | 2 reputation | 3 good posts | 4 words
        #   5-9 tags (up to five, possibly empty) | 10 body
        post_id = line[0]
        status = line[1]
        reputation = line[2]
        good_posts = line[3]
        words = line[4]
        tags = " ".join( line[5:10] ).strip()
        body = line[10]

        # A status of '0' marks a row without a known status; it gets the
        # placeholder label instead of a class index.
        label = get_label( status ) if status != '0' else test_label

        # One VW example per line:
        #   <label> <weight> <post_id>|n <reputation> <good_posts>|w <words> |t <tags> |b <body>
        # The weight is always 1. post_id is written immediately before the
        # first '|', the position VW's input format uses for the example tag.
        # NOTE(review): fields are written verbatim; a '|', ':' or newline
        # inside words/tags/body would be misread by VW -- presumably the CSV
        # is cleaned upstream, confirm.
        output_line = "%s %s %s|n %s %s|w %s |t %s |b %s\n" % (
            label, 1, post_id, reputation, good_posts, words, tags, body )
        o.write( output_line )

        # Progress indicator for large inputs.
        if counter % 100000 == 0:
            print( counter )