file_convertor.py
import argparse
import logging
import os

import utils.constants as K
from utils.misc_vocab_loader import VocabLoader
from utils.parallel_utils import run_as_parallel
from utils.vocab_utils import get_idx, get_typeidx

__author__ = 'Shyam'
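
# Expected input format (inferred from the parsing below; treat the column
# names as an educated reading of the code rather than an official spec):
# one mention per line, tab-separated, with columns
#   0: mid    1: wid    2: wikititle
#   3: start_token (0-indexed)    4: end_token (0-indexed)
#   5: surface string    6: space-separated sentence tokens
#   7: space-separated type labels
#   8: optional space-separated coherence words
#   9+: optional docid / dbow ("word:=weight" pairs) fields
# Every string field is rewritten with its integer vocabulary id.
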
def process_line(line, word2idx, type2idx, coh2idx, dotype=False):
    parts = line.strip().split("\t")
    mid, wid, wikititle = parts[0:3]
    start_token = int(parts[3])  # 0-indexed start token; do NOT prepend <s>
    end_token = int(parts[4])  # 0-indexed end token
    surface = parts[5]
    # no start/end sentinel token ids are inserted around the sentence
    words = parts[6].split(" ")
    sent_tokens = [str(get_idx(word, word2idx)) for word in words]
    sent_str = " ".join(sent_tokens)
    if dotype:
        types = [str(get_typeidx(t, type2idx)) for t in parts[7].split(" ")]
        types_str = " ".join(types)
    else:
        types_str = parts[7]
    if len(parts) > 8:
        if parts[8].strip() == "":  # no mention-surface words in coherence
            coherence = [str(K.OOV_ID)]
        else:
            coherence = [str(get_idx(coh, coh2idx)) for coh in parts[8].split(" ")]
    else:
        coherence = [str(K.OOV_ID)]
    coherence_str = " ".join(sorted(set(coherence)))
    # 11-column lines carry a dbow field of "word:=weight" pairs in column 9;
    # 10-column lines carry a docid there instead, which is ignored here
    if len(parts) == 11:
        dbow = [d.split(":=") for d in parts[9].split(" ")]
        dbow = [":=".join([str(get_idx(d[0], word2idx)), d[1]]) for d in dbow]
        dbow_str = " ".join(dbow)
    else:
        dbow_str = " "
    buf = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (mid, wid, wikititle,
                                                        start_token, end_token,
                                                        surface, sent_str,
                                                        types_str, coherence_str,
                                                        dbow_str)
    if end_token > len(sent_tokens):  # was an assertion; log and keep going
        logging.info("Bad line: %s", line)
    return buf
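
# Note: process_file reads word2idx, type2idx, and coh2idx from module-level
# globals bound in the __main__ block below. When handle_file runs inside
# worker processes via run_as_parallel, this assumes those globals are visible
# in each worker (e.g. fork-based spawning); that is an assumption about
# run_as_parallel, not something this file enforces.
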
def process_file(infile, outfile, args):
    logging.info("input %s", infile)
    logging.info("output %s", outfile)
    with open(outfile, "w") as out, open(infile) as inp:
        for line in inp:
            # output columns: mid wid wikititle start_token end_token surface
            # tokenized_sentence types coherence dbow
            buf = process_line(line,
                               word2idx=word2idx,
                               type2idx=type2idx,
                               coh2idx=coh2idx,
                               dotype=not args["notype"])
            out.write(buf)

def handle_file(worker_id, jobs_queue):
    # worker loop: consume (infile, outfile, args) jobs until a falsy
    # sentinel (e.g. None) is queued to signal shutdown
    while True:
        job = jobs_queue.get()
        if job:
            infile, outfile, args = job
            process_file(infile, outfile, args)
        else:
            logging.debug('Quit worker %s', worker_id)
            break

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='convert strings to ids')
    parser.add_argument('--in', type=str, required=True, help='input mention file with strings')
    parser.add_argument('--out', type=str, required=True, help='output mention file with ids')
    parser.add_argument('--vocabpkl', type=str, required=True, help='the *.word2idx.pkl file')
    parser.add_argument('--type_vocab', type=str, default="data/enwiki/fbtypelabels.vocab", help='type vocab')
    parser.add_argument('--notype', action="store_true", help='do not map type labels to ids')
    args = vars(parser.parse_args())
    print(args)
    loader = VocabLoader()
    loader.load_word2idx(word2idx_pkl_path=args["vocabpkl"])
    loader.load_type_vocab(path=args["type_vocab"])
    word2idx = loader.word2idx
    type2idx = loader.type2idx
    coh2idx = None  # no separate coherence vocabulary is loaded
    if os.path.isdir(args["in"]):
        wikipath = args["in"]  # e.g. "data/enwiki/enwiki_mentions_with_es-en_merged/"
        outdir = args["out"]
        jobs = []
        for filename in sorted(os.listdir(wikipath)):
            infile = os.path.join(wikipath, filename)
            outfile = os.path.join(outdir, filename + ".ids")
            jobs.append((infile, outfile, args))  # each job goes to any available worker
        run_as_parallel(jobs_list=jobs, worker_func=handle_file)
    else:
        infile = args["in"]
        outfile = args["out"]
        process_file(infile, outfile, args)
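
# Example invocation (paths are illustrative, not taken from the repo):
#   python file_convertor.py --in data/enwiki/mentions/ \
#       --out data/enwiki/mentions_ids/ \
#       --vocabpkl data/enwiki/vocab.word2idx.pkl
# When --in is a directory, every file under it is converted in parallel and
# written to --out as <filename>.ids; otherwise a single file is converted.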