-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
101 lines (86 loc) · 3.82 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Yizhong
# created_at: 16-11-28 下午2:48
import argparse
import os
from dataset.utils.xmlreader import reader, writer, combine
def join_edus(fedu):
    """Join a one-EDU-per-line ``*.edus`` file into a single-line ``*.text`` file.

    Blank lines are dropped and the remaining stripped lines are joined with
    single spaces, so the output file contains exactly one line of text.

    Parameters
    ----------
    fedu : str
        Path to a ``*.edus`` file; the output path is derived by replacing
        the ``.edus`` suffix with ``.text``.
    """
    ftext = fedu.replace('.edus', '.text')
    # Explicit UTF-8 avoids locale-dependent decoding/encoding of corpus files
    # (the default encoding varies across platforms).
    with open(fedu, 'r', encoding='utf-8') as fin, open(ftext, 'w', encoding='utf-8') as fout:
        lines = [l.strip() for l in fin if l.strip()]
        fout.write(' '.join(lines))
def extract(fxml):
    """Convert a CoreNLP ``*.text.xml`` parse into a ``*.conll`` file.

    Reads sentences and constituency parses via :func:`reader`, combines
    them with :func:`combine`, and writes the result next to the input
    with the ``.text.xml`` suffix replaced by ``.conll``.
    """
    sentences, constituents = reader(fxml)
    combined = combine(sentences, constituents)
    conll_path = fxml.replace(".text.xml", ".conll")
    writer(combined, conll_path)
def merge(fxml):
    """Merge the CoNLL parse, EDU segmentation, and raw paragraphs of one
    document into a single ``*.merge`` file.

    Sibling file names are derived from ``fxml`` (a ``*.text.xml`` path):
    the ``.conll`` parse, the ``.edus`` segmentation, the paragraph file
    (same stem, no suffix), and the ``.merge`` output.  Every non-blank
    CoNLL token line is rewritten with its 1-based EDU index and paragraph
    index appended as two extra tab-separated columns.

    NOTE(review): assumes the last whitespace-separated field of each CoNLL
    line is the token's end character offset in the space-joined EDU text —
    confirm against the format produced by writer().
    """
    fconll = fxml.replace('.text.xml', '.conll')
    fedu = fxml.replace('.text.xml', '.edus')
    fmerge = fxml.replace('.text.xml', '.merge')
    fpara = fxml.replace('.text.xml', '')
    with open(fconll, 'r') as fin1, open(fedu, 'r') as fin2, open(fpara, 'r') as fin3, open(fmerge, 'w') as fout:
        # One EDU per non-blank line of the .edus file.
        edus = [l.strip() for l in fin2 if l.strip()]
        # Rebuild paragraphs: blank lines separate paragraphs; lines inside a
        # paragraph are joined with single spaces.
        paras = []
        para_cache = ''
        for line in fin3:
            if line.strip():
                para_cache += line.strip() + ' '
            else:
                paras.append(para_cache.strip())
                para_cache = ''
        # Flush the final paragraph if the file did not end with a blank line.
        if para_cache:
            paras.append(para_cache)
        edu_idx = 0
        para_idx = 0
        # End offset of the current EDU within the space-joined EDU text.
        cur_edu_offset = len(edus[edu_idx]) - 1 + 1  # plus 1 for one blank space
        edu_cache = ''
        for line in fin1:
            if not line.strip():
                continue
            line_info = line.strip().split()
            token_end_offset = int(line_info[-1])
            # Emit the token row (minus its last two columns) with the
            # current 1-based EDU and paragraph numbers appended.
            fout.write('%s\t%s\t%s\n' % ('\t'.join(line.strip().split('\t')[:-2]), edu_idx + 1, para_idx + 1))
            if token_end_offset == cur_edu_offset:
                # This token ends the current EDU: accumulate the EDU text and
                # check whether the paragraph is complete (+1 for the trailing
                # space added per EDU).
                edu_cache += edus[edu_idx] + ' '
                if len(edu_cache) == len(paras[para_idx]) + 1:
                    edu_cache = ''
                    para_idx += 1
                edu_idx += 1
                if edu_idx < len(edus):
                    cur_edu_offset += len(edus[edu_idx]) + 1
            elif token_end_offset > cur_edu_offset:
                # Offsets ran past the expected EDU boundary; report the
                # mismatch and resync by advancing to the next EDU.
                print("Error while merging token \"{}\" in file {} with edu : {}.".format(line_info[2], fconll,
                                                                                         edus[edu_idx]))
                edu_idx += 1
                if edu_idx < len(edus):
                    cur_edu_offset += len(edus[edu_idx]) + 1
# Only one error occurs when pre-processing:
# Error while merging token "..." in file /TRAINING/wsj_1373.out.conll with edu :
# that recognize facial features..
def arg_parse():
    """Parse command-line options for the preprocessing script.

    Returns the parsed namespace with ``data_dir`` and ``corenlp_dir``,
    both required.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--data_dir', required=True, help='path to data directory.')
    cli.add_argument('--corenlp_dir', required=True, help='path to Stanford Corenlp directory.')
    return cli.parse_args()
def main():
    """Run the full preprocessing pipeline over ``--data_dir``.

    Steps: (1) collapse every ``*.edus`` file into a one-line ``*.text``
    file, (2) parse the ``*.text`` files with the CoreNLP shell script in
    ``--corenlp_dir``, (3) convert each resulting ``*.xml`` parse to CoNLL
    and merge it with the EDU/paragraph files into a ``*.merge`` file.
    """
    args = arg_parse()
    data_dir = args.data_dir
    print('Join the separated edus in *.edus file into *.text file with a single line...')
    edu_paths = [os.path.join(data_dir, name) for name in os.listdir(data_dir) if name.endswith('.edus')]
    for idx, path in enumerate(edu_paths):
        join_edus(path)
        # Periodic progress report.
        if idx % 100 == 0:
            print("joined ", idx, " edus")
    print('Parse the *.text files...')
    os.system('bash ' + os.path.join(args.corenlp_dir, 'run_corenlp.sh') + ' ' + data_dir)
    print('Merge parsed files to generate *.merge file...')
    xml_paths = [os.path.join(data_dir, name) for name in os.listdir(data_dir) if name.endswith(".xml")]
    for idx, path in enumerate(xml_paths):
        print('Processing file: {}'.format(path))
        extract(path)
        merge(path)
        if idx % 100 == 0:
            print("processed ", idx, " files")
# Script entry point: run the preprocessing pipeline only when executed
# directly, not when imported as a module.
if __name__ == '__main__':
    main()