-
Notifications
You must be signed in to change notification settings - Fork 25
/
preprocess.py
40 lines (38 loc) · 1.51 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import urllib.request
import zipfile
# Download the archive at `url` to 'web.zip' in the current working directory,
# then unpack every member into the 'web' directory.
url = 'https://gitlab.com/shimorina/webnlg-dataset/-/archive/master/webnlg-dataset-master.zip?path=release_v3.0/en/train'
urllib.request.urlretrieve(url, filename='web.zip')
with zipfile.ZipFile('web.zip', mode='r') as archive:
    archive.extractall(path='web')
import glob
import os
import re
import xml.etree.ElementTree as ET
import pandas as pd
# Look for the XML files relative to the working directory: the archive is
# unpacked into the relative directory 'web', so the former absolute
# '/content/...' prefix only resolved when the script ran from /content.
files = glob.glob("web/webnlg-dataset-master-release_v3.0-en-train/release_v3.0/en/train/**/*.xml", recursive=True)
# Raw string avoids the invalid "\d" escape warning; "\d+" also captures
# multi-digit counts (e.g. "10triples") instead of only the last digit.
triple_re = re.compile(r'(\d+)triples')


def collect_entry_pairs(root, triples_num):
    """Map each entry's joined triples to its list of non-blank texts.

    Walks two levels below ``root`` (presumably benchmark -> entries -> entry
    in the WebNLG layout; confirm against the data). For every element at
    that depth it gathers:

    * the ``.text`` of each direct child that is not empty/whitespace-only,
    * the ``.text`` of every grandchild (the triples), in document order.

    Args:
        root: Root ``xml.etree.ElementTree.Element`` of one parsed file.
        triples_num: How many trailing triples to keep per entry.

    Returns:
        dict mapping ``' && '``-joined triple strings to a list of texts.
        When two entries produce the same key, the later one replaces the
        earlier one.
    """
    pairs = {}
    for sub_root in root:
        for ss_root in sub_root:
            triples = []
            texts = []
            for child in ss_root:
                # Element.text is None when an element has no text content;
                # treat that the same as a blank string instead of crashing.
                text = child.text
                if text is not None and text.strip() != '':
                    texts.append(text)
                triples.extend(triple.text for triple in child)
            # Only the last `triples_num` collected triples are used as the
            # key (presumably the modified triple set, which follows the
            # original one in the file; verify against the dataset).
            pairs[' && '.join(triples[-triples_num:])] = texts
    return pairs


data_dct = {}
for file in files:
    root = ET.parse(file).getroot()
    # The triple count is read from the "<N>triples" part of the file path.
    triples_num = int(triple_re.findall(file)[0])
    data_dct.update(collect_entry_pairs(root, triples_num))
# Flatten data_dct (joined triples -> list of texts) into one row per text:
# a constant 'webNLG' prefix, the joined triples as input, the text as target.
rows = [
    ('webNLG', triples_str, sentence)
    for triples_str, sentences in data_dct.items()
    for sentence in sentences
]
mdata_dct = {
    "prefix": [row[0] for row in rows],
    "input_text": [row[1] for row in rows],
    "target_text": [row[2] for row in rows],
}
# Write the table to CSV with pandas' default options.
df = pd.DataFrame(mdata_dct)
df.to_csv('webNLG2020_train.csv')