-
Notifications
You must be signed in to change notification settings - Fork 0
/
baseline.py
103 lines (74 loc) · 2.32 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from os import listdir, mkdir
from os.path import join, exists
from tqdm import tqdm
import shutil
import os
from natasha import (
Segmenter,
MorphVocab,
NewsEmbedding,
NewsMorphTagger,
NewsSyntaxParser,
NewsNERTagger,
Doc,
)
from natasha.doc import DocSpan
# Instantiate the Natasha NLP pipeline components once at module level;
# they are reused for every entity string processed in the loop below.
segmenter = Segmenter()                  # sentence / token segmentation
morph_vocab = MorphVocab()               # morphological vocabulary used by span.normalize()
emb = NewsEmbedding()                    # shared news-corpus embedding backing the taggers
morph_tagger = NewsMorphTagger(emb)      # part-of-speech / morphology tagger
syntax_parser = NewsSyntaxParser(emb)    # dependency parser (normalize() needs syntax)
ner_tagger = NewsNERTagger(emb)          # named-entity recognizer
# Reset the output tree: wipe any previous baseline run, then recreate
# empty per-part folders for the normalized predictions.
# (The original used no-op f-string prefixes on constant paths and called
# os.path.exists despite `exists` already being imported from os.path.)
if exists("../baseline"):
    shutil.rmtree("../baseline")
mkdir("../baseline")
mkdir("../baseline/generic")
mkdir("../baseline/named")
# For each test partition, pair every document text (.txt) with its
# annotation file (.ann), normalize each annotated entity with the Natasha
# pipeline, and write one normalized form per annotation line to
# ../baseline/<part>/<name>.norm.
for part in ["generic", "named"]:
    texts = {}  # document name -> raw text
    anns = {}   # document name -> list of annotation lines
    part_dir = f"../data/public_test/{part}"
    for file in os.listdir(part_dir):
        # splitext is more robust than slicing off a fixed-width suffix.
        name, ext = os.path.splitext(file)
        if ext == ".txt":
            with open(join(part_dir, file), encoding='utf-8') as fh:
                texts[name] = fh.read()
        elif ext == ".ann":
            # Each line holds one or more whitespace-separated
            # (start, stop) character-offset pairs into the text.
            with open(join(part_dir, file), encoding='utf-8') as fh:
                anns[name] = fh.read().strip().split('\n')
    for name in tqdm(texts):
        text = texts[name]
        ann = anns[name]
        # `with` guarantees the output file is closed even if the
        # pipeline raises mid-document (the original leaked the handle).
        with open(f"../baseline/{part}/{name}.norm", 'w', encoding='utf-8') as out:
            for line in ann:
                offsets = list(map(int, line.strip().split()))
                # Join all fragments of a possibly discontinuous entity
                # into one surface string.
                fragments = [
                    text[offsets[i]:offsets[i + 1]]
                    for i in range(0, len(offsets), 2)
                ]
                entry = " ".join(fragments).strip()
                # Run the full pipeline: normalize() requires morphology
                # and syntax on the NER span's tokens.
                doc = Doc(entry)
                doc.segment(segmenter)
                doc.tag_morph(morph_tagger)
                doc.parse_syntax(syntax_parser)
                doc.tag_ner(ner_tagger)
                # Prefer a recognized NER span that covers the entry exactly.
                span = next((s for s in doc.spans if s.text == entry), None)
                if span is None:
                    # Fallback: treat the whole entry as a single ORG span
                    # so it can still be normalized.
                    span = DocSpan(
                        start=0,
                        stop=len(entry),
                        type='ORG',
                        text=entry,
                        tokens=list(doc.tokens),
                    )
                # span is always set here, so the original's
                # `if span is not None` guard was dead code.
                span.normalize(morph_vocab)
                out.write(f"{span.normal}\n")