-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_to_gts.py
551 lines (469 loc) · 23.8 KB
/
parse_to_gts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
#!/usr/bin/env python3
"""
Parse and pre-process a project export to the GTS jsonl format for ABSA pair and triplet extraction.
[{
"id": "id of sentence 1",
"sentence": "context of sentence 1",
"triples": [{
"uid": "id of first aspect term of sentence 1",
"target_tags": "the first aspect term with BIO scheme",
"opinion_tags": "the corresponding multiple opinion terms of the aspect term with BIO scheme",
"sentiment": "the corresponding sentiment polarity of the aspect term"
},{
the second aspect term of sentence 1
...
}]
}, {
sentence 2
...
}]
parse_to_gts.py
webannoparser
24/01/21
Copyright (c) Gilles Jacobs. All rights reserved.
"""
import json
import shutil
import random
import itertools
from pathlib import Path
from collections import Counter, deque
from itertools import islice
from parse_project import parse_process_project
from parser import Filler, Participant, Event
from collections import OrderedDict
import settings
import argparse
random.seed(42)
def split_train_dev_test(dataset, split_doc_ids):
    '''
    Distribute sentence instances over named splits according to their source document.

    The document id of an instance is the part of its "id" value before the first ":".
    An instance is appended to every split whose document-id collection contains that id;
    instances whose document is listed in no split are left out of the result.
    :param dataset: iterable of instance dicts carrying an "id" key.
    :param split_doc_ids: mapping of split name -> container of document ids.
    :return: dict of split name -> list of instances; only splits that received an instance are present.
    '''
    partitions = {}
    for instance in dataset:
        for split_name, doc_ids in split_doc_ids.items():
            belongs_to_split = instance["id"].split(":")[0] in doc_ids
            if not belongs_to_split:
                continue
            partitions.setdefault(split_name, []).append(instance)
    return partitions
def basic_stats(data):
    '''
    Compute simple corpus counts for a list of GTS sentence instances.

    :param data: list of instance dicts, each with a "triples" list whose items carry
        an "opinion_tags" string and a "sentiment" label.
    :return: dict with "sentences_n", "target_n" (number of triples), "opinion_n" (number of
        B tags over all opinion tag strings) and, per sentiment label, "<label>_n" and
        "<label>_pct" (share of all triples, rounded to one decimal).
    '''
    all_triples = [triple for sentence in data for triple in sentence["triples"]]

    # every opinion term starts with exactly one B tag; the tag is what follows the last backslash
    opinion_total = 0
    for triple in all_triples:
        for tagged_token in triple["opinion_tags"].split(" "):
            if tagged_token.split("\\")[-1] == "B":
                opinion_total += 1

    stats = {
        "sentences_n": len(data),
        "target_n": len(all_triples),
        "opinion_n": opinion_total,
    }

    # polarity distribution over all triples
    polarity_counts = Counter(triple["sentiment"] for triple in all_triples)
    polarity_total = sum(polarity_counts.values())
    for polarity, amount in polarity_counts.items():
        stats[f"{polarity}_n"] = amount
        stats[f"{polarity}_pct"] = round(amount / polarity_total * 100.0, 1)
    return stats
def remove_sentiment_expression(project, sentiment_expression):
    '''
    Detach an annotation unit from every document and sentence of a project, in place.

    The unit is looked up in both the `sentiment_expressions` and the `events` list of each
    document and of each sentence of that document. Lists that do not contain the unit are
    left untouched.
    :param project: object exposing `annotation_documents`, whose documents expose `sentences`.
    :param sentiment_expression: the unit to remove (first matching entry of each list).
    :return: None
    '''
    def _discard_from(holder):
        # list.remove raises ValueError when the unit is absent, which is expected for most holders
        for list_name in ("sentiment_expressions", "events"):
            try:
                getattr(holder, list_name).remove(sentiment_expression)
            except ValueError:
                continue

    for document in project.annotation_documents:
        _discard_from(document)
        for sentence in document.sentences:
            _discard_from(sentence)
def remove_cross_sentence(project, unit_iterator, max_sentence_window=0):
    '''
    Remove, in place, every target that lies outside the sentence window of its unit.

    The first sentence of the unit and the first sentence of each target are compared through
    their `element_id` (assumes these are integers that differ by one between neighbouring
    sentences -- TODO confirm). Targets further away than `max_sentence_window` are removed
    from `unit.targets`. Units left without any target (including units that had none to begin
    with) are counted as removed and detached from the project with
    remove_sentiment_expression. Counters are accumulated in the module-level `statistics` dict.
    :param project: project from which target-less units are removed.
    :param unit_iterator: iterable of units (sentiment expressions or events) exposing
        `in_sentence` and `targets`.
    :param max_sentence_window: 0 = same sentence only, 1 = previous/next sentence, n = n-sentence window.
    :return: list of the units that still have at least one target.
    '''
    def _pct(part, whole):
        # guard against empty input: nothing to remove means 0%
        return round(100 * part / whole, 1) if whole else 0.0

    sentiment_expressions = list(unit_iterator)
    sentiment_expressions_preproc = []
    sentiment_expression_total = len(sentiment_expressions)
    total_target_count = 0

    for se in sentiment_expressions:
        sentence_id_se = se.in_sentence[0].element_id
        # iterate over a copy because targets are removed from se.targets inside the loop
        for tgt in se.targets[:]:
            total_target_count += 1
            sentence_id_tgt = tgt.in_sentence[0].element_id
            if abs(sentence_id_tgt - sentence_id_se) > max_sentence_window:
                print(f"Cross-sentence target removed out of sentence window (={max_sentence_window}):\n\t{se}")
                se.targets.remove(tgt)
                statistics["target_cross_sentence_removed"] += 1
        if se.targets:  # sentiment expression still has targets and thus is valid
            sentiment_expressions_preproc.append(se)
        else:  # no targets left, do not add to output list
            statistics["sentiment_expression_cross_sentence_removed"] += 1
            print(f"Sentiment expression removed, all targets out of sentence window (={max_sentence_window}):\n\t{se}")
            remove_sentiment_expression(project, se)

    tgt_removed_n = statistics["target_cross_sentence_removed"]
    se_removed_n = statistics["sentiment_expression_cross_sentence_removed"]
    print(f"{tgt_removed_n}/{total_target_count} ({_pct(tgt_removed_n, total_target_count)}%) targets removed because cross sentence.")
    print(f"{se_removed_n}/{sentiment_expression_total} ({_pct(se_removed_n, sentiment_expression_total)}%) sentiment expressions removed because cross sentence.")
    return sentiment_expressions_preproc
def replace_canonical_referents_sentiment(project, remove_cross_sentence=False):
    '''
    Replace the targets of all sentiment expressions of a project by their linked referents.

    NB: this is a reimplementation of the project.replace_canonical_referents func which was
    buggy and too complicated.
    Participant/Filler targets are replaced by their `canonical_referents`, Event targets by
    their `coreferents`. Targets without referents are kept as they are. Several referents on
    one target (annotation mistake, or a reference to a group such as "all" for three
    companies) each become a target. Replacements are counted in the module-level
    `statistics["target_replaced_ref"]`.
    :param project: project exposing `get_sentiment_expressions()`.
    :param remove_cross_sentence: if True only referents located in exactly the same sentence(s)
        as the target are used; if False every referent is used.
    :return: list of the sentiment expressions, whose `targets` attribute was overwritten in place.
    '''
    sentiment_expressions_preproc = []
    for se in project.get_sentiment_expressions():
        targets_preproc = []
        for tgt in se.targets:
            tgt_type = type(tgt).__name__
            if tgt_type in ("Participant", "Filler"):
                refs = tgt.canonical_referents
                # "from_canonref" is a marker value, not a list of referents
                has_refs = refs not in ([], None) and refs != "from_canonref"
            elif tgt_type == "Event":  # collect coreferents for event too
                refs = tgt.coreferents
                has_refs = refs not in ([], None)
            else:
                refs, has_refs = None, False

            if not has_refs:  # no canon/co-refs so keep it in
                targets_preproc.append(tgt)
                continue

            for ref in refs:
                # check whether the referent is in the same sentence as the target
                same_sentence = [s.element_id for s in tgt.in_sentence] == [
                    s.element_id for s in ref.in_sentence
                ]
                if remove_cross_sentence and not same_sentence:
                    # NOTE(review): a cross-sentence referent is skipped and the original target
                    # is not kept either, so the target disappears uncounted -- confirm intent.
                    continue
                targets_preproc.append(ref)
                if remove_cross_sentence:
                    print(f"Replaced {tgt.text} with {ref.text} in same sentence:\n\t{se}")
                else:
                    # fixed: this message read the attribute `txt`, unlike the branch above
                    print(f"Replaced {tgt.text} with {ref.text}:\n\t{se}.")
                statistics["target_replaced_ref"] += 1
        se.targets = targets_preproc
        sentiment_expressions_preproc.append(se)
    return sentiment_expressions_preproc
def remove_pronominal(project, unit_iterator):
    '''
    Remove, in place, every pronominal target from the given units.

    A target is pronominal when its `check_pronominal()` method returns a truthy value.
    Units left without any target (including units that had none to begin with) are counted
    as removed and detached from the project with remove_sentiment_expression. Counters are
    accumulated in the module-level `statistics` dict.
    :param project: project from which target-less units are removed.
    :param unit_iterator: iterable of units (sentiment expressions or events) exposing `targets`.
    :return: list of the units that still have at least one target.
    '''
    def _pct(part, whole):
        # guard against empty input: nothing to remove means 0%
        return round(100 * part / whole, 1) if whole else 0.0

    sentiment_expressions = list(unit_iterator)
    sentiment_expressions_preproc = []
    total_target_count = 0

    for se in sentiment_expressions:
        # The units themselves are not checked for being pronominal: there are none in the data.
        # iterate over a copy because targets are removed from se.targets inside the loop
        for tgt in se.targets[:]:
            total_target_count += 1
            if tgt.check_pronominal():
                print(f"Pronominal target removed:\n\t{se}")
                se.targets.remove(tgt)
                statistics["target_pronominal_removed"] += 1
        if se.targets:
            sentiment_expressions_preproc.append(se)
        else:
            print(f"Sentiment expression removed because all targets pronominal:\n\t{se}")
            statistics["sentiment_expression_pronominal_removed"] += 1
            remove_sentiment_expression(project, se)

    tgt_removed_n = statistics["target_pronominal_removed"]
    se_removed_n = statistics["sentiment_expression_pronominal_removed"]
    se_total = len(sentiment_expressions)
    print(f"{tgt_removed_n}/{total_target_count} ({_pct(tgt_removed_n, total_target_count)}%) targets removed because pronominal.")
    print(f"{se_removed_n}/{se_total} ({_pct(se_removed_n, se_total)}%) sentiment expressions removed because it has all pronominal targets.")
    return sentiment_expressions_preproc
def filter_target_roles_events(project):
    '''
    Populate, in place, `ev.targets` of every event with the arguments that may be sentiment targets.

    Reads the JSON file at `cmd_args.target_role`: a list of event-type entries, each with a
    "full_name" and "participant" / "filler" lists of roles that carry a "name" and an
    "is_target" flag. This leaves out non-entity arguments such as TIME, PLACE, etc.
    The event type is matched on "<event_type>" or, when a subtype is set,
    "<event_type>_<first part of the subtype before '_'>". Events without a type label are
    skipped; events whose type is not in the file get no targets.
    `ev.targets` must already be a list (preprocess_events resets it before calling this).
    :param project: project exposing `get_events()`.
    :return: None
    '''
    with open(cmd_args.target_role, "rt", encoding="utf-8") as target_role_in:
        target_role_map = json.load(target_role_in)

    # index the role map once: event type full name -> names of the roles that are valid targets
    target_roles_by_type = {}
    for et in target_role_map:
        roles = target_roles_by_type.setdefault(et["full_name"], set())
        for et_arg in et["participant"] + et["filler"]:
            if et_arg["is_target"]:
                roles.add(et_arg["name"])

    valid_targets = set().union(*target_roles_by_type.values())
    print(f"Setting valid roles for targets: {valid_targets}")

    for ev in project.get_events():
        if ev.event_type is None:
            print(f"No type label on event {ev.friendly_id()}")
            continue
        type_fn = ev.event_type
        if ev.event_subtype:
            type_fn += f"_{ev.event_subtype.split('_')[0]}"

        if type_fn not in target_roles_by_type:
            print(f"No match for {type_fn.upper()} {ev.friendly_id()} ")
        valid_roles = target_roles_by_type.get(type_fn, set())

        # dedupe while keeping annotation order: a set would make the order of ev.targets
        # (and the triple uids derived from it) differ from run to run
        arguments = dict.fromkeys(
            itertools.chain(ev.participants or [], ev.fillers or [])
        )
        for arg in arguments:
            if arg.role in valid_roles:
                ev.targets.append(arg)
def replace_canonical_referents(unit_iterator, remove_cross_sentence=False):
    '''
    Replace the targets of the given units by their linked referents.

    Participant/Filler targets are replaced by their `canonical_referents`, Event targets by
    their `coreferents`. Targets without referents are kept as they are. Several referents on
    one target (annotation mistake, or a reference to a group such as "all" for three
    companies) each become a target. Replacements are counted in the module-level
    `statistics["target_replaced_ref"]`.
    :param unit_iterator: iterable of units (sentiment expressions or events) exposing `targets`.
    :param remove_cross_sentence: if True only referents located in exactly the same sentence(s)
        as the target are used; if False every referent is used.
    :return: list of the units, whose `targets` attribute was overwritten in place.
    '''
    sentiment_expressions_preproc = []
    for se in unit_iterator:
        targets_preproc = []
        for tgt in se.targets:
            tgt_type = type(tgt).__name__
            if tgt_type in ("Participant", "Filler"):
                refs = tgt.canonical_referents
                # "from_canonref" is a marker value, not a list of referents
                has_refs = refs not in ([], None) and refs != "from_canonref"
            elif tgt_type == "Event":  # collect coreferents for event too
                refs = tgt.coreferents
                has_refs = refs not in ([], None)
            else:
                refs, has_refs = None, False

            if not has_refs:  # no canon/co-refs so keep it in
                targets_preproc.append(tgt)
                continue

            for ref in refs:
                # check whether the referent is in the same sentence as the target
                same_sentence = [s.element_id for s in tgt.in_sentence] == [
                    s.element_id for s in ref.in_sentence
                ]
                if remove_cross_sentence and not same_sentence:
                    # NOTE(review): a cross-sentence referent is skipped and the original target
                    # is not kept either, so the target disappears uncounted -- confirm intent.
                    continue
                targets_preproc.append(ref)
                if remove_cross_sentence:
                    print(f"Replaced {tgt.text} with {ref.text} in same sentence:\n\t{se}")
                else:
                    # fixed: this message read the attribute `txt`, unlike the branch above
                    print(f"Replaced {tgt.text} with {ref.text}:\n\t{se}.")
                statistics["target_replaced_ref"] += 1
        se.targets = targets_preproc
        sentiment_expressions_preproc.append(se)
    return sentiment_expressions_preproc
def preprocess_events(project):
    '''
    Preprocess events so they can be mapped to sentiment - target annotations.

    Every event trigger is preprocessed and every event gets a fresh, empty `targets` list,
    which is filled from the event arguments when `cmd_args.target_role` is given. Depending
    on the command-line flags, targets are then replaced by their referents, cross-sentence
    targets are removed and pronominal targets are removed. Events that end up without targets
    are removed in place from the documents and sentences of the project.
    The reported target total counts all participants and fillers of all events.
    :param project: project exposing `get_events()`.
    :return: list of the events left after the last enabled step (all events if none is enabled).
    '''
    def _pct(part, whole):
        # guard against a project without events or arguments
        return round(100 * part / whole, 1) if whole else 0.0

    def _targets_removed_so_far():
        return statistics["target_cross_sentence_removed"] + statistics["target_pronominal_removed"]

    ev_orig_n = sum(1 for _ in project.get_events())
    tgt_orig_n = sum(
        sum(1 for _ in itertools.chain(ev.participants or [], ev.fillers or []))
        for ev in project.get_events()
    )
    # the counters are shared by all preprocessing runs, so only report what this call removes
    tgt_removed_before = _targets_removed_so_far()

    # set target accumulator on events
    for ev in project.get_events():
        ev.preprocess_trigger(
            fix_false_discont=True,
            make_continuous_max_dist=6,
            truncate_to_len=False,
        )
        ev.targets = []
    if cmd_args.target_role:
        filter_target_roles_events(project)

    # default result when none of the steps below is enabled
    evs = list(project.get_events())
    # make cross-sentence annotations in-sentence and count stats of cut relations
    if cmd_args.replace_canonical_referents:
        evs = replace_canonical_referents(project.get_events(), remove_cross_sentence=cmd_args.remove_cross_sentence)
    if cmd_args.remove_cross_sentence:
        evs = remove_cross_sentence(project, project.get_events())
    if cmd_args.remove_pronominal:
        evs = remove_pronominal(project, project.get_events())

    tgt_removed_n = _targets_removed_so_far() - tgt_removed_before
    tgt_removed_pct = _pct(tgt_removed_n, tgt_orig_n)
    ev_removed_n = ev_orig_n - len(evs)
    ev_removed_pct = _pct(ev_removed_n, ev_orig_n)
    print(f"{tgt_removed_n}/{tgt_orig_n} ({tgt_removed_pct}%) targets removed in preprocessing.")
    print(f"{ev_removed_n}/{ev_orig_n} ({ev_removed_pct}%) sentiment expressions removed in preprocessing.")
    return evs
def preprocess_sentiment(project):
    '''
    Preprocess sentiment expression - target annotations.

    Depending on the command-line flags, targets are replaced by their referents, cross-sentence
    targets are removed and pronominal targets are removed. Targets are removed in place from
    `SE.targets`; sentiment expressions that end up without targets are removed in place from
    project.annotation_documents[i].sentiment_expressions and
    project.annotation_documents[i].sentences[j].sentiment_expressions.
    :param project: project exposing `get_sentiment_expressions()`.
    :return: list of the sentiment expressions left after the last enabled step (all of them if
        none is enabled).
    '''
    def _pct(part, whole):
        # guard against a project without sentiment expressions or targets
        return round(100 * part / whole, 1) if whole else 0.0

    def _targets_removed_so_far():
        return statistics["target_cross_sentence_removed"] + statistics["target_pronominal_removed"]

    ses_orig_n = sum(1 for _ in project.get_sentiment_expressions())
    tgt_orig_n = sum(len(se.targets) for se in project.get_sentiment_expressions())
    # the counters are shared with event preprocessing, so only report what this call removes
    tgt_removed_before = _targets_removed_so_far()

    # default result when none of the steps below is enabled
    ses = list(project.get_sentiment_expressions())
    # make cross-sentence annotations in-sentence and count stats of cut relations
    if cmd_args.replace_canonical_referents:
        ses = replace_canonical_referents(project.get_sentiment_expressions(), remove_cross_sentence=cmd_args.remove_cross_sentence)
    if cmd_args.remove_cross_sentence:
        ses = remove_cross_sentence(project, project.get_sentiment_expressions())
    if cmd_args.remove_pronominal:
        ses = remove_pronominal(project, project.get_sentiment_expressions())

    tgt_removed_n = _targets_removed_so_far() - tgt_removed_before
    tgt_removed_pct = _pct(tgt_removed_n, tgt_orig_n)
    se_removed_n = ses_orig_n - len(ses)
    se_removed_pct = _pct(se_removed_n, ses_orig_n)
    print(f"{tgt_removed_n}/{tgt_orig_n} ({tgt_removed_pct}%) targets removed in preprocessing.")
    print(f"{se_removed_n}/{ses_orig_n} ({se_removed_pct}%) sentiment expressions removed in preprocessing.")
    return ses
def get_bio_tags(context_tokens, annotation_tokens):
    '''
    Label every context token with a B, I or O tag.

    :param context_tokens: list of tokens of the wider context; only its length is used.
    :param annotation_tokens: nested list of tokens, one inner list per annotation unit. The
        first token of a unit is tagged "B" and the following ones "I", at the position given
        by the token's `index_sentence`.
    :return: list of tags as long as `context_tokens`; positions not covered by a unit are "O".
    '''
    labels = ["O" for _ in range(len(context_tokens))]
    for span in annotation_tokens:
        span_labels = ["B"] + ["I"] * (len(span) - 1)
        for tok, label in zip(span, span_labels):
            labels[tok.index_sentence] = label
    return labels
def get_bio_text(context_tokens, annotation_tokens):
    '''
    Render the context as one string of BIO-tagged tokens.

    :param context_tokens: List of tokens of wider context (document- or sentence-level).
    :param annotation_tokens: Nested list of tokens corresponding to the annotation units for
        which to generate BIO labels.
    :return: the context tokens joined by single spaces, each written as its text, a backslash
        and its B/I/O tag.
    '''
    tags = get_bio_tags(context_tokens, annotation_tokens)
    tagged_tokens = []
    for tok, tag in zip(context_tokens, tags):
        tagged_tokens.append(f"{tok.text}\\{tag}")
    return " ".join(tagged_tokens)
def get_opinion_tokens(sentiment_expressions):
    '''
    Collect the token span of every opinion unit, ordered by position in the text.

    Units are sometimes parsed with duplicate and out-of-order tokens, so the tokens of each
    unit are deduplicated and sorted on their `begin` offset. Units without tokens are skipped
    because they cannot be tagged.
    :param sentiment_expressions: iterable of SentimentExpression and/or Event units exposing `tokens`.
    :return: list of token lists, one per unit, sorted on the `begin` of their first token.
    :raises TypeError: if a unit is neither a SentimentExpression nor an Event.
    '''
    supported_types = ("SentimentExpression", "Event")
    opinion_tokens = []
    for se in sentiment_expressions:
        unit_type = type(se).__name__
        if unit_type not in supported_types:
            # previously such a unit silently reused the tokens of the unit before it
            raise TypeError(f"Cannot get opinion tokens of unsupported unit type {unit_type}.")
        # TODO implement contentful token selection for discontinuous Event triggers,
        # right now events use the same selection as sentiment expressions
        se_tokens = sorted(set(se.tokens), key=lambda tok: tok.begin)
        if se_tokens:
            opinion_tokens.append(se_tokens)
    opinion_tokens.sort(key=lambda unit: unit[0].begin)  # sort the units by begin token
    return opinion_tokens
def parse_to_gts(project):
    '''
    Build one GTS instance per sentence of every document in the project.

    Within a sentence, the sentiment expressions and events are grouped per
    (target, polarity) pair, so one target is linked to all opinion units OF THE SAME POLARITY
    that point to it. Every group becomes one triple holding the BIO-tagged sentence for the
    target, the BIO-tagged sentence for the grouped opinion units and the polarity.
    :param project: project exposing `annotation_documents`.
    :return: list of dicts with "id" ("<document_id>:<sentence index>"), "sentence" (token texts
        joined by spaces) and "triples" (possibly empty list of OrderedDicts with "uid",
        "target_tags", "opinion_tags" and "sentiment").
    '''
    def token_begin(token):
        return token.begin

    instances = []
    # sentence level
    for document in project.annotation_documents:
        for sentence_index, sentence in enumerate(document.sentences):
            sentence_id = f"{document.document_id}:{sentence_index}"

            # (target, polarity) -> set of opinion units linked to that target
            units_by_target = {}
            for unit in sentence.sentiment_expressions + sentence.events:
                unit_polarity = unit.polarity_sentiment
                for target in unit.targets:
                    group_key = (target, unit_polarity)
                    if group_key not in units_by_target:
                        units_by_target[group_key] = set()
                    units_by_target[group_key].add(unit)

            sentence_triples = []
            for triple_index, (group_key, units) in enumerate(units_by_target.items()):
                target, polarity = group_key
                # target tokens can hold duplicates and be out of order: dedupe and sort
                target_span = sorted(set(target.tokens), key=token_begin)
                triple = OrderedDict()
                triple["uid"] = f"{sentence_id}-{triple_index}"
                triple["target_tags"] = get_bio_text(sentence.tokens, [target_span])
                triple["opinion_tags"] = get_bio_text(sentence.tokens, get_opinion_tokens(units))
                triple["sentiment"] = polarity
                sentence_triples.append(triple)

            instances.append({
                "id": sentence_id,
                "sentence": " ".join(token.text for token in sentence.tokens),
                "triples": sentence_triples,
            })
    return instances
def write_dataset(dataset_split, opt_dir):
    '''
    Write every split to its own JSON file in the output directory.

    The directory (and missing parents) is created when needed. Each split is written to
    "<split name>.json" with an indent of 2, and a confirmation line is printed per file.
    :param dataset_split: mapping of split name -> JSON-serialisable split data.
    :param opt_dir: output directory as a string or Path.
    :return: None
    '''
    target_dir = Path(opt_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for split_name in dataset_split:
        fp = target_dir / f"{split_name}.json"
        with fp.open("wt") as split_out:
            json.dump(dataset_split[split_name], split_out, indent=2)
        print(f"Wrote {split_name} GTS data to {fp}.")
def remove_empty(dataset):
    '''
    Keep only the instances that have at least one triple.

    :param dataset: iterable of instance dicts with a "triples" key.
    :return: new list with the instances whose "triples" value is truthy, in the original order.
    '''
    kept = []
    for instance in dataset:
        if instance["triples"]:
            kept.append(instance)
    return kept
def _build_arg_parser():
    '''Create the command-line parser of this script.'''
    parser = argparse.ArgumentParser(
        description="Preprocess SENTiVENT WebAnno event data."
    )
    parser.add_argument(
        "input_dir", help="Name for input unzipped WebAnno XMI export directory."
    )
    parser.add_argument("output_dir", help="Name for output directory.")
    parser.add_argument(
        "--annotations",
        default="sentiment",
        const="sentiment",
        nargs="?",
        choices=["none", "sentiment", "event2sentiment", "sentiment+event2sentiment"],
        help=(
            "Parse no annotations (only text) or sentiment expressions only (sentiment (default)). "
            "Map event annotations to sentiment anotations with 'event2sentiment'. "
            "Combine both with sentiment+event2sentiment."
        ),
    )
    parser.add_argument(
        "--target_role",
        nargs="?",
        help="JSON file containing argument roles to exclude from event to sentiment transformations.",
    )
    parser.add_argument(
        "--span_width",
        default=0,
        nargs="?",
        type=int,
        help="Max. span width. Truncate other spans (args, targets, sents) annotations to this width.",
    )
    parser.add_argument(
        "--replace_canonical_referents",
        action="store_true",
        help=(
            "Replace argument and target annotations with their linked CanonicalReferent. "
            "Cross-sentence CanonicalReferent links are only skipped when --remove_cross_sentence is used."
        ),
    )
    # The preprocessing functions read `cmd_args.remove_cross_sentence`, so the flag is stored
    # under that name; the former spelling --skip_cross_sentence is kept as an alias.
    parser.add_argument(
        "--remove_cross_sentence",
        "--skip_cross_sentence",
        dest="remove_cross_sentence",
        action="store_true",
        help="Remove cross-sentence sentiment-target relations (only when this flag is given).",
    )
    parser.add_argument(
        "--remove_pronominal",
        action="store_true",
        help=(
            "Skip pronominal annotations such as events, arguments and sentiment targets that are the result of coreference. "
            "If 'replace_canonical_referents' is enabled, targets will be replaced with CanonicalReferent annotation first, "
            "if no CanonRef is available pronominal annotations will be removed."
        ),
    )
    return parser


def main():
    '''
    Command-line entry point: parse the project, preprocess it and write the GTS data splits.

    Sets the module-level `cmd_args` (parsed arguments) and `statistics` (preprocessing
    counters) that the preprocessing functions of this module read and update. Two datasets are
    written: all sentences to `output_dir`, and only the sentences that have at least one
    triple to "<output_dir>-no-empty".
    :return: None
    '''
    global cmd_args, statistics

    # arguments
    cmd_args = _build_arg_parser().parse_args()

    # init stats
    statistics = {
        "target_replaced_ref": 0,
        "target_cross_sentence_removed": 0,
        "sentiment_expression_cross_sentence_removed": 0,
        "target_pronominal_removed": 0,
        "sentiment_expression_pronominal_removed": 0,
    }

    # parse from raw Webanno
    # NOTE: the `input_dir` argument is currently not used; the data directory is set with
    # settings.py for experiments because of documentation.
    data_dir = settings.MASTER_DIRP
    print(f"Parsing raw WebAnno data in {data_dir}")
    project = parse_process_project(data_dir, from_scratch=False)

    # NOTE(review): these are substring tests, so "event2sentiment" also triggers the
    # sentiment preprocessing below -- confirm this is intended.
    # preprocess events
    if "event" in cmd_args.annotations:
        print("Preprocessing event annotations and map to sentiment-target.")
        preprocess_events(project)
    # preprocess annotations
    if "sentiment" in cmd_args.annotations:
        print("Preprocessing sentiment annotations.")
        preprocess_sentiment(project)

    # parse to GTS format
    print("Converting to GTS jsonl format.")
    data_json = parse_to_gts(project)

    # split data
    print("Creating data splits.")
    splits = settings.SPLITS_DOC_EXPERIMENTS  # TODO add argument parse option
    dataset_splits = split_train_dev_test(data_json, splits)

    # print some basic stats
    for split_n, split_data in dataset_splits.items():
        print(f"{split_n}: {basic_stats(split_data)}")

    # write all sentences
    output_dir = Path(cmd_args.output_dir)
    write_dataset(dataset_splits, output_dir)

    # remove negative sentence instances without aspect term.
    dataset_splits_no_empty = {split_n: remove_empty(data) for split_n, data in dataset_splits.items()}
    for split_n, split_data in dataset_splits_no_empty.items():
        print(f"{split_n}: {basic_stats(split_data)}")
    # build the sibling directory from the normalised path, so a trailing slash on output_dir
    # does not put the "-no-empty" directory inside output_dir
    write_dataset(dataset_splits_no_empty, Path(f"{output_dir}-no-empty"))


if __name__ == "__main__":
    main()