-
Notifications
You must be signed in to change notification settings - Fork 10
/
reformat_single_amrs.py
76 lines (63 loc) · 2.91 KB
/
reformat_single_amrs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# -*- coding: utf8 -*-
'''Script that reformats single-line AMRs back to their original multi-line format with tabs and newlines
Optionally also checks if the AMRs are valid (raises an error if they're not)'''
import sys
import argparse
from amr_utils import valid_amr, write_to_file, tokenize_line, reverse_tokenize
def create_arg_parser(argv=None):
    '''Build the command line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse falls back to the process arguments, which is
            what the script entry point relies on.

    Returns:
        argparse.Namespace with the attributes input_file (required str),
        extension (str, default '.txt') and valid (bool, default False).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--input_file", required=True, type=str,
                        help="File with the to be formatted AMRs")
    parser.add_argument("-e", "--extension", default='.txt', type=str,
                        help="New extension of formatted AMRs")
    parser.add_argument("-v", "--valid", action='store_true',
                        help="Error when encountering an invalid AMR")
    # Passing argv through keeps the no-argument call unchanged while
    # allowing callers to supply their own argument list.
    return parser.parse_args(argv)
def variable_match(token):
    '''Check whether a token has the shape of a variable that is being referred to again.

    A token matches when it does not start with ':', contains at least one
    letter and at least one digit, and consists only of letters, digits and
    '-' characters. Single-character tokens can never hold both a letter and
    a digit, so they never match.
    '''
    if token.startswith(':'):
        return False
    seen_letter = False
    seen_digit = False
    for char in token:
        if char.isalpha():
            seen_letter = True
        elif char.isdigit():
            seen_digit = True
        elif char != '-':
            # Any other character disqualifies the whole token
            return False
    return seen_letter and seen_digit
def reformat_amr(input_file):
    '''Reformat AMRs -- go from single line to indented AMR on multiple lines.

    Args:
        input_file: Path of a text file with one AMR per line.

    Returns:
        List with one string per input line: the reformatted AMR followed
        by two newline characters.
    '''
    fixed_amrs = []
    # Context manager so the file handle is closed once reading is done
    with open(input_file, 'r') as in_file:
        # Loop over input file with one AMR per line
        for line in in_file:
            tokens = tokenize_line(line).split()
            num_tabs = 0
            amr_string = []
            # Loop over parts of tokenized line
            for count, part in enumerate(tokens):
                if part == '(':
                    num_tabs += 1
                elif part == ')':
                    num_tabs -= 1
                elif part.startswith(':') and _starts_new_line(tokens, count):
                    # Put the role on its own line, indented by the current depth
                    part = '\n' + num_tabs * '\t' + part
                amr_string.append(part)

            original_line = reverse_tokenize(" ".join(amr_string))
            # Remove the spaces that joining the tokens introduced around these brackets
            original_line = original_line.replace('_ (', '_(').replace(') "', ')"')
            fixed_amrs.append(original_line + '\n\n')
    return fixed_amrs


def _starts_new_line(tokens, index):
    '''Decide whether the role at tokens[index] should start a new output line.'''
    # With fewer than three tokens after the role the lookahead is not
    # possible; the role then stays on the current line.
    if index + 3 >= len(tokens):
        return False
    # Either a new variable is introduced ("( var / concept") or the role is
    # directly followed by a token that looks like an existing variable.
    return tokens[index + 3] == '/' or variable_match(tokens[index + 1])
if __name__ == "__main__":
    args = create_arg_parser()
    fixed_amrs = reformat_amr(args.input_file)

    # Optional validation: stop with an error at the first AMR that valid_amr rejects
    if args.valid:
        first_invalid = next((amr for amr in fixed_amrs if not valid_amr(amr)), None)
        if first_invalid is not None:
            raise ValueError(first_invalid)

    # Output goes next to the input file, with the chosen extension appended
    output_path = args.input_file + args.extension
    write_to_file(fixed_amrs, output_path, extra_newline=True)