From 49868c546cec5d4e65064bbae009f71156e14f3c Mon Sep 17 00:00:00 2001
From: Michael Wayne Goodman
Date: Wed, 22 Aug 2018 10:37:56 -0700
Subject: [PATCH] Add 'ace' codec for the 'convert' subcommand

This change makes it easier to convert the MRS output of ACE without
having to massage the data stream first. It works with normal ACE
output and --tsdb-stdout output. For the former, it uses the SENT:
line, if available, to set the 'surface' attribute of the following
MRSs (until the next double-newline, indicating the end of the results
list).

Resolves #92
---
 CHANGELOG.md    |  2 ++
 delphin/main.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d45a9cbe..6f7546de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,8 @@ changes are prefixed with "**BREAKING**"
 * The `convert` command can take a `--predicate-modifiers` option which
   attempts to rejoin disconnected EDS graphs that fit certain criteria
 * Documentation for implementing an ACE preprocessor (#91)
+* `ace` as a `--from` codec for the `convert` subcommand, which reads
+  SimpleMRS strings from ACE output (#92)
 
 ### Changed
 
diff --git a/delphin/main.py b/delphin/main.py
index fd6493fc..585c2027 100644
--- a/delphin/main.py
+++ b/delphin/main.py
@@ -18,6 +18,7 @@ from delphin.mrs.components import Lnk
 
 from delphin import itsdb
 from delphin.repp import REPP
+from delphin.util import SExpr
 
 
 def main():
@@ -45,6 +46,7 @@ def convert(args):
     from delphin.extra import latex
     codecs = {
         'simplemrs': (simplemrs.loads, simplemrs.dumps),
+        'ace': (_read_ace_parse, None),
         'mrx': (mrx.loads, mrx.dumps),
         'dmrx': (dmrx.loads, dmrx.dumps),
         'eds': (eds.loads, eds.dumps),
@@ -332,6 +334,53 @@ def do_trace(stream):
 
 ## Helper definitions
 
+# read simplemrs from ACE output
+
+def _read_ace_parse(s):
+    """
+    Yield the MRSs decoded from the SimpleMRS strings in ACE output.
+
+    Both regular ACE output and --tsdb-stdout output are accepted. In
+    regular output, the text after 'SENT: ' becomes the `surface`
+    attribute of every following MRS until two consecutive blank lines
+    (the end of the results list) are seen.
+
+    Args:
+        s: the text of ACE's output
+    Yields:
+        the objects built by simplemrs.loads(..., single=True)
+    """
+    from delphin.mrs import simplemrs
+    surface = None
+    blank_seen = False  # True if the previous line was blank
+    for line in s.splitlines():
+        # str.splitlines() drops the line terminators, so a blank line
+        # is an empty (or whitespace-only) string, never '\n'
+        if not line.strip():
+            if blank_seen:
+                surface = None
+            blank_seen = not blank_seen
+            continue
+        blank_seen = False
+        if line.startswith('SENT: '):
+            surface = line[6:]
+        # regular ACE output
+        elif line.startswith('['):
+            m = line.partition(' ; ')[0].strip()
+            m = simplemrs.loads(m, single=True)
+            m.surface = surface
+            yield m
+        # with --tsdb-stdout
+        elif line.startswith('('):
+            while line:
+                expr = SExpr.parse(line)
+                line = expr.remainder.lstrip()
+                if len(expr.data) == 2 and expr.data[0] == ':results':
+                    for result in expr.data[1]:
+                        for key, val in result:
+                            if key == ':mrs':
+                                yield simplemrs.loads(val, single=True)
+
 # simulate json codecs for MRS and DMRS
 
 class _MRS_JSON(object):