Skip to content

Commit

Permalink
Modernize Pred normalization functions.
Browse files Browse the repository at this point in the history
Fixes #111.
  • Loading branch information
goodmami committed May 31, 2017
1 parent ff940bc commit c9ca7e4
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

* Properly call `re.sub()` so the flags don't become the count (#108)
* Include file size of gzipped tables in summary of `delphin mkprof` (#110)
* `normalize_pred_string()` now strips the `_rel` suffix (#111) and lowercases the result
* `is_valid_pred_string()` no longer requires the `_rel` suffix (#111)

### Deprecated

Expand Down
24 changes: 11 additions & 13 deletions delphin/mrs/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,8 +465,7 @@ def short_form(self):
>>> p.short_form()
'_cat_n_1'
"""
s = self.string.strip('"').lstrip("'")
return re.sub(r'(.*)_rel$', r'\1', s, flags=re.U|re.I)
return normalize_pred_string(self.string)

def is_quantifier(self):
"""
Expand Down Expand Up @@ -518,24 +517,23 @@ def is_valid_pred_string(predstr):
Pred, `False` otherwise.
"""
predstr = predstr.strip('"').lstrip("'")
if not predstr.endswith('_rel'):
return False
# this is a stricter regex than in Pred, but doesn't check POS
return re.match(r'_?((?:[^_\\]|\\.)+_){1,3}rel', predstr) is not None
return re.match(
r'_([^ _\\]|\\.)+_[a-z](_([^ _\\]|\\.)+)?(_rel)?$'
r'|[^_]([^ \\]|\\.)+(_rel)?$',
predstr
) is not None


def normalize_pred_string(predstr):
"""
Make pred strings more consistent by removing quotes and using
the _rel suffix.
Make pred strings more consistent by removing quotes and the _rel
suffix, and by lowercasing them.
"""
tokens = []
tokens = [t for t in split_pred_string(predstr)[:3] if t is not None]
if predstr.lstrip('\'"')[:1] == '_':
tokens.append('')
tokens.extend(t for t in split_pred_string(predstr) if t is not None)
if not tokens[-1] == 'rel':
tokens.append('rel')
return '_'.join(tokens)
tokens = [''] + tokens
return '_'.join(tokens).lower()


class Node(
Expand Down
20 changes: 14 additions & 6 deletions tests/mrs_components_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,16 @@ def test_hash(self):
assert spred('"_the_q_rel"') in s
assert spred('_the_q_rel') in s
assert spred('_the_q') in s
assert spred('_The_q_rel') in s
assert spred('the_q_rel') not in s
s.add(spred('_the_q_rel'))
assert len(s) == 1


def test_split_pred_string():
sps = split_pred_string
# normalized
assert sps('dog_n_1') == ('dog', 'n', '1', None)
# with rel
assert sps('pron_rel') == ('pron', None, None, 'rel')
# with pos
Expand Down Expand Up @@ -499,18 +502,23 @@ def test_is_valid_pred_string():
assert ivps('_24/7_a_1_rel')
assert ivps('_a+bit_q_rel')
assert ivps('_A$_n_1_rel')
assert ivps('coord')
assert ivps('_dog_n_1')
assert ivps('_dog_n')
# invalid
assert not ivps('coord')
assert not ivps('coord_relation')
assert not ivps('_dog_rel')
assert not ivps('_dog_1_rel')
assert not ivps('_only_child_n_1_rel')


def test_normalize_pred_string():
nps = normalize_pred_string
assert nps('pron') == 'pron_rel'
assert nps('"udef_q_rel"') == 'udef_q_rel'
assert nps('\'udef_q_rel') == 'udef_q_rel'
assert nps('_dog_n_1_rel') == '_dog_n_1_rel'
assert nps('pron_rel') == 'pron'
assert nps('pron_rel_rel') == 'pron_rel' # i hope nobody does this
assert nps('"udef_q_rel"') == 'udef_q'
assert nps('\'udef_q_rel') == 'udef_q'
assert nps('_dog_n_1_rel') == '_dog_n_1'
assert nps('_DELPH-IN_n_1') == '_delph-in_n_1'

class TestNode():
def test_construct(self):
Expand Down

0 comments on commit c9ca7e4

Please sign in to comment.