Skip to content

Commit

Permalink
Proper treatment of PUNCTs for KNP (#48)
Browse files Browse the repository at this point in the history
* Prevent JUMAN from treating "(" and ")" as 未定義語 (undefined words)

* Proper treatment of PUNCT

括弧始-PUNCTs are not suitable for head tokens in Universal Dependencies.

* small refinement for future enhancement

* revert for han_to_zen_normalize

* bug fix

* support 形態素連結

* replace "(" and ")" with their 全角 (full-width) forms

* rewrite for flake8

* rewrite for flake8 again

* List[Morpheme] into List

* test for _modify_head_punct
  • Loading branch information
KoichiYasuoka authored Apr 22, 2020
1 parent 78c41fe commit 819fe11
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 16 deletions.
2 changes: 1 addition & 1 deletion camphr/lang/juman/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
ShortUnitWord = namedtuple(
"ShortUnitWord", ["surface", "lemma", "pos", "fstring", "space"]
)
_REPLACE_STRINGS = {"\t": " ", "\r": "", "(": "(", ")": ")", "\n": " "}
_REPLACE_STRINGS = {"\t": " ", "\r": "", "\n": " "}


def han_to_zen_normalize(text):
Expand Down
40 changes: 27 additions & 13 deletions camphr/pipelines/knp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,21 +80,10 @@ def __call__(self, doc: Doc) -> Doc:
blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
mlist = blist.mrph_list()
tlist = blist.tag_list()
if len(mlist) != len(sent):
mlist = _separate_mrph(mlist, sent)
for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
if len(mlist) != len(sent):
t, m = None, None
for t, m in zip(sent, mlist):
if t.text != m.midasi:
break
raise ValueError(
f"""Internal error occured
Sentence: {sent.text}
mlist : {[m.midasi for m in mlist]}
tokens: {[t.text for t in sent]}
diff : {m.midasi}, {t.text}
"""
)
for m, token in zip(mlist, sent):
token._.set(KNP_USER_KEYS.morph.element, m)
doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc))) # type: ignore
Expand All @@ -103,6 +92,31 @@ def __call__(self, doc: Doc) -> Doc:
return doc


def _separate_mrph(mlist: List, sent: Span) -> List:
    """Expand ``mlist`` so that it holds exactly one morpheme per token of ``sent``.

    A morpheme whose ``fstring`` contains ``形態素連結`` (KNP joined several
    morphemes into one) is repeated once for every token it spans: tokens are
    consumed until the summed length of their texts covers the length of the
    morpheme's ``midasi``.  Any other morpheme must match the current token's
    text exactly.

    Args:
        mlist: Morphemes returned by KNP; each needs ``midasi`` and ``fstring``.
        sent: The sentence whose tokens the morphemes are aligned against.

    Returns:
        A new list in which position ``k`` is the morpheme covering token ``k``.

    Raises:
        ValueError: If a morpheme does not match the current token, or if the
            tokens are exhausted while morphemes still remain to be aligned.
    """
    n_tokens = len(sent)

    def mismatch(morph, index: int) -> ValueError:
        # ``sent[index]`` may not exist when the tokens ran out first, so the
        # diagnostic must not index blindly.
        token_text = sent[index].text if index < n_tokens else "<no token left>"
        return ValueError(
            f"""Internal error occured
Sentence: {sent.text}
mlist : {[x.midasi for x in mlist]}
tokens: {[t.text for t in sent]}
diff : {morph.midasi}, {token_text}
"""
        )

    aligned = []
    i = 0
    for m in mlist:
        if "形態素連結" in m.fstring:
            # Repeat the joined morpheme for each token it covers.
            remaining = len(m.midasi)
            while remaining > 0:
                if i >= n_tokens:
                    # Previously surfaced as a bare IndexError from sent[i].
                    raise mismatch(m, i)
                aligned.append(m)
                remaining -= len(sent[i].text)
                i += 1
        elif i < n_tokens and sent[i].text == m.midasi:
            aligned.append(m)
            i += 1
        else:
            raise mismatch(m, i)
    return aligned


@curry
@functools.lru_cache()
def token_to_knp_span(type_: str, token: Token) -> Span:
Expand Down
24 changes: 23 additions & 1 deletion camphr/pipelines/knp/dependency_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def knp_dependency_parser(doc: Doc) -> Doc:
c.head = tag[0]
c.dep_ = _get_child_dep(c)
s.append(tag[0])
s = _modify_head_punct(s)
s = _modify_head_flat(s)
s = _modify_head_conj(s)
doc.is_parsed = True
Expand Down Expand Up @@ -101,6 +102,27 @@ def _get_child_dep(tag: Token) -> str:
return "clf" if pp == NUM else "flat"


def _modify_head_punct(heads: List[Token]) -> List[Token]:
    """Move dependents off punctuation heads onto their first right child.

    For every entry of ``heads`` whose ``pos`` is ``PUNCT`` and which has at
    least one right child, the first right child takes over the punctuation
    token's head and dependency label.  The punctuation token, its other
    children, and any entry of ``heads`` that pointed at it are re-attached to
    that child, and the punctuation token is relabelled ``"punct"``.

    Tokens are modified in place.  The returned list is a copy of ``heads``
    in which each demoted punctuation token is replaced by its promoted child.
    """
    result = list(heads)
    for idx, punct in enumerate(result):
        if punct.pos != PUNCT:
            continue
        right_children = list(punct.rights)  # type: ignore
        if not right_children:
            continue
        new_head, *other_rights = right_children
        # Promote the child first; ``lefts`` below is read after this change.
        new_head.head = punct.head
        new_head.dep_ = punct.dep_
        dependents = other_rights + list(punct.lefts)  # type: ignore
        dependents.append(punct)
        if new_head.dep_ == "ROOT":
            # The new head is included so that a root ends up heading itself.
            dependents.append(new_head)
        dependents += [u for u in result if u.head == punct]
        for dependent in dependents:
            dependent.head = new_head
        punct.dep_ = "punct"
        result[idx] = new_head
    return result


def _modify_head_flat(heads: List[Token]) -> List[Token]:
s = [t for t in heads]
for i, t in enumerate(s):
Expand All @@ -119,7 +141,7 @@ def _modify_head_flat(heads: List[Token]) -> List[Token]:
continue
h.head = t.head
h.dep_ = t.dep_
x = x[1:]
x = x[1:] + [u for u in t.lefts] # type: ignore
x += [t, h] if h.dep_ == "ROOT" else [t]
x += [u for u in s if u.head == t]
for u in x:
Expand Down
3 changes: 2 additions & 1 deletion tests/pipelines/knp/test_dependency_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def test_dependency_deps(nlp, text, deps):
("象は鼻が長い", [4, 0, 4, 2, 4], ["dislocated", "case", "nsubj", "case", "ROOT"]),
("リンゴとバナナとミカン", [0, 0, 0, 2, 0], ["ROOT", "case", "conj", "case", "conj"]),
("三匹の豚", [3, 0, 0, 3], ["nummod", "clf", "case", "ROOT"]),
("御盃を相交わす", [1, 4, 1, 4, 4], ["compound", "obj", "case", "advmod", "ROOT"])
("御盃を相交わす", [1, 4, 1, 4, 4], ["compound", "obj", "case", "advmod", "ROOT"]),
("子供を「僕」と呼ぶ", [6, 0, 3, 6, 3, 3, 6], ["obj", "case", "punct", "obl", "punct", "case", "ROOT"])
])
def test_dependency_parse_deps(nlp, text, heads, deps):
doc = nlp(text)
Expand Down

0 comments on commit 819fe11

Please sign in to comment.