Proper treatment of PUNCTs for KNP (#48)

* Prevent JUMAN from treating "(" and ")" as 未定義語 (undefined words)
* Proper treatment of PUNCT: 括弧始-PUNCTs are not suitable as head tokens in Universal Dependencies.
* small refinement for future enhancement
* revert for han_to_zen_normalize
* bug fix
* support 形態素連結
* replace "(" and ")" with their 全角 (full-width) forms
* rewrite for flake8
* rewrite for flake8 again
* change List[Morpheme] to List
* test for _modify_head_punct
PKSHATechnology-Research · Apr 22, 2020 · 819fe11 · 819fe11
1 parent 78c41fe
commit 819fe11
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 16 deletions.
diff --git a/camphr/lang/juman/__init__.py b/camphr/lang/juman/__init__.py
@@ -16,7 +16,7 @@
 ShortUnitWord = namedtuple(
     "ShortUnitWord", ["surface", "lemma", "pos", "fstring", "space"]
 )
-_REPLACE_STRINGS = {"\t": "　", "\r": "", "（": "(", "）": ")", "\n": "　"}
+_REPLACE_STRINGS = {"\t": "　", "\r": "", "\n": "　"}
 
 
 def han_to_zen_normalize(text):

diff --git a/camphr/pipelines/knp/__init__.py b/camphr/pipelines/knp/__init__.py
@@ -80,21 +80,10 @@ def __call__(self, doc: Doc) -> Doc:
             blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
             mlist = blist.mrph_list()
             tlist = blist.tag_list()
+            if len(mlist) != len(sent):
+                mlist = _separate_mrph(mlist, sent)
             for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
                 sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
-            if len(mlist) != len(sent):
-                t, m = None, None
-                for t, m in zip(sent, mlist):
-                    if t.text != m.midasi:
-                        break
-                raise ValueError(
-                    f"""Internal error occured
-            Sentence: {sent.text}
-            mlist : {[m.midasi for m in mlist]}
-            tokens: {[t.text for t in sent]}
-            diff  : {m.midasi}, {t.text}
-            """
-                )
             for m, token in zip(mlist, sent):
                 token._.set(KNP_USER_KEYS.morph.element, m)
         doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
@@ -103,6 +92,31 @@ def __call__(self, doc: Doc) -> Doc:
         return doc
 
 
+def _separate_mrph(mlist: List, sent: Span) -> List:
+    mm = []
+    i = 0
+    for m in mlist:
+        if "形態素連結" in m.fstring:
+            j = len(m.midasi)
+            while j > 0:
+                mm.append(m)
+                j -= len(sent[i].text)
+                i += 1
+        elif sent[i].text == m.midasi:
+            mm.append(m)
+            i += 1
+        else:
+            raise ValueError(
+                f"""Internal error occured
+            Sentence: {sent.text}
+            mlist : {[m.midasi for m in mlist]}
+            tokens: {[t.text for t in sent]}
+            diff  : {m.midasi}, {sent[i].text}
+            """
+            )
+    return mm
+
+
 @curry
 @functools.lru_cache()
 def token_to_knp_span(type_: str, token: Token) -> Span:

diff --git a/camphr/pipelines/knp/dependency_parser.py b/camphr/pipelines/knp/dependency_parser.py
@@ -37,6 +37,7 @@ def knp_dependency_parser(doc: Doc) -> Doc:
             c.head = tag[0]
             c.dep_ = _get_child_dep(c)
         s.append(tag[0])
+    s = _modify_head_punct(s)
     s = _modify_head_flat(s)
     s = _modify_head_conj(s)
     doc.is_parsed = True
@@ -101,6 +102,27 @@ def _get_child_dep(tag: Token) -> str:
         return "clf" if pp == NUM else "flat"
 
 
+def _modify_head_punct(heads: List[Token]) -> List[Token]:
+    s = [t for t in heads]
+    for i, t in enumerate(s):
+        if t.pos != PUNCT:
+            continue
+        x = [u for u in t.rights]  # type: ignore
+        if len(x) == 0:
+            continue
+        h = x[0]
+        h.head = t.head
+        h.dep_ = t.dep_
+        x = x[1:] + [u for u in t.lefts]  # type: ignore
+        x += [t, h] if h.dep_ == "ROOT" else [t]
+        x += [u for u in s if u.head == t]
+        for u in x:
+            u.head = h
+        t.dep_ = "punct"
+        s[i] = h
+    return s
+
+
 def _modify_head_flat(heads: List[Token]) -> List[Token]:
     s = [t for t in heads]
     for i, t in enumerate(s):
@@ -119,7 +141,7 @@ def _modify_head_flat(heads: List[Token]) -> List[Token]:
             continue
         h.head = t.head
         h.dep_ = t.dep_
-        x = x[1:]
+        x = x[1:] + [u for u in t.lefts]  # type: ignore
         x += [t, h] if h.dep_ == "ROOT" else [t]
         x += [u for u in s if u.head == t]
         for u in x:

diff --git a/tests/pipelines/knp/test_dependency_parser.py b/tests/pipelines/knp/test_dependency_parser.py
@@ -32,7 +32,8 @@ def test_dependency_deps(nlp, text, deps):
     ("象は鼻が長い", [4, 0, 4, 2, 4], ["dislocated", "case", "nsubj", "case", "ROOT"]),
     ("リンゴとバナナとミカン", [0, 0, 0, 2, 0], ["ROOT", "case", "conj", "case", "conj"]),
     ("三匹の豚", [3, 0, 0, 3], ["nummod", "clf", "case", "ROOT"]),
-    ("御盃を相交わす", [1, 4, 1, 4, 4], ["compound", "obj", "case", "advmod", "ROOT"])
+    ("御盃を相交わす", [1, 4, 1, 4, 4], ["compound", "obj", "case", "advmod", "ROOT"]),
+    ("子供を「僕」と呼ぶ", [6, 0, 3, 6, 3, 3, 6], ["obj", "case", "punct", "obl", "punct", "case", "ROOT"])
 ])
 def test_dependency_parse_deps(nlp, text, heads, deps):
     doc = nlp(text)