dependency improvement for KNP (#47)

* 解析格:ガ２ support
* 並列タイプ supported as "conj"
* rewrite for flake8
* support "nummod"
* treat 接頭辞 as non-head
* "# type: ignore" for mypy
* rewrite for flake8
* rewrite for flake8
* rewrite for flake8
* split into two functions
* rewrite for flake8
* zero-length check for .rights
* three-parameter test for 象は鼻が長い (Is this style right?)
* add three examples
* double line breaks
* four dependency tests merged into one .parametrize
* rewrite for flake8
PKSHATechnology-Research · Apr 14, 2020 · cb1fb2b · cb1fb2b
1 parent f7669e4
commit cb1fb2b
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 13 deletions.
diff --git a/camphr/pipelines/knp/dependency_parser.py b/camphr/pipelines/knp/dependency_parser.py
@@ -1,5 +1,5 @@
 """Convert KNP dependency parsing result to spacy format."""
-from typing import Any, Dict, Iterable, Optional
+from typing import Any, Dict, Iterable, Optional, List
 
 import spacy
 from spacy.symbols import (
@@ -24,6 +24,7 @@
 @spacy.component("knp_dependency_parser", requires=("doc._.knp_tag_parent",))
 def knp_dependency_parser(doc: Doc) -> Doc:
     tag_spans: Iterable[Span] = doc._.get(KNP_USER_KEYS.tag.spans)
+    s = []
     for tag in tag_spans:
         parent: Optional[Span] = tag._.get(KNP_USER_KEYS.tag.parent)
         if parent is not None:
@@ -35,6 +36,9 @@ def knp_dependency_parser(doc: Doc) -> Doc:
         for c in tag[1:]:
             c.head = tag[0]
             c.dep_ = _get_child_dep(c)
+        s.append(tag[0])
+    s = _modify_head_flat(s)
+    s = _modify_head_conj(s)
     doc.is_parsed = True
     return doc
 
@@ -56,21 +60,26 @@ def _get_dep(tag: Token) -> str:
 
 def _get_dep_noun(tag: Token) -> str:
     f: Dict[str, Any] = tag._.knp_morph_tag._.knp_tag_element.features
-    if "係" not in f and "解析格" not in f:
+    if "係" not in f:
         return "dep"
-    k = f["係"] if f["係"] != "未格" else f["解析格"] + "格"
-    x = {"隣": "nmod", "文節内": "compound", "ガ格": "nsubj", "ヲ格": "obj"}
+    k = f["係"] if f["係"] != "未格" or "解析格" not in f else f["解析格"] + "格"
+    x = {"隣": "nmod", "文節内": "compound", "ガ格": "nsubj", "ヲ格": "obj", "ガ２格": "dislocated"}
     if k in x:
         return x[k]
-    elif k != "ノ格":
-        return "obl"
-    if tag.head.pos in {VERB, ADJ}:
-        return "nsubj"
-    elif tag.pos in {DET, PRON}:
-        tag.pos = DET
-        return "det"
-    else:
-        return "nmod"
+    elif k == "ノ格":
+        if tag.head.pos in {VERB, ADJ}:
+            return "nsubj"
+        elif tag.pos in {DET, PRON}:
+            tag.pos = DET
+            return "det"
+        else:
+            return "nummod" if tag.pos == NUM else "nmod"
+    elif "並列タイプ" in f:
+        if tag.head.pos in {VERB, ADJ}:
+            return "obl"
+        else:
+            return "conj"
+    return "obl"
 
 
 def _get_child_dep(tag: Token) -> str:
@@ -90,3 +99,46 @@ def _get_child_dep(tag: Token) -> str:
         return "punct"
     else:
         return "clf" if pp == NUM else "flat"
+
+
+def _modify_head_flat(heads: List[Token]) -> List[Token]:
+    s = [t for t in heads]
+    for i, t in enumerate(s):
+        if not t.tag_.startswith("接頭辞"):
+            continue
+        x = [u for u in t.rights]  # type: ignore
+        if len(x) == 0:
+            continue
+        h = x[0]
+        if t.pos == NOUN and h.dep_ == "flat":
+            d = "compound"
+        elif t.pos == ADV and h.dep_ == "aux":
+            d = "advmod"
+            h.pos = VERB
+        else:
+            continue
+        h.head = t.head
+        h.dep_ = t.dep_
+        x = x[1:]
+        x += [t, h] if h.dep_ == "ROOT" else [t]
+        x += [u for u in s if u.head == t]
+        for u in x:
+            u.head = h
+        t.dep_ = d
+        s[i] = h
+    return s
+
+
+def _modify_head_conj(heads: List[Token]) -> List[Token]:
+    s = [t for t in heads]
+    for t in s:
+        while t.dep_ == "conj" and t.i < t.head.i:
+            h = t.head
+            t.head = h.head
+            t.dep_ = h.dep_
+            x = [h, t] if t.dep_ == "ROOT" else [h]
+            x += [u for u in s if u.head == h and u.i < t.i]
+            for u in x:
+                u.head = t
+            h.dep_ = "conj"
+    return s
diff --git a/tests/pipelines/knp/test_dependency_parser.py b/tests/pipelines/knp/test_dependency_parser.py
@@ -26,3 +26,16 @@ def test_dependency_deps(nlp, text, deps):
     doc = nlp(text)
     for token, depi in itertools.zip_longest(doc, deps):
         assert token.dep_ == depi
+
+
+@pytest.mark.parametrize("text,heads,deps", [
+    ("象は鼻が長い", [4, 0, 4, 2, 4], ["dislocated", "case", "nsubj", "case", "ROOT"]),
+    ("リンゴとバナナとミカン", [0, 0, 0, 2, 0], ["ROOT", "case", "conj", "case", "conj"]),
+    ("三匹の豚", [3, 0, 0, 3], ["nummod", "clf", "case", "ROOT"]),
+    ("御盃を相交わす", [1, 4, 1, 4, 4], ["compound", "obj", "case", "advmod", "ROOT"])
+])
+def test_dependency_parse_deps(nlp, text, heads, deps):
+    doc = nlp(text)
+    for token, headi, depi in itertools.zip_longest(doc, heads, deps):
+        assert token.head.i == headi
+        assert token.dep_ == depi