Feat/spacy lemmatizer (#129)

* fix: add spacy annotator * fix: usage of spacy attributes * fix: test spacy annotator * fix: add params * fix: add params * fix: fix test * fix: rights on file * fix: codestyle * fix: extra f string
deeppavlov · Mar 21, 2022 · 40ec573 · 40ec573
1 parent 531a34b
commit 40ec573
Show file tree

Hide file tree

Showing 10 changed files with 203 additions and 2 deletions.
diff --git a/annotators/spacy_annotator/Dockerfile b/annotators/spacy_annotator/Dockerfile
@@ -0,0 +1,23 @@
+# Image for the spaCy annotator service.
+FROM python:3.8.4
+
+# Build arguments are re-exported as environment variables so they are
+# also visible to the running service (server.py reads SPACY_MODEL and
+# TOKEN_ATTRIBUTES via getenv).
+ARG SRC_DIR
+ENV SRC_DIR ${SRC_DIR}
+ARG SERVICE_PORT
+ENV SERVICE_PORT ${SERVICE_PORT}
+ARG SPACY_MODEL
+ENV SPACY_MODEL ${SPACY_MODEL}
+ARG TOKEN_ATTRIBUTES
+ENV TOKEN_ATTRIBUTES ${TOKEN_ATTRIBUTES}
+
+RUN mkdir /src
+
+# Install dependencies and download the model BEFORE copying the sources, so
+# that editing the service code does not invalidate these expensive layers.
+COPY $SRC_DIR/requirements.txt /src/requirements.txt
+RUN pip install -r /src/requirements.txt
+RUN python -m spacy download ${SPACY_MODEL}
+
+COPY $SRC_DIR /src/
+COPY ./common/ /src/common/
+
+WORKDIR /src
+
+CMD gunicorn --workers=2 server:app
diff --git a/annotators/spacy_annotator/README.txt b/annotators/spacy_annotator/README.txt
@@ -0,0 +1 @@
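+This is the spaCy annotator. It runs each input sentence through the spaCy model named by SPACY_MODEL and returns, for every token, its text plus the attributes listed in TOKEN_ATTRIBUTES.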
diff --git a/annotators/spacy_annotator/requirements.txt b/annotators/spacy_annotator/requirements.txt
@@ -0,0 +1,6 @@
+flask==1.1.1
+itsdangerous==2.0.1
+gunicorn==20.0.4
+sentry-sdk==0.13.4
+requests==2.22.0
+spacy==3.2.0
diff --git a/annotators/spacy_annotator/server.py b/annotators/spacy_annotator/server.py
@@ -0,0 +1,53 @@
+import logging
+import time
+from os import getenv
+
+import sentry_sdk
+import spacy
+from flask import Flask, request, jsonify
+
+
+# Initialise Sentry; the DSN is read from the SENTRY_DSN environment variable.
+sentry_sdk.init(getenv("SENTRY_DSN"))
+
+# Load the spaCy pipeline named by SPACY_MODEL once, at import time.
+spacy_nlp = spacy.load(getenv("SPACY_MODEL"))
+# Names of the token attributes to export, given as a "|"-separated string.
+# NOTE(review): getenv returns None when TOKEN_ATTRIBUTES is unset, so this
+# line then raises AttributeError at import time.
+TOKEN_ATTRIBUTES = getenv("TOKEN_ATTRIBUTES").split("|")
+
+logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+# Flask application object that the route handlers in this module register on.
+app = Flask(__name__)
+
+
+def get_result(request):
+    """Annotate every sentence of the request with the configured spaCy token attributes.
+
+    Args:
+        request: Flask request whose JSON body has a "sentences" entry; each
+            item of it is passed to the spaCy pipeline.
+
+    Returns:
+        A list with one entry per sentence. Each entry is a list of dicts, one
+        per token, holding the token "text" and, for every name in
+        TOKEN_ATTRIBUTES, that attribute converted with ``str``.
+    """
+    st_time = time.time()
+    annotations = []
+    for sentence in request.json["sentences"]:
+        sentence_tokens = []
+        for token in spacy_nlp(sentence):
+            token_info = {"text": token.text}
+            # Attributes are stringified so the result is JSON-serialisable.
+            token_info.update((name, str(getattr(token, name))) for name in TOKEN_ATTRIBUTES)
+            sentence_tokens.append(token_info)
+        annotations.append(sentence_tokens)
+    total_time = time.time() - st_time
+    logger.info(f"spacy_annotator exec time: {total_time:.3f}s")
+    return annotations
+
+
+@app.route("/respond", methods=["POST"])
+def respond():
+    """Handle POST /respond: return the per-sentence token annotations as JSON."""
+    return jsonify(get_result(request))
+
+
+@app.route("/respond_batch", methods=["POST"])
+def respond_batch():
+    """Handle POST /respond_batch: return the annotations wrapped as ``[{"batch": ...}]``."""
+    annotations = get_result(request)
+    batch_payload = [{"batch": annotations}]
+    return jsonify(batch_payload)
+
+
+if __name__ == "__main__":
+    app.run(debug=False, host="0.0.0.0", port=3000)
diff --git a/annotators/spacy_annotator/test.py b/annotators/spacy_annotator/test.py
@@ -0,0 +1,67 @@
+import os
+import requests
+
+
+SERVICE_PORT = int(os.getenv("SERVICE_PORT"))
+
+
+def main():
+    """Post one Russian sentence to the running service and compare the reply with the gold annotation.
+
+    Raises:
+        AssertionError: if the service response differs from ``gold``; the
+            message contains both the actual and the expected annotation.
+    """
+    url = f"http://0.0.0.0:{SERVICE_PORT}/respond"
+    input_data = {"sentences": ["джейсон стетхэм хочет есть."]}
+    gold = [
+        [
+            {
+                "dep_": "nsubj",
+                "ent_iob_": "B",
+                "ent_type_": "PER",
+                "lemma_": "джейсон",
+                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
+                "pos_": "PROPN",
+                "text": "джейсон",
+            },
+            {
+                "dep_": "appos",
+                "ent_iob_": "I",
+                "ent_type_": "PER",
+                "lemma_": "стетхэм",
+                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
+                "pos_": "PROPN",
+                "text": "стетхэм",
+            },
+            {
+                "dep_": "ROOT",
+                "ent_iob_": "O",
+                "ent_type_": "",
+                "lemma_": "хотеть",
+                "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act",
+                "pos_": "VERB",
+                "text": "хочет",
+            },
+            {
+                "dep_": "xcomp",
+                "ent_iob_": "O",
+                "ent_type_": "",
+                "lemma_": "есть",
+                "morph": "Aspect=Imp|VerbForm=Inf|Voice=Act",
+                "pos_": "VERB",
+                "text": "есть",
+            },
+            {
+                "dep_": "punct",
+                "ent_iob_": "O",
+                "ent_type_": "",
+                "lemma_": ".",
+                "morph": "",
+                "pos_": "PUNCT",
+                "text": ".",
+            },
+        ]
+    ]
+
+    result = requests.post(url, json=input_data).json()
+    # print() returns None, so using it as the assertion message would leave
+    # the AssertionError empty; format the actual and expected values instead.
+    assert result == gold, f"Got\n{result}\n, but expected:\n{gold}"
+    print("Success!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/annotators/spacy_annotator/test.sh b/annotators/spacy_annotator/test.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+python test.py
diff --git a/assistant_dists/dream_russian/dev.yml b/assistant_dists/dream_russian/dev.yml
@@ -71,6 +71,11 @@ services:
       - "~/.deeppavlov:/root/.deeppavlov"
     ports:
       - 8074:8074
+  spacy-annotator:
+    volumes:
+      - "./annotators/spacy_annotator:/src"
+    ports:
+      - 8125:8125
   dff-friendship-skill:
     volumes:
       - "./skills/dff_friendship_skill:/src"

diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml
@@ -7,7 +7,7 @@ services:
           ner:8021, personal-info-skill:8030,
           spelling-preprocessing:8074, entity-linking:8075, wiki-parser:8077, dff-generative-skill:8092,
           dff-friendship-skill:8086, dff-wiki-skill:8111, entity-detection:8103, dialogpt:8091,
-          dff-template-skill:8120"
+          dff-template-skill:8120, spacy-annotator:8125"
       WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480}
 
   dff-program-y-skill:
@@ -222,6 +222,26 @@ services:
         reservations:
           memory: 256M
 
+  spacy-annotator:
+    env_file: [.env]
+    build:
+      args:
+        SERVICE_PORT: 8125
+        SRC_DIR: annotators/spacy_annotator
+        SPACY_MODEL: ru_core_news_sm
+        TOKEN_ATTRIBUTES: pos_|dep_|lemma_|ent_iob_|ent_type_|morph
+      context: ./
+      dockerfile: annotators/spacy_annotator/Dockerfile
+    command: flask run -h 0.0.0.0 -p 8125
+    environment:
+      - FLASK_APP=server
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+        reservations:
+          memory: 128M
+
   dff-friendship-skill:
     env_file: [.env]
     build:

diff --git a/assistant_dists/dream_russian/pipeline_conf.json b/assistant_dists/dream_russian/pipeline_conf.json
@@ -74,6 +74,16 @@
                 "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
                 "state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn"
             },
+            "spacy_annotator": {
+                "connector": {
+                    "protocol": "http",
+                    "timeout": 1,
+                    "url": "http://spacy-annotator:8125/respond"
+                },
+                "dialog_formatter": "state_formatters.dp_formatters:last_utt_dialog",
+                "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
+                "state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn"
+            },
             "badlisted_words": {
                 "connector": {
                     "protocol": "http",
@@ -290,6 +300,19 @@
                 ],
                 "state_manager_method": "add_hypothesis_annotation_batch"
             },
+            "spacy_annotator": {
+                "connector": {
+                    "protocol": "http",
+                    "timeout": 1,
+                    "url": "http://spacy-annotator:8125/respond_batch"
+                },
+                "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list",
+                "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
+                "previous_services": [
+                    "skills"
+                ],
+                "state_manager_method": "add_hypothesis_annotation_batch"
+            },
             "entity_detection": {
                 "connector": {
                     "protocol": "http",

diff --git a/state_formatters/utils.py b/state_formatters/utils.py
@@ -264,7 +264,7 @@ def dff_formatter(
             "human_utter_index_batch": [human_utter_index],
             "dialog_batch": [new_dialog],
             f"{state_name}_batch": [state],
-            f"dff_shared_state_batch": [dff_shared_state],
+            "dff_shared_state_batch": [dff_shared_state],
             "entities_batch": [entities],
             "used_links_batch": [used_links],
             "age_group_batch": [age_group],