# Image for the spaCy token annotator service (annotators/spacy_annotator).
FROM python:3.8.4

# Build arguments are re-exported as environment variables: server.py reads
# SPACY_MODEL and TOKEN_ATTRIBUTES from the environment at start-up.
ARG SRC_DIR
ENV SRC_DIR ${SRC_DIR}
ARG SERVICE_PORT
ENV SERVICE_PORT ${SERVICE_PORT}
ARG SPACY_MODEL
ENV SPACY_MODEL ${SPACY_MODEL}
ARG TOKEN_ATTRIBUTES
ENV TOKEN_ATTRIBUTES ${TOKEN_ATTRIBUTES}

RUN mkdir /src

# Install dependencies and download the model BEFORE copying the sources, so
# that editing the service code does not invalidate these expensive layers.
COPY $SRC_DIR/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt
RUN python -m spacy download ${SPACY_MODEL}

COPY $SRC_DIR /src/
COPY ./common/ /src/common/

WORKDIR /src

# Without --bind gunicorn listens on 127.0.0.1:8000, which is unreachable from
# other containers and ignores SERVICE_PORT. The shell form of CMD is kept so
# that the variable is expanded at container start.
CMD gunicorn --workers=2 --bind 0.0.0.0:${SERVICE_PORT:-8000} server:app
def get_result(request, nlp=None, token_attributes=None):
    """Annotate every sentence of a request with per-token attributes.

    Args:
        request: object whose ``json`` attribute holds the parsed payload;
            the texts are read from its ``"sentences"`` key.
        nlp: callable that turns one text into an iterable of tokens.
            Defaults to the module-level ``spacy_nlp`` pipeline.
        token_attributes: names of the token attributes to export.
            Defaults to the module-level ``TOKEN_ATTRIBUTES``.

    Returns:
        A list with one entry per sentence. Each entry is a list of dicts,
        one per token, holding ``"text"`` plus the stringified value of
        every requested attribute.
    """
    # Equivalent to the module-level ``logger``: getLogger returns the same
    # object for the same name.
    log = logging.getLogger(__name__)
    started = time.perf_counter()

    if nlp is None:
        nlp = spacy_nlp
    if token_attributes is None:
        token_attributes = TOKEN_ATTRIBUTES
    # An empty TOKEN_ATTRIBUTES variable splits into [""], and
    # getattr(token, "") raises AttributeError, so blank names are dropped.
    attributes = [attr for attr in token_attributes if attr]

    payload = request.json
    if not isinstance(payload, dict) or "sentences" not in payload:
        # Answer with an empty annotation instead of failing with a 500.
        log.warning("spacy_annotator got a payload without 'sentences'")
        sentences = []
    else:
        sentences = payload["sentences"] or []

    result = []
    for sentence in sentences:
        annotated = []
        for token in nlp(sentence):
            token_info = {"text": token.text}
            # Values are stringified so that the result is JSON-serializable.
            token_info.update((attr, str(getattr(token, attr))) for attr in attributes)
            annotated.append(token_info)
        result.append(annotated)

    elapsed = time.perf_counter() - started
    log.info("spacy_annotator exec time: %.3fs", elapsed)
    return result
def main():
    """Smoke-test the ``/respond`` endpoint of the running annotator.

    Posts a single Russian sentence and compares the returned token
    annotations with the expected ("gold") ones.

    Raises:
        requests.HTTPError: the service answered with an error status.
        AssertionError: the returned annotations differ from the gold ones.
    """
    url = f"http://0.0.0.0:{SERVICE_PORT}/respond"
    input_data = {"sentences": ["джейсон стетхэм хочет есть."]}
    gold = [
        [
            {
                "dep_": "nsubj",
                "ent_iob_": "B",
                "ent_type_": "PER",
                "lemma_": "джейсон",
                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
                "pos_": "PROPN",
                "text": "джейсон",
            },
            {
                "dep_": "appos",
                "ent_iob_": "I",
                "ent_type_": "PER",
                "lemma_": "стетхэм",
                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
                "pos_": "PROPN",
                "text": "стетхэм",
            },
            {
                "dep_": "ROOT",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": "хотеть",
                "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act",
                "pos_": "VERB",
                "text": "хочет",
            },
            {
                "dep_": "xcomp",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": "есть",
                "morph": "Aspect=Imp|VerbForm=Inf|Voice=Act",
                "pos_": "VERB",
                "text": "есть",
            },
            {
                "dep_": "punct",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": ".",
                "morph": "",
                "pos_": "PUNCT",
                "text": ".",
            },
        ]
    ]

    # A timeout keeps a hung container from blocking the test run forever.
    response = requests.post(url, json=input_data, timeout=30)
    # Surface HTTP errors directly rather than as a confusing JSON mismatch.
    response.raise_for_status()
    result = response.json()
    # Both payloads go into the message so a failure is self-explanatory.
    assert result == gold, f"Got\n{result}\n, but expected:\n{gold}"
    print("Success!")
a/assistant_dists/dream_russian/dev.yml +++ b/assistant_dists/dream_russian/dev.yml @@ -71,6 +71,11 @@ services: - "~/.deeppavlov:/root/.deeppavlov" ports: - 8074:8074 + spacy-annotator: + volumes: + - "./annotators/spacy_annotator:/src" + ports: + - 8125:8125 dff-friendship-skill: volumes: - "./skills/dff_friendship_skill:/src" diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml index 101f7a8025..8c7ebd7d1f 100644 --- a/assistant_dists/dream_russian/docker-compose.override.yml +++ b/assistant_dists/dream_russian/docker-compose.override.yml @@ -7,7 +7,7 @@ services: ner:8021, personal-info-skill:8030, spelling-preprocessing:8074, entity-linking:8075, wiki-parser:8077, dff-generative-skill:8092, dff-friendship-skill:8086, dff-wiki-skill:8111, entity-detection:8103, dialogpt:8091, - dff-template-skill:8120" + dff-template-skill:8120, spacy-annotator:8125" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} dff-program-y-skill: @@ -222,6 +222,26 @@ services: reservations: memory: 256M + spacy-annotator: + env_file: [.env] + build: + args: + SERVICE_PORT: 8125 + SRC_DIR: annotators/spacy_annotator + SPACY_MODEL: ru_core_news_sm + TOKEN_ATTRIBUTES: pos_|dep_|lemma_|ent_iob_|ent_type_|morph + context: ./ + dockerfile: annotators/spacy_annotator/Dockerfile + command: flask run -h 0.0.0.0 -p 8125 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 128M + reservations: + memory: 128M + dff-friendship-skill: env_file: [.env] build: diff --git a/assistant_dists/dream_russian/pipeline_conf.json b/assistant_dists/dream_russian/pipeline_conf.json index aa477367e9..7c29c99d05 100644 --- a/assistant_dists/dream_russian/pipeline_conf.json +++ b/assistant_dists/dream_russian/pipeline_conf.json @@ -74,6 +74,16 @@ "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", "state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn" }, + 
"spacy_annotator": { + "connector": { + "protocol": "http", + "timeout": 1, + "url": "http://spacy-annotator:8125/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:last_utt_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn" + }, "badlisted_words": { "connector": { "protocol": "http", @@ -290,6 +300,19 @@ ], "state_manager_method": "add_hypothesis_annotation_batch" }, + "spacy_annotator": { + "connector": { + "protocol": "http", + "timeout": 1, + "url": "http://spacy-annotator:8125/respond_batch" + }, + "dialog_formatter": "state_formatters.dp_formatters:hypotheses_list", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "skills" + ], + "state_manager_method": "add_hypothesis_annotation_batch" + }, "entity_detection": { "connector": { "protocol": "http", diff --git a/state_formatters/utils.py b/state_formatters/utils.py index e3bf795d5a..37956faf4a 100644 --- a/state_formatters/utils.py +++ b/state_formatters/utils.py @@ -264,7 +264,7 @@ def dff_formatter( "human_utter_index_batch": [human_utter_index], "dialog_batch": [new_dialog], f"{state_name}_batch": [state], - f"dff_shared_state_batch": [dff_shared_state], + "dff_shared_state_batch": [dff_shared_state], "entities_batch": [entities], "used_links_batch": [used_links], "age_group_batch": [age_group],