Skip to content

Commit

Permalink
Feat/spacy lemmatizer (#129)
Browse files Browse the repository at this point in the history
* fix: add spacy annotator

* fix: usage of spacy attributes

* fix: test spacy annotator

* fix: add params

* fix: add params

* fix: fix test

* fix: rights on file

* fix: codestyle

* fix: extra f string
  • Loading branch information
dilyararimovna authored Mar 21, 2022
1 parent 531a34b commit 40ec573
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 2 deletions.
23 changes: 23 additions & 0 deletions annotators/spacy_annotator/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM python:3.8.4

# Build arguments are re-exported as environment variables so that processes
# inside the container can read them: server.py reads SPACY_MODEL and
# TOKEN_ATTRIBUTES, test.py reads SERVICE_PORT.
ARG SRC_DIR
ENV SRC_DIR ${SRC_DIR}
ARG SERVICE_PORT
ENV SERVICE_PORT ${SERVICE_PORT}
ARG SPACY_MODEL
ENV SPACY_MODEL ${SPACY_MODEL}
ARG TOKEN_ATTRIBUTES
ENV TOKEN_ATTRIBUTES ${TOKEN_ATTRIBUTES}

RUN mkdir /src

# Install the dependencies and download the model BEFORE copying the sources,
# so these expensive layers stay cached when only the service code changes.
COPY $SRC_DIR/requirements.txt /src/requirements.txt
RUN pip install --no-cache-dir -r /src/requirements.txt
RUN python -m spacy download ${SPACY_MODEL}

COPY $SRC_DIR /src/
COPY ./common/ /src/common/

WORKDIR /src

CMD gunicorn --workers=2 server:app
1 change: 1 addition & 0 deletions annotators/spacy_annotator/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
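This is the spaCy annotator: for every token of each input sentence it returns the token text and the configured spaCy token attributes (e.g. lemma, part of speech, dependency label).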
6 changes: 6 additions & 0 deletions annotators/spacy_annotator/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
flask==1.1.1
itsdangerous==2.0.1
gunicorn==20.0.4
sentry-sdk==0.13.4
requests==2.22.0
spacy==3.2.0
53 changes: 53 additions & 0 deletions annotators/spacy_annotator/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import logging
import time
from os import getenv

import sentry_sdk
import spacy
from flask import Flask, request, jsonify


sentry_sdk.init(getenv("SENTRY_DSN"))

# spaCy pipeline named by the SPACY_MODEL environment variable.
spacy_nlp = spacy.load(getenv("SPACY_MODEL"))
# "|"-separated names of the token attributes to export, e.g. "pos_|lemma_".
# Blank names are dropped: an unset variable would otherwise fail at import
# time (None has no .split) and an empty one would produce [""], making
# getattr(token, "") fail on every request.
TOKEN_ATTRIBUTES = [name.strip() for name in getenv("TOKEN_ATTRIBUTES", "").split("|") if name.strip()]

logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG)
logger = logging.getLogger(__name__)

app = Flask(__name__)


def get_result(request):
    """Annotate every sentence of the request with spaCy token attributes.

    Reads the list stored under the "sentences" key of the request's JSON
    body and runs each item through ``spacy_nlp``.

    Returns:
        One list per sentence, each holding one dict per token: the token
        "text" plus every attribute named in ``TOKEN_ATTRIBUTES``, converted
        with ``str``.
    """
    st_time = time.time()
    annotations = []
    for sentence in request.json["sentences"]:
        annotations.append(
            [
                {"text": token.text, **{name: str(getattr(token, name)) for name in TOKEN_ATTRIBUTES}}
                for token in spacy_nlp(sentence)
            ]
        )
    total_time = time.time() - st_time
    logger.info(f"spacy_annotator exec time: {total_time:.3f}s")
    return annotations


@app.route("/respond", methods=["POST"])
def respond():
    """Handle POST /respond: return the per-sentence annotations as JSON."""
    return jsonify(get_result(request))


@app.route("/respond_batch", methods=["POST"])
def respond_batch():
    """Handle POST /respond_batch: return the annotations wrapped as one batch."""
    annotations = get_result(request)
    batch_payload = [{"batch": annotations}]
    return jsonify(batch_payload)


# Direct-run entry point: starts Flask's built-in server on all interfaces,
# port 3000, with debug mode off. Only reached when the module is executed as
# a script, not when it is imported as ``server:app``.
if __name__ == "__main__":
app.run(debug=False, host="0.0.0.0", port=3000)
67 changes: 67 additions & 0 deletions annotators/spacy_annotator/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os
import requests


# Port of the service under test, read from the SERVICE_PORT environment
# variable. The variable must be set: int(None) raises TypeError otherwise.
SERVICE_PORT = int(os.getenv("SERVICE_PORT"))


def main():
    """Post one Russian sentence to the running annotator and check the reply.

    Sends the sentence to the ``/respond`` endpoint on ``SERVICE_PORT`` and
    compares the decoded JSON response with the expected token annotations.

    Raises:
        AssertionError: if the response differs from the expected value; the
            message carries both the actual and the expected annotations.
    """
    url = f"http://0.0.0.0:{SERVICE_PORT}/respond"
    input_data = {"sentences": ["джейсон стетхэм хочет есть."]}
    gold = [
        [
            {
                "dep_": "nsubj",
                "ent_iob_": "B",
                "ent_type_": "PER",
                "lemma_": "джейсон",
                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
                "pos_": "PROPN",
                "text": "джейсон",
            },
            {
                "dep_": "appos",
                "ent_iob_": "I",
                "ent_type_": "PER",
                "lemma_": "стетхэм",
                "morph": "Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing",
                "pos_": "PROPN",
                "text": "стетхэм",
            },
            {
                "dep_": "ROOT",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": "хотеть",
                "morph": "Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act",
                "pos_": "VERB",
                "text": "хочет",
            },
            {
                "dep_": "xcomp",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": "есть",
                "morph": "Aspect=Imp|VerbForm=Inf|Voice=Act",
                "pos_": "VERB",
                "text": "есть",
            },
            {
                "dep_": "punct",
                "ent_iob_": "O",
                "ent_type_": "",
                "lemma_": ".",
                "morph": "",
                "pos_": "PUNCT",
                "text": ".",
            },
        ]
    ]

    result = requests.post(url, json=input_data).json()
    # Put the actual response into the assertion message itself: a print()
    # call used as the message evaluates to None and leaves the error empty.
    assert result == gold, f"Got\n{result}\n, but expected:\n{gold}"
    print("Success!")


# Run the check only when this file is executed as a script.
if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions annotators/spacy_annotator/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

# Run the annotator's test script; it posts a request to the service listening
# on SERVICE_PORT, so the service must already be up.
python test.py
5 changes: 5 additions & 0 deletions assistant_dists/dream_russian/dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ services:
- "~/.deeppavlov:/root/.deeppavlov"
ports:
- 8074:8074
spacy-annotator:
volumes:
- "./annotators/spacy_annotator:/src"
ports:
- 8125:8125
dff-friendship-skill:
volumes:
- "./skills/dff_friendship_skill:/src"
Expand Down
22 changes: 21 additions & 1 deletion assistant_dists/dream_russian/docker-compose.override.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ services:
ner:8021, personal-info-skill:8030,
spelling-preprocessing:8074, entity-linking:8075, wiki-parser:8077, dff-generative-skill:8092,
dff-friendship-skill:8086, dff-wiki-skill:8111, entity-detection:8103, dialogpt:8091,
dff-template-skill:8120"
dff-template-skill:8120, spacy-annotator:8125"
WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480}

dff-program-y-skill:
Expand Down Expand Up @@ -222,6 +222,26 @@ services:
reservations:
memory: 256M

spacy-annotator:
env_file: [.env]
build:
args:
SERVICE_PORT: 8125
SRC_DIR: annotators/spacy_annotator
SPACY_MODEL: ru_core_news_sm
TOKEN_ATTRIBUTES: pos_|dep_|lemma_|ent_iob_|ent_type_|morph
context: ./
dockerfile: annotators/spacy_annotator/Dockerfile
command: flask run -h 0.0.0.0 -p 8125
environment:
- FLASK_APP=server
deploy:
resources:
limits:
memory: 128M
reservations:
memory: 128M

dff-friendship-skill:
env_file: [.env]
build:
Expand Down
23 changes: 23 additions & 0 deletions assistant_dists/dream_russian/pipeline_conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@
"response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
"state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn"
},
"spacy_annotator": {
"connector": {
"protocol": "http",
"timeout": 1,
"url": "http://spacy-annotator:8125/respond"
},
"dialog_formatter": "state_formatters.dp_formatters:last_utt_dialog",
"response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
"state_manager_method": "add_annotation_and_reset_human_attributes_for_first_turn"
},
"badlisted_words": {
"connector": {
"protocol": "http",
Expand Down Expand Up @@ -290,6 +300,19 @@
],
"state_manager_method": "add_hypothesis_annotation_batch"
},
"spacy_annotator": {
"connector": {
"protocol": "http",
"timeout": 1,
"url": "http://spacy-annotator:8125/respond_batch"
},
"dialog_formatter": "state_formatters.dp_formatters:hypotheses_list",
"response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
"previous_services": [
"skills"
],
"state_manager_method": "add_hypothesis_annotation_batch"
},
"entity_detection": {
"connector": {
"protocol": "http",
Expand Down
2 changes: 1 addition & 1 deletion state_formatters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def dff_formatter(
"human_utter_index_batch": [human_utter_index],
"dialog_batch": [new_dialog],
f"{state_name}_batch": [state],
f"dff_shared_state_batch": [dff_shared_state],
"dff_shared_state_batch": [dff_shared_state],
"entities_batch": [entities],
"used_links_batch": [used_links],
"age_group_batch": [age_group],
Expand Down

0 comments on commit 40ec573

Please sign in to comment.