Feat/dialogrpt ru (#121)

* fix: file drafts * feat: files for dialogrpt * feat: dialogrpt pipeline and scores * feat: dialogrpt pipeline and scores * feat: dialogrpt readme * fix: small readme * fix: sno healthcheck * feat: add dialogrpt to pipeline * fix: codestyle * fix: test files * feat: upd packages in dockerfile * fix: path to file * fix: shared file * fix: codestyle * fix: imports * fix: option consider * fix: option consider * fix: codestyle * fix: vars * fix: test file * fix: convert to list predictions * fix: tests * fix: codestyle * fix: codestyle * fix: codestyle * fix: readme * fix: dialogrpt to tests * feat: no extra files, add tokenizer as parameter * fix: codestyle * fix: var name * fix: batch prediction * fix: batch prediction parameter * fix: test choice * fix: format values * fix: codestyle * fix: upd deeppavlov download * fix: dialogrpt container name * fix: dialogrpt as hyp annotator * fix: dialogrpt test
deeppavlov · Jun 28, 2022 · 999d17c · 999d17c
1 parent b28604d
commit 999d17c
Show file tree

Hide file tree

Showing 18 changed files with 1,668 additions and 1 deletion.
diff --git a/assistant_dists/dream_russian/cpu.yml b/assistant_dists/dream_russian/cpu.yml
@@ -10,3 +10,6 @@ services:
   dialogpt:
     environment:
       CUDA_VISIBLE_DEVICES: ""
+  dialogrpt:
+    environment:
+      CUDA_VISIBLE_DEVICES: ""
diff --git a/assistant_dists/dream_russian/dev.yml b/assistant_dists/dream_russian/dev.yml
@@ -100,6 +100,11 @@ services:
       - "./common:/src/common"
     ports:
       - 8092:8092
+  dialogrpt:
+    volumes:
+      - "./services/dialogrpt_ru:/src"
+    ports:
+      - 8122:8122
   dff-template-skill:
     volumes:
       - "./skills/dff_template_skill:/src"

diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml
@@ -297,6 +297,25 @@ services:
         reservations:
           memory: 128M
 
+  dialogrpt:  # Russian DialogRPT model service; built from ./services/dialogrpt_ru/
+    env_file: [ .env ]
+    build:
+      context: ./services/dialogrpt_ru/
+      args:
+        SERVICE_PORT: 8122  # same port as `command` below, dev.yml and the pipeline_conf.json URL
+        PRETRAINED_MODEL_FNAME: dialogrpt_ru_ckpt_v0.pth  # checkpoint the Dockerfile downloads into /data/ at build time
+        TOKENIZER_NAME_OR_PATH: "Grossmend/rudialogpt3_medium_based_on_gpt2"  # exported as an env var by the Dockerfile; presumably a Hugging Face model id - verify in server code
+    command: flask run -h 0.0.0.0 -p 8122  # replaces the Dockerfile's gunicorn CMD
+    environment:
+      - CUDA_VISIBLE_DEVICES=0  # cpu.yml sets this to "" and test.yml sets it to 7
+      - FLASK_APP=server  # app module picked up by `flask run`
+    deploy:
+      resources:
+        limits:
+          memory: 2.5G
+        reservations:
+          memory: 2.5G
+
   dff-template-skill:
     env_file: [.env]
     build:
@@ -312,4 +331,5 @@ services:
           memory: 128M
         reservations:
           memory: 128M
+
 version: '3.7'
diff --git a/assistant_dists/dream_russian/pipeline_conf.json b/assistant_dists/dream_russian/pipeline_conf.json
@@ -302,6 +302,19 @@
                     "skills"
                 ],
                 "state_manager_method": "add_hypothesis_annotation_batch"
+            },
+            "dialogrpt": {
+                "connector": {
+                    "protocol": "http",
+                    "timeout": 1,
+                    "url": "http://dialogrpt:8122/respond"
+                },
+                "dialog_formatter": "state_formatters.dp_formatters:hypotheses_with_context_list",
+                "response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
+                "previous_services": [
+                    "skills"
+                ],
+                "state_manager_method": "add_hypothesis_annotation_batch"
             }
         },
         "response_selectors": {

diff --git a/assistant_dists/dream_russian/test.yml b/assistant_dists/dream_russian/test.yml
@@ -41,5 +41,8 @@ services:
   dialogpt:
     environment:
       - CUDA_VISIBLE_DEVICES=7
+  dialogrpt:
+    environment:
+      - CUDA_VISIBLE_DEVICES=7
   dff-template-skill:
 version: '3.7'
diff --git a/response_selectors/convers_evaluation_based_selector/tag_based_selection.py b/response_selectors/convers_evaluation_based_selector/tag_based_selection.py
@@ -255,6 +255,9 @@ def compute_curr_single_scores(candidates, scores, confidences):
     if all(["hypothesis_scorer" in cand["annotations"] for cand in candidates]):
         for i in range(len(candidates)):
             curr_single_scores.append(candidates[i]["annotations"]["hypothesis_scorer"])
+    elif all(["dialogrpt" in cand["annotations"] for cand in candidates]):
+        for i in range(len(candidates)):
+            curr_single_scores.append(candidates[i]["annotations"]["dialogrpt"])
     else:
         for i in range(len(scores)):
             cand_scores = scores[i]

diff --git a/services/dialogrpt_ru/Dockerfile b/services/dialogrpt_ru/Dockerfile
@@ -0,0 +1,26 @@
+# syntax=docker/dockerfile:experimental
+
+FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime
+
+RUN apt-get update && apt-get install -y --allow-unauthenticated wget && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+
+ARG PRETRAINED_MODEL_FNAME
+ENV PRETRAINED_MODEL_FNAME ${PRETRAINED_MODEL_FNAME}
+ARG SERVICE_PORT
+ENV SERVICE_PORT ${SERVICE_PORT}
+ARG TOKENIZER_NAME_OR_PATH
+ENV TOKENIZER_NAME_OR_PATH ${TOKENIZER_NAME_OR_PATH}
+
+RUN mkdir /data/
+
+RUN wget -c -q http://files.deeppavlov.ai/deeppavlov_data/${PRETRAINED_MODEL_FNAME} -P /data/
+
+COPY ./requirements.txt /src/requirements.txt
+RUN pip install -r /src/requirements.txt
+
+COPY . /src
+
+CMD gunicorn --workers=1 server:app -b 0.0.0.0:${SERVICE_PORT} --timeout=300
+
diff --git a/services/dialogrpt_ru/README.md b/services/dialogrpt_ru/README.md
@@ -0,0 +1,15 @@
+# Russian DialogRPT model
+
+Code from https://github.com/golsun/DialogRPT
+
+Trained on 827k samples (plus 95k validation samples) from the Russian website Pikabu.
+
+The data was parsed from Pikabu by `zhirzemli` (OpenDataScience Slack nickname); the parsing code is available [on GitHub](https://github.com/alexeykarnachev/dialogs_data_parsers)
+and the data is available [here](https://drive.google.com/file/d/1XYCprTqn_MlzDD9qgj7ANJkwFigK66mv/view?usp=sharing).
+
+Final accuracy on the validation set: 0.64.
+
+Trained on 8 GPUs with the following command:
+```
+python src/main.py train --data=data/out/updown  --min_score_gap=20 --min_rank_gap=0.5 --max_seq_len 256 --batch 16 1>out.txt 2>&1
+```