deeppavlov · dilyararimovna · Jun 27, 2022 · May 24, 2022 · May 26, 2022 · Jun 16, 2022
diff --git a/annotators/COMeT/comet_commonsense/src/evaluate/sampler.py b/annotators/COMeT/comet_commonsense/src/evaluate/sampler.py
@@ -244,16 +244,16 @@ def generate_sequence(self, batch, model, data_loader, start_idx, end_len):
 
             if paper_results:
                 # Results from paper with slightly buggy beam search
-                current_beam_lls = beam_lls.unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs**2)
+                current_beam_lls = beam_lls.unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs ** 2)
             else:
                 # Current beam search implementation
-                current_beam_lls = beam_losses[-1].unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs**2)
+                current_beam_lls = beam_losses[-1].unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs ** 2)
 
             # Compute losses of hypotheses, masking those that have ended
-            hyp_beam_lls = (hyp_beam_lls.view(self.opt.eval.bs**2) * hypothesis_mask.view(-1)) + current_beam_lls
+            hyp_beam_lls = (hyp_beam_lls.view(self.opt.eval.bs ** 2) * hypothesis_mask.view(-1)) + current_beam_lls
 
             # Get normalizer for sequences
-            temp_counts = counts.unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs**2)
+            temp_counts = counts.unsqueeze(1).repeat(1, self.opt.eval.bs).view(self.opt.eval.bs ** 2)
 
             # Select best beams with lowest aggregate loss
             beam_lls, top_beam_idxs = (hyp_beam_lls / temp_counts).topk(self.opt.eval.bs)
@@ -282,7 +282,7 @@ def generate_sequence(self, batch, model, data_loader, start_idx, end_len):
                 .repeat(self.opt.eval.bs, 1)
                 .t()
                 .contiguous()
-                .view(self.opt.eval.bs**2, -1)[top_beam_idxs]
+                .view(self.opt.eval.bs ** 2, -1)[top_beam_idxs]
             )
             beam_seqs = torch.cat((beam_seqs, beam_toks.unsqueeze(1)), dim=1)
 
@@ -294,7 +294,7 @@ def generate_sequence(self, batch, model, data_loader, start_idx, end_len):
                 .transpose(2, 1)
                 .transpose(1, 0)
                 .contiguous()
-                .view(self.opt.eval.bs**2, XMB.size(1), XMB.size(2))[top_beam_idxs]
+                .view(self.opt.eval.bs ** 2, XMB.size(1), XMB.size(2))[top_beam_idxs]
             )
 
             XMB, MMB = self.append_batch(XMB, beam_toks, MMB)

diff --git a/annotators/IntentCatcher/src/test.py b/annotators/IntentCatcher/src/test.py
@@ -11,7 +11,7 @@
 def main_test():
     url = "http://0.0.0.0:8014/detect"
     if "RU" in INTENT_DATA_PATH:
-        tests = json.load(open("tests_RU.json"))
+        tests = json.load(open("../../IntentCatcherTransformers/tests_RU.json"))
     else:
         tests = json.load(open("tests.json"))
     for test in tests:

diff --git a/annotators/IntentCatcherTransformers/Dockerfile b/annotators/IntentCatcherTransformers/Dockerfile
@@ -13,6 +13,8 @@ ARG CONFIG_NAME
 ENV CONFIG_NAME ${CONFIG_NAME}
 ARG SERVICE_PORT
 ENV SERVICE_PORT ${SERVICE_PORT}
+ARG INTENT_PHRASES_PATH
+ENV INTENT_PHRASES_PATH ${INTENT_PHRASES_PATH}
 
 COPY annotators/IntentCatcherTransformers/requirements.txt /src/requirements.txt
 RUN pip install -r /src/requirements.txt

diff --git a/annotators/IntentCatcherTransformers/README.md b/annotators/IntentCatcherTransformers/README.md
@@ -11,3 +11,9 @@ It consumes 3.5Gb GPU RAM during fine-tuning. Classification results after 5 epo
 {"train": {"eval_examples_count": 209297, "metrics": {"accuracy": 0.9997, "f1_weighted": 1.0, "f1_macro": 0.9999, "roc_auc": 1.0}, "time_spent": "0:03:46"}}
 {"valid": {"eval_examples_count": 52325, "metrics": {"accuracy": 0.9995, "f1_weighted": 0.9999, "f1_macro": 0.9999, "roc_auc": 1.0}, "time_spent": "0:00:57"}}
 ```
+
+A Russian Intent Catcher is also available. The Conversational Russian BERT-base version achieves the following results after 5 epochs:
+```json
+{"train": {"eval_examples_count": 16315, "metrics": {"accuracy": 1.0, "f1_weighted": 1.0, "f1_macro": 1.0, "roc_auc": 1.0}, "time_spent": "0:00:30"}}
+{"valid": {"eval_examples_count": 4079, "metrics": {"accuracy": 0.9998, "f1_weighted": 0.9998, "f1_macro": 0.989, "roc_auc": 1.0}, "time_spent": "0:00:08"}}
+```
diff --git a/annotators/IntentCatcherTransformers/intent_phrases_RU.json b/annotators/IntentCatcherTransformers/intent_phrases_RU.json
diff --git a/annotators/IntentCatcherTransformers/intents_model_dp_config_RU.json b/annotators/IntentCatcherTransformers/intents_model_dp_config_RU.json
@@ -0,0 +1,190 @@
+{
+    "dataset_reader": {
+      "class_name": "intents_dataset_reader:IntentsJsonReader",
+      "data_path": "./",
+      "train": "intent_phrases_RU.json",
+      "generated_data_path": "./generated_data"
+    },
+    "dataset_iterator": {
+        "class_name": "basic_classification_iterator",
+        "seed": 42,
+        "split_seed": 23,
+        "field_to_split": "train",
+        "split_fields": [
+          "train",
+          "valid"
+        ],
+        "split_proportions": [
+          0.8,
+          0.2
+        ]
+    },
+    "chainer": {
+      "in": [
+        "x"
+      ],
+      "in_y": [
+        "y"
+      ],
+      "pipe": [
+        {
+          "class_name": "torch_transformers_preprocessor",
+          "vocab_file": "{TRANSFORMER}",
+          "do_lower_case": true,
+          "max_seq_length": 64,
+          "in": [
+            "x"
+          ],
+          "out": [
+            "bert_features"
+          ]
+        },
+        {
+          "id": "classes_vocab",
+          "class_name": "simple_vocab",
+          "fit_on": [
+            "y"
+          ],
+          "save_path": "{MODEL_PATH}/classes.dict",
+          "load_path": "{MODEL_PATH}/classes.dict",
+          "in": [
+            "y"
+          ],
+          "out": [
+            "y_ids"
+          ]
+        },
+        {
+          "id": "my_one_hotter",
+          "in": [
+            "y_ids"
+          ],
+          "out": [
+            "y_onehot"
+          ],
+          "class_name": "one_hotter",
+          "depth": "#classes_vocab.len",
+          "single_vector": true
+        },
+        {
+          "class_name": "torch_transformers_classifier",
+          "n_classes": "#classes_vocab.len",
+          "return_probas": true,
+          "one_hot_labels": true,
+          "multilabel": true,
+          "pretrained_bert": "{TRANSFORMER}",
+          "save_path": "{MODEL_PATH}/model",
+          "load_path": "{MODEL_PATH}/model",
+          "optimizer": "AdamW",
+          "optimizer_parameters": {
+            "lr": 1e-05
+          },
+          "learning_rate_drop_patience": 5,
+          "learning_rate_drop_div": 2.0,
+          "in": [
+            "bert_features"
+          ],
+          "in_y": [
+            "y_onehot"
+          ],
+          "out": [
+            "y_pred_probas"
+          ]
+        },
+        {
+          "in": [
+            "y_pred_probas"
+          ],
+          "out": [
+            "y_pred_ids"
+          ],
+          "class_name": "proba2labels",
+          "max_proba": false,
+          "confidence_threshold": 0.5
+        },        
+        {
+          "ref": "my_one_hotter",
+          "in": [
+            "y_pred_ids"
+          ],
+          "out": [
+            "y_pred_onehot"
+          ]
+        },
+        {
+          "in": [
+            "y_pred_ids"
+          ],
+          "out": [
+            "y_pred_labels"
+          ],
+          "ref": "classes_vocab"
+        }
+      ],
+      "out": [
+        "y_pred_labels",
+        "y_pred_probas"
+      ]
+    },
+    "train": {
+      "epochs": 5,
+      "batch_size": 64,
+      "metrics": [
+        {
+          "name": "accuracy",
+          "inputs": [
+            "y",
+            "y_pred_labels"
+          ]
+        },
+        {
+          "name": "f1_weighted",
+          "inputs": [
+            "y_onehot",
+            "y_pred_onehot"
+          ]
+        },
+        {
+            "name": "f1_macro",
+            "inputs": [
+              "y_onehot",
+              "y_pred_onehot"
+            ]
+        },
+        {
+          "name": "roc_auc",
+          "inputs": [
+            "y_onehot",
+            "y_pred_probas"
+          ]
+        }
+      ],
+      "validation_patience": 5,
+      "val_every_n_epochs": 1,
+      "log_every_n_epochs": 1,
+      "show_examples": false,
+      "evaluation_targets": [
+        "train",
+        "valid"
+      ],
+      "class_name": "torch_trainer"
+    },
+    "metadata": {
+      "imports": [
+        "intents_dataset_reader"
+      ],
+      "variables": {
+        "TRANSFORMER": "DeepPavlov/rubert-base-cased-conversational",
+        "ROOT_PATH": "~/.deeppavlov",
+        "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+        "MODELS_PATH": "{ROOT_PATH}/models",
+        "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_model_RU_v0"
+      },
+      "download": [
+        {
+          "url": "http://files.deeppavlov.ai/deeppavlov_data/intents_model_RU_v0.tar.gz",
+          "subdir": "{MODELS_PATH}/classifiers"
+        }
+      ]
+    }
+  }
diff --git a/annotators/IntentCatcherTransformers/test.py b/annotators/IntentCatcherTransformers/test.py
@@ -2,11 +2,17 @@
 
 import requests
 import json
+from os import getenv
+
+INTENT_PHRASES_PATH = getenv("INTENT_PHRASES_PATH")
 
 
 def main_test():
     url = "http://0.0.0.0:8014/detect"
-    tests = json.load(open("tests.json"))
+    if "RU" in INTENT_PHRASES_PATH:
+        tests = json.load(open("tests_RU.json"))
+    else:
+        tests = json.load(open("tests.json"))
     for test in tests:
         r = requests.post(url=url, json={"sentences": [[test["sentence"]]]})
         assert r.ok
@@ -15,7 +21,7 @@ def main_test():
             assert (
                 data.get(test["intent"], {"detected": 0}).get("detected", 0) == 1
                 and sum([v.get("detected", 0) for v in data.values()]) == 1
-            ), print(f"TEST FAILED!\nTest: {test}\nResult:{data}")
+            ), print(f"TEST FAILED!\nTest: {test}\nResult:{json.dumps(data, indent=2)}")
         else:
             assert all([intent["detected"] == 0 for intent in data.values()]), f"test: {test}\nprediction: {data}"
     print("Success")

diff --git a/annotators/IntentCatcher/src/tests_RU.json → ...s/IntentCatcherTransformers/tests_RU.json b/annotators/IntentCatcher/src/tests_RU.json → ...s/IntentCatcherTransformers/tests_RU.json
@@ -28,7 +28,7 @@
         "intent": "what_is_your_name"
     },
     {
-        "sentence": "расскажи откуда ты родом.",
+        "sentence": "откуда ты родом",
         "intent": "where_are_you_from"
     },
     {

diff --git a/annotators/NER_ru/Dockerfile b/annotators/NER_ru/Dockerfile
@@ -1,5 +1,11 @@
 FROM tensorflow/tensorflow:1.15.2-gpu
 
+RUN apt-key del 7fa2af80  && \
+    rm -f /etc/apt/sources.list.d/cuda*.list && \
+    curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
+    -o cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb
+
 RUN apt-get -y update && \
     apt-get install -y software-properties-common && \
     apt-get update && apt-get install git -y

diff --git a/annotators/entity_detection_rus/test.sh b/annotators/entity_detection_rus/test.sh
@@ -2,3 +2,4 @@
 
 
 python test_entity_detection.py
+
diff --git a/annotators/entity_linking_rus/Dockerfile b/annotators/entity_linking_rus/Dockerfile
@@ -1,5 +1,11 @@
 FROM python:3.7.6
 
+RUN apt-key del 7fa2af80  && \
+    rm -f /etc/apt/sources.list.d/cuda*.list && \
+    curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
+    -o cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb
+
 RUN apt-get -y update && \
     apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev \
 libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \

diff --git a/annotators/spelling_preprocessing/server.py b/annotators/spelling_preprocessing/server.py
@@ -108,7 +108,7 @@ def ten_power_3n_repl(match_obj):
     result = 0
     for i in range(start, start + n):
         power = (start + n - i - 1) * 3
-        result += 0 if match_obj.group(i) is None else int(match_obj.group(i)) * 10**power
+        result += 0 if match_obj.group(i) is None else int(match_obj.group(i)) * 10 ** power
     return str(result)
 
 

diff --git a/annotators/spelling_preprocessing_ru/Dockerfile b/annotators/spelling_preprocessing_ru/Dockerfile
@@ -1,5 +1,11 @@
 FROM tensorflow/tensorflow:1.15.2-gpu
 
+RUN apt-key del 7fa2af80  && \
+    rm -f /etc/apt/sources.list.d/cuda*.list && \
+    curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \
+    -o cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb
+
 RUN apt-get -y update && \
     apt-get install -y software-properties-common && \
     apt-get update && apt-get install git -y

diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml
@@ -54,6 +54,7 @@ services:
     volumes:
       - "./annotators/IntentCatcherTransformers:/src"
       - "./common:/src/common"
+      - "~/.deeppavlov:/root/.deeppavlov"
     ports:
       - 8014:8014
   badlisted-words:

diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml
@@ -167,7 +167,8 @@ services:
       args:
         SERVICE_PORT: 8014
         CONFIG_NAME: intents_model_dp_config.json
-    command:  python -m flask run -h 0.0.0.0 -p 8014 --without-threads
+        INTENT_PHRASES_PATH: intent_phrases.json
+    command:  python -m flask run -h 0.0.0.0 -p 8014
     environment:
       - FLASK_APP=server
       - CUDA_VISIBLE_DEVICES=0

diff --git a/assistant_dists/dream_mini/docker-compose.override.yml b/assistant_dists/dream_mini/docker-compose.override.yml
@@ -106,7 +106,7 @@ services:
     env_file: [.env]
     build:
       context: ./annotators/IntentCatcher/
-    command:  python -m flask run -h 0.0.0.0 -p 8014 --without-threads
+    command:  python -m flask run -h 0.0.0.0 -p 8014
     environment:
       - FLASK_APP=server
     deploy:

diff --git a/assistant_dists/dream_russian/cpu.yml b/assistant_dists/dream_russian/cpu.yml
@@ -16,3 +16,6 @@ services:
   toxic-classification:
     environment:
       CUDA_VISIBLE_DEVICES: ""
+  intent-catcher:
+    environment:
+      CUDA_VISIBLE_DEVICES: ""
diff --git a/assistant_dists/dream_russian/dev.yml b/assistant_dists/dream_russian/dev.yml
@@ -31,8 +31,9 @@ services:
       - 8011:8011
   intent-catcher:
     volumes:
-      - "./annotators/IntentCatcher/src:/src"
+      - "./annotators/IntentCatcherTransformers:/src"
       - "./common:/src/common"
+      - "~/.deeppavlov:/root/.deeppavlov"
     ports:
       - 8014:8014
   badlisted-words: