From 4a933d2f8f45b3b5dff1b5821cb2beadfbffc58e Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Fri, 5 Aug 2022 17:29:10 +0300 Subject: [PATCH 01/10] feat: upd dp-ner with extended version --- ...stic_multilingual_distilbert_extended.json | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 annotators/NER_deeppavlov/ner_case_agnostic_multilingual_distilbert_extended.json diff --git a/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_distilbert_extended.json b/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_distilbert_extended.json new file mode 100644 index 0000000000..c1b40ea159 --- /dev/null +++ b/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_distilbert_extended.json @@ -0,0 +1,156 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/conll2003/", + "dataset_name": "conll2003", + "provide_pos": false + }, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": false, + "max_seq_length": 512, + "max_subword_length": 15, + "token_masking_prob": 0.0, + "in": [ + "x" + ], + "out": [ + "x_tokens", + "x_subword_tokens", + "x_subword_tok_ids", + "startofword_markers", + "attention_mask" + ] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": [ + "O" + ], + "pad_with_zeros": true, + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": [ + "y" + ], + "in": [ + "y" + ], + "out": [ + "y_ind" + ] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{TRANSFORMER}", + "attention_probs_keep_prob": 0.5, + "return_probas": false, + "use_crf": true, + "encoder_layer_ids": [ + -1 + ], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 
1e-06, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-06 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-07, + "learning_rate_drop_patience": 20, + "learning_rate_drop_div": 1.5, + "load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": [ + "x_subword_tok_ids", + "attention_mask", + "startofword_markers" + ], + "in_y": [ + "y_ind" + ], + "out": [ + "y_pred_ind", + "probas" + ] + }, + { + "ref": "tag_vocab", + "in": [ + "y_pred_ind" + ], + "out": [ + "y_pred" + ] + } + ], + "out": [ + "x_tokens", + "y_pred" + ] + }, + "train": { + "epochs": 50, + "batch_size": 150, + "metrics": [ + { + "name": "ner_f1", + "inputs": [ + "y", + "y_pred" + ] + }, + { + "name": "ner_token_f1", + "inputs": [ + "y", + "y_pred" + ] + } + ], + "validation_patience": 100, + "val_every_n_batches": 50, + "log_every_n_batches": 50, + "show_examples": false, + "pytest_max_batches": 2, + "pytest_batch_size": 8, + "evaluation_targets": [ + "test" + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "~/.deeppavlov/downloads", + "MODELS_PATH": "~/.deeppavlov/models", + "TRANSFORMER": "distilbert-base-multilingual-cased", + "MODEL_PATH": "{MODELS_PATH}/ner/mbert_dream_distil_with_numbers_rus_ext" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/ner/mbert_dream_distil_with_numbers.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} From b4da4e5e8364fc15e18ddb9df401a0f2870fccb5 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Mon, 8 Aug 2022 12:30:24 +0300 Subject: [PATCH 02/10] fix: upd tests --- annotators/NER_deeppavlov/test_server.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/annotators/NER_deeppavlov/test_server.py b/annotators/NER_deeppavlov/test_server.py index 3f2f2ed369..c19b9bc83a 100644 --- a/annotators/NER_deeppavlov/test_server.py +++ b/annotators/NER_deeppavlov/test_server.py @@ -16,29 +16,30 @@ def 
main(): gold_results = [ [ [ - {"confidence": 1, "end_pos": 14, "start_pos": 9, "text": "ивана", "type": "PER"}, - {"confidence": 1, "end_pos": 23, "start_pos": 17, "text": "москве", "type": "LOC"}, + {'confidence': 1, 'end_pos': 14, 'start_pos': 9, 'text': 'ивана', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 23, 'start_pos': 17, 'text': 'москве', 'type': 'LOC'} ] ], [ [ - {"confidence": 1, "end_pos": 14, "start_pos": 9, "text": "Ивана", "type": "PER"}, - {"confidence": 1, "end_pos": 23, "start_pos": 17, "text": "Москве", "type": "LOC"}, + {'confidence': 1, 'end_pos': 14, 'start_pos': 9, 'text': 'Ивана', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 23, 'start_pos': 17, 'text': 'Москве', 'type': 'LOC'} ] ], [ [ - {"confidence": 1, "end_pos": 25, "start_pos": 19, "text": "justin", "type": "ORG"}, - {"confidence": 1, "end_pos": 42, "start_pos": 36, "text": "sahara", "type": "LOC"}, + {'confidence': 1, 'end_pos': 25, 'start_pos': 19, 'text': 'justin', 'type': 'ORG'}, + {'confidence': 1, 'end_pos': 42, 'start_pos': 36, 'text': 'sahara', 'type': 'LOC'}, + {'confidence': 1, 'end_pos': 49, 'start_pos': 43, 'text': 'desert', 'type': 'LOC'} ] ], [ [ - {"confidence": 1, "end_pos": 25, "start_pos": 19, "text": "Justin", "type": "PER"}, - {"confidence": 1, "end_pos": 42, "start_pos": 36, "text": "Sahara", "type": "ORG"}, - {"confidence": 1, "end_pos": 50, "start_pos": 43, "text": "Desert", "type": "ORG"}, + {'confidence': 1, 'end_pos': 25, 'start_pos': 19, 'text': 'Justin', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 42, 'start_pos': 36, 'text': 'Sahara', 'type': 'LOC'}, + {'confidence': 1, 'end_pos': 49, 'start_pos': 43, 'text': 'Desert', 'type': 'LOC'} ] - ], + ] ] result = requests.post(url, json=request_data).json() From ceb744faf733698fa5b259961044b42f7de4a128 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Mon, 8 Aug 2022 15:37:40 +0300 Subject: [PATCH 03/10] fix: working for tags --- annotators/NER_deeppavlov/server.py | 65 ++++++++++++++++-------- 
annotators/NER_deeppavlov/test_server.py | 30 +++++------ 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/annotators/NER_deeppavlov/server.py b/annotators/NER_deeppavlov/server.py index d218b87004..37e52337f8 100644 --- a/annotators/NER_deeppavlov/server.py +++ b/annotators/NER_deeppavlov/server.py @@ -27,15 +27,50 @@ raise e -def convert_prediction(s, token, tag): - start_pos = s.find(token) - return { - "confidence": 1, - "text": token, - "type": tag.replace("B-", "").replace("I-", ""), - "start_pos": start_pos, - "end_pos": start_pos + len(token), - } +def convert_prediction(sents, pred_labels): + entities = [] + for sent, tags in zip(sents, pred_labels): + entities.append([]) + start = end = -1 + for i, (word, tag) in enumerate(zip(sent, tags)): + if tag[0] == "B": + if start != -1: + entities[-1].append( + { + "start_pos": start, + "end_pos": end, + "type": tags[start].split("-")[1], + "text": " ".join(sent[start:end]), + "confidence": 1, + } + ) + start = i + end = i + 1 + elif tag[0] == "I": + end = i + 1 + else: + if start != -1: + entities[-1].append( + { + "start_pos": start, + "end_pos": end, + "type": tags[start].split("-")[1], + "text": " ".join(sent[start:end]), + "confidence": 1, + } + ) + start = -1 + if start != -1: + entities[-1].append( + { + "start_pos": start, + "end_pos": end, + "type": tags[start].split("-")[1], + "text": " ".join(sent[start:end]), + "confidence": 1, + } + ) + return entities def get_result(request): @@ -51,17 +86,7 @@ def get_result(request): dialog_ids.append(i) tokens_batch, tags_batch = ner_model(samples) - good_preds = [ - [convert_prediction(s, token, tag) for token, tag in zip(tokens, tags) if tag != "O"] - for s, tokens, tags in zip(samples, tokens_batch, tags_batch) - ] - dialog_ids = np.array(dialog_ids) - - ret = [] - for i, utterance_sents in enumerate(last_utterances): - curr_ids = np.where(dialog_ids == i)[0] - curr_preds = [good_preds[curr_id] for curr_id in curr_ids] - ret.append(curr_preds) + 
ret = convert_prediction(tokens_batch, tags_batch) logger.info(f"NER output: {ret}") total_time = time.time() - st_time diff --git a/annotators/NER_deeppavlov/test_server.py b/annotators/NER_deeppavlov/test_server.py index c19b9bc83a..eeba3bd124 100644 --- a/annotators/NER_deeppavlov/test_server.py +++ b/annotators/NER_deeppavlov/test_server.py @@ -15,30 +15,24 @@ def main(): gold_results = [ [ - [ - {'confidence': 1, 'end_pos': 14, 'start_pos': 9, 'text': 'ивана', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 23, 'start_pos': 17, 'text': 'москве', 'type': 'LOC'} - ] + {'confidence': 1, 'end_pos': 3, 'start_pos': 2, 'text': 'ивана', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'москве', 'type': 'LOC'} ], [ - [ - {'confidence': 1, 'end_pos': 14, 'start_pos': 9, 'text': 'Ивана', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 23, 'start_pos': 17, 'text': 'Москве', 'type': 'LOC'} - ] + {'confidence': 1, 'end_pos': 3, 'start_pos': 2, 'text': 'Ивана', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'Москве', 'type': 'LOC'} ], [ - [ - {'confidence': 1, 'end_pos': 25, 'start_pos': 19, 'text': 'justin', 'type': 'ORG'}, - {'confidence': 1, 'end_pos': 42, 'start_pos': 36, 'text': 'sahara', 'type': 'LOC'}, - {'confidence': 1, 'end_pos': 49, 'start_pos': 43, 'text': 'desert', 'type': 'LOC'} - ] + {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'justin', 'type': 'ORG'}, + {'confidence': 1, 'end_pos': 11, 'start_pos': 9, 'text': 'sahara desert', 'type': 'LOC'} ], [ - [ - {'confidence': 1, 'end_pos': 25, 'start_pos': 19, 'text': 'Justin', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 42, 'start_pos': 36, 'text': 'Sahara', 'type': 'LOC'}, - {'confidence': 1, 'end_pos': 49, 'start_pos': 43, 'text': 'Desert', 'type': 'LOC'} - ] + {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'Justin', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 11, 'start_pos': 9, 'text': 'Sahara Desert', 'type': 'LOC'} + ], + [ + 
{'confidence': 1, 'end_pos': 5, 'start_pos': 3, 'text': 'Bob Smith', 'type': 'PER'}, + {'confidence': 1, 'end_pos': 8, 'start_pos': 6, 'text': 'Las Vegas', 'type': 'LOC'} ] ] From 2580748b6c28d376c301c159d11a608c669d75aa Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Mon, 8 Aug 2022 15:40:59 +0300 Subject: [PATCH 04/10] fix: codestyle --- annotators/NER_deeppavlov/server.py | 1 - annotators/NER_deeppavlov/test_server.py | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/annotators/NER_deeppavlov/server.py b/annotators/NER_deeppavlov/server.py index 37e52337f8..ef37a42a48 100644 --- a/annotators/NER_deeppavlov/server.py +++ b/annotators/NER_deeppavlov/server.py @@ -2,7 +2,6 @@ import os import time -import numpy as np import sentry_sdk from flask import Flask, jsonify, request diff --git a/annotators/NER_deeppavlov/test_server.py b/annotators/NER_deeppavlov/test_server.py index eeba3bd124..b25940d4cd 100644 --- a/annotators/NER_deeppavlov/test_server.py +++ b/annotators/NER_deeppavlov/test_server.py @@ -15,25 +15,25 @@ def main(): gold_results = [ [ - {'confidence': 1, 'end_pos': 3, 'start_pos': 2, 'text': 'ивана', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'москве', 'type': 'LOC'} + {"confidence": 1, "end_pos": 3, "start_pos": 2, "text": "ивана", "type": "PER"}, + {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "москве", "type": "LOC"}, ], [ - {'confidence': 1, 'end_pos': 3, 'start_pos': 2, 'text': 'Ивана', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'Москве', 'type': 'LOC'} + {"confidence": 1, "end_pos": 3, "start_pos": 2, "text": "Ивана", "type": "PER"}, + {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "Москве", "type": "LOC"}, ], [ - {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'justin', 'type': 'ORG'}, - {'confidence': 1, 'end_pos': 11, 'start_pos': 9, 'text': 'sahara desert', 'type': 'LOC'} + {"confidence": 1, "end_pos": 5, 
"start_pos": 4, "text": "justin", "type": "ORG"}, + {"confidence": 1, "end_pos": 11, "start_pos": 9, "text": "sahara desert", "type": "LOC"}, ], [ - {'confidence': 1, 'end_pos': 5, 'start_pos': 4, 'text': 'Justin', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 11, 'start_pos': 9, 'text': 'Sahara Desert', 'type': 'LOC'} + {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "Justin", "type": "PER"}, + {"confidence": 1, "end_pos": 11, "start_pos": 9, "text": "Sahara Desert", "type": "LOC"}, ], [ - {'confidence': 1, 'end_pos': 5, 'start_pos': 3, 'text': 'Bob Smith', 'type': 'PER'}, - {'confidence': 1, 'end_pos': 8, 'start_pos': 6, 'text': 'Las Vegas', 'type': 'LOC'} - ] + {"confidence": 1, "end_pos": 5, "start_pos": 3, "text": "Bob Smith", "type": "PER"}, + {"confidence": 1, "end_pos": 8, "start_pos": 6, "text": "Las Vegas", "type": "LOC"}, + ], ] result = requests.post(url, json=request_data).json() From dc4deeab19a4af3809b72b12a5b88a6cae8363f9 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Mon, 8 Aug 2022 16:08:41 +0300 Subject: [PATCH 05/10] fix: user new model --- annotators/NER_deeppavlov/server.py | 1 + annotators/NER_deeppavlov/test_server.py | 9 ++++++--- assistant_dists/dream/docker-compose.override.yml | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/annotators/NER_deeppavlov/server.py b/annotators/NER_deeppavlov/server.py index ef37a42a48..e6024fe703 100644 --- a/annotators/NER_deeppavlov/server.py +++ b/annotators/NER_deeppavlov/server.py @@ -85,6 +85,7 @@ def get_result(request): dialog_ids.append(i) tokens_batch, tags_batch = ner_model(samples) + logger.info(f"NER model predictions: tokens: {tokens_batch}, tags: {tags_batch}") ret = convert_prediction(tokens_batch, tags_batch) logger.info(f"NER output: {ret}") diff --git a/annotators/NER_deeppavlov/test_server.py b/annotators/NER_deeppavlov/test_server.py index b25940d4cd..7a1d79a8e7 100644 --- a/annotators/NER_deeppavlov/test_server.py +++ 
b/annotators/NER_deeppavlov/test_server.py @@ -10,6 +10,7 @@ def main(): ["Я видела Ивана в Москве"], ["i have heard about justin. he is in sahara desert"], ["I have heard about Justin. He is in Sahara Desert"], + ["can john smith move forward for 15 meters, then for fifteen meters, and get back to las vegas then"], ] } @@ -23,7 +24,7 @@ def main(): {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "Москве", "type": "LOC"}, ], [ - {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "justin", "type": "ORG"}, + {"confidence": 1, "end_pos": 5, "start_pos": 4, "text": "justin", "type": "PER"}, {"confidence": 1, "end_pos": 11, "start_pos": 9, "text": "sahara desert", "type": "LOC"}, ], [ @@ -31,8 +32,10 @@ def main(): {"confidence": 1, "end_pos": 11, "start_pos": 9, "text": "Sahara Desert", "type": "LOC"}, ], [ - {"confidence": 1, "end_pos": 5, "start_pos": 3, "text": "Bob Smith", "type": "PER"}, - {"confidence": 1, "end_pos": 8, "start_pos": 6, "text": "Las Vegas", "type": "LOC"}, + {"confidence": 1, "end_pos": 3, "start_pos": 1, "text": "john smith", "type": "PER"}, + {"confidence": 1, "end_pos": 8, "start_pos": 6, "text": "15 meters", "type": "QUANTITY"}, + {"confidence": 1, "end_pos": 13, "start_pos": 11, "text": "fifteen meters", "type": "QUANTITY"}, + {"confidence": 1, "end_pos": 20, "start_pos": 18, "text": "las vegas", "type": "LOC"}, ], ] diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 9198020619..f1b68e50f8 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -234,7 +234,7 @@ services: env_file: [ .env ] build: args: - CONFIG: ner_case_agnostic_multilingual_bert_base.json + CONFIG: ner_case_agnostic_multilingual_distilbert_extended.json PORT: 8021 SRC_DIR: annotators/NER_deeppavlov COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 From b323d6b440b4d6d2c36c7d3d0cd680562dd3748a Mon Sep 17 00:00:00 2001 From: 
dilyararimovna Date: Mon, 8 Aug 2022 16:46:01 +0300 Subject: [PATCH 06/10] feat: working --- annotators/NER_deeppavlov/test_server.py | 5 +++++ assistant_dists/dream/docker-compose.override.yml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/annotators/NER_deeppavlov/test_server.py b/annotators/NER_deeppavlov/test_server.py index 7a1d79a8e7..d2a61c5a1d 100644 --- a/annotators/NER_deeppavlov/test_server.py +++ b/annotators/NER_deeppavlov/test_server.py @@ -11,6 +11,7 @@ def main(): ["i have heard about justin. he is in sahara desert"], ["I have heard about Justin. He is in Sahara Desert"], ["can john smith move forward for 15 meters, then for fifteen meters, and get back to las vegas then"], + ["я бы проехала на 30 метров вперед, а потом повернула на сорок пять градусов по часовой стрелке"], ] } @@ -37,6 +38,10 @@ def main(): {"confidence": 1, "end_pos": 13, "start_pos": 11, "text": "fifteen meters", "type": "QUANTITY"}, {"confidence": 1, "end_pos": 20, "start_pos": 18, "text": "las vegas", "type": "LOC"}, ], + [ + {"confidence": 1, "end_pos": 6, "start_pos": 3, "text": "на 30 метров", "type": "QUANTITY"}, + {"confidence": 1, "end_pos": 14, "start_pos": 13, "text": "пять", "type": "QUANTITY"}, + ], ] result = requests.post(url, json=request_data).json() diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index f1b68e50f8..02de388293 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -234,7 +234,7 @@ services: env_file: [ .env ] build: args: - CONFIG: ner_case_agnostic_multilingual_distilbert_extended.json + CONFIG: ner_case_agnostic_multilingual_bert_base_extended.json PORT: 8021 SRC_DIR: annotators/NER_deeppavlov COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 From 7c1c9b5e0d8c377cdab2f781f478dd773716e287 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Tue, 9 Aug 2022 12:30:06 +0300 Subject: [PATCH 07/10] fix:
config --- ...ostic_multilingual_bert_base_extended.json | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 annotators/NER_deeppavlov/ner_case_agnostic_multilingual_bert_base_extended.json diff --git a/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_bert_base_extended.json b/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_bert_base_extended.json new file mode 100644 index 0000000000..bf7aace0f4 --- /dev/null +++ b/annotators/NER_deeppavlov/ner_case_agnostic_multilingual_bert_base_extended.json @@ -0,0 +1,157 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/conll2003/", + "dataset_name": "conll2003", + "provide_pos": false + }, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": false, + "max_seq_length": 512, + "max_subword_length": 15, + "token_masking_prob": 0.0, + "in": [ + "x" + ], + "out": [ + "x_tokens", + "x_subword_tokens", + "x_subword_tok_ids", + "startofword_markers", + "attention_mask" + ] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": [ + "O" + ], + "pad_with_zeros": true, + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": [ + "y" + ], + "in": [ + "y" + ], + "out": [ + "y_ind" + ] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{TRANSFORMER}", + "attention_probs_keep_prob": 0.5, + "return_probas": false, + "use_crf": true, + "encoder_layer_ids": [ + -1 + ], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 1e-06, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-06 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-07, + "learning_rate_drop_patience": 20, + "learning_rate_drop_div": 1.5, + 
"load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": [ + "x_subword_tok_ids", + "attention_mask", + "startofword_markers" + ], + "in_y": [ + "y_ind" + ], + "out": [ + "y_pred_ind", + "probas" + ] + }, + { + "ref": "tag_vocab", + "in": [ + "y_pred_ind" + ], + "out": [ + "y_pred" + ] + } + ], + "out": [ + "x_tokens", + "y_pred" + ] + }, + "train": { + "epochs": 50, + "batch_size": 100, + "metrics": [ + { + "name": "ner_f1", + "inputs": [ + "y", + "y_pred" + ] + }, + { + "name": "ner_token_f1", + "inputs": [ + "y", + "y_pred" + ], + "print_results": true + } + ], + "validation_patience": 100, + "val_every_n_batches": 50, + "log_every_n_batches": 50, + "show_examples": false, + "pytest_max_batches": 2, + "pytest_batch_size": 8, + "evaluation_targets": [ + "test" + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "~/.deeppavlov/downloads", + "MODELS_PATH": "~/.deeppavlov/models", + "TRANSFORMER": "bert-base-multilingual-cased", + "MODEL_PATH": "{MODELS_PATH}/ner/mbert_dream_with_numbers_rus_ext" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/ner/mbert_dream_with_numbers.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} From a0f6bec7bcaddf6e996b6b1077f403e1ea1a6300 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Tue, 9 Aug 2022 12:30:39 +0300 Subject: [PATCH 08/10] fix: upd ner dockerfile --- annotators/NER_deeppavlov/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/annotators/NER_deeppavlov/Dockerfile b/annotators/NER_deeppavlov/Dockerfile index 89a8b58b2e..19574f897d 100644 --- a/annotators/NER_deeppavlov/Dockerfile +++ b/annotators/NER_deeppavlov/Dockerfile @@ -16,5 +16,6 @@ COPY $SRC_DIR /src WORKDIR /src RUN python -m deeppavlov install $CONFIG +RUN python -m deeppavlov download $CONFIG CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8021 From 9b6e4d0aaf5764d254bc4990af8e86f0a30a8eed Mon 
Sep 17 00:00:00 2001 From: dilyararimovna Date: Thu, 11 Aug 2022 12:11:25 +0300 Subject: [PATCH 09/10] fix: revert format list --- annotators/NER_deeppavlov/server.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/annotators/NER_deeppavlov/server.py b/annotators/NER_deeppavlov/server.py index e6024fe703..5dfbe4e358 100644 --- a/annotators/NER_deeppavlov/server.py +++ b/annotators/NER_deeppavlov/server.py @@ -2,6 +2,7 @@ import os import time +import numpy as np import sentry_sdk from flask import Flask, jsonify, request @@ -86,7 +87,14 @@ def get_result(request): tokens_batch, tags_batch = ner_model(samples) logger.info(f"NER model predictions: tokens: {tokens_batch}, tags: {tags_batch}") - ret = convert_prediction(tokens_batch, tags_batch) + good_preds = convert_prediction(tokens_batch, tags_batch) + dialog_ids = np.array(dialog_ids) + + ret = [] + for i, utterance_sents in enumerate(last_utterances): + curr_ids = np.where(dialog_ids == i)[0] + curr_preds = [good_preds[curr_id] for curr_id in curr_ids] + ret.append(curr_preds) logger.info(f"NER output: {ret}") total_time = time.time() - st_time From b070b4b7baf7f1536ca6daaad7a9b2601285d8f8 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Thu, 11 Aug 2022 13:46:08 +0300 Subject: [PATCH 10/10] fix: change ner for all dists --- assistant_dists/dream_russian/docker-compose.override.yml | 2 +- assistant_dists/dream_sfc/docker-compose.override.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/assistant_dists/dream_russian/docker-compose.override.yml b/assistant_dists/dream_russian/docker-compose.override.yml index e1649bad49..72f3bedbb8 100644 --- a/assistant_dists/dream_russian/docker-compose.override.yml +++ b/assistant_dists/dream_russian/docker-compose.override.yml @@ -149,7 +149,7 @@ services: env_file: [ .env ] build: args: - CONFIG: ner_case_agnostic_multilingual_bert_base.json + CONFIG: ner_case_agnostic_multilingual_bert_base_extended.json PORT: 8021 
SRC_DIR: annotators/NER_deeppavlov COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 diff --git a/assistant_dists/dream_sfc/docker-compose.override.yml b/assistant_dists/dream_sfc/docker-compose.override.yml index edb667cf9c..8f02aa5a55 100644 --- a/assistant_dists/dream_sfc/docker-compose.override.yml +++ b/assistant_dists/dream_sfc/docker-compose.override.yml @@ -210,11 +210,10 @@ services: env_file: [ .env ] build: args: - CONFIG: ner_case_agnostic_multilingual_bert_base.json + CONFIG: ner_case_agnostic_multilingual_bert_base_extended.json PORT: 8021 SRC_DIR: annotators/NER_deeppavlov COMMIT: f5117cd9ad1e64f6c2d970ecaa42fc09ccb23144 - LANGUAGE: EN context: ./ dockerfile: annotators/NER_deeppavlov/Dockerfile command: flask run -h 0.0.0.0 -p 8021