From f38010ecd7162b7475e6287223378b30397726db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Mon, 26 Sep 2022 21:27:32 +0300 Subject: [PATCH 01/40] property extraction --- annotators/property_extraction/Dockerfile | 20 ++ .../property_classification_distilbert.json | 100 ++++++++ annotators/property_extraction/rel_list.txt | 61 +++++ .../property_extraction/requirements.txt | 13 + annotators/property_extraction/server.py | 115 +++++++++ .../src/t5_generative_ie.py | 231 ++++++++++++++++++ .../src/torch_transformers_preprocessor.py | 79 ++++++ .../t5_generative_ie_infer.json | 49 ++++ .../test_property_extraction.py | 16 ++ assistant_dists/dream/dev.yml | 6 + .../dream/docker-compose.override.yml | 22 +- assistant_dists/dream/pipeline_conf.json | 13 + state_formatters/dp_formatters.py | 15 ++ 13 files changed, 739 insertions(+), 1 deletion(-) create mode 100644 annotators/property_extraction/Dockerfile create mode 100644 annotators/property_extraction/property_classification_distilbert.json create mode 100644 annotators/property_extraction/rel_list.txt create mode 100644 annotators/property_extraction/requirements.txt create mode 100644 annotators/property_extraction/server.py create mode 100644 annotators/property_extraction/src/t5_generative_ie.py create mode 100644 annotators/property_extraction/src/torch_transformers_preprocessor.py create mode 100644 annotators/property_extraction/t5_generative_ie_infer.json create mode 100644 annotators/property_extraction/test_property_extraction.py diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile new file mode 100644 index 0000000000..c541035263 --- /dev/null +++ b/annotators/property_extraction/Dockerfile @@ -0,0 +1,20 @@ +FROM deeppavlov/base-gpu + +RUN apt-get update && apt-get install git -y + +ARG CONFIG +ARG PORT +ARG SRC_DIR +ARG SED_ARG=" | " + +ENV CONFIG=$CONFIG +ENV PORT=$PORT + +COPY ./annotators/property_extraction/requirements.txt /src/requirements.txt +RUN pip install -r /src/requirements.txt + +COPY $SRC_DIR /src + +WORKDIR /src + +CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8126 diff --git a/annotators/property_extraction/property_classification_distilbert.json b/annotators/property_extraction/property_classification_distilbert.json new file mode 100644 index 0000000000..a9db83a238 --- /dev/null +++ b/annotators/property_extraction/property_classification_distilbert.json @@ -0,0 +1,100 @@ +{ + "dataset_reader": { + "class_name": "sq_reader", + "data_path": "{DOWNLOADS_PATH}/dialogue_nli/dialogue_nli_cls.json" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": false, + "max_seq_length": 64, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{TRANSFORMER}", + "save_path": "{MODEL_PATH}/model", + 
"load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": {"lr": 1e-05}, + "learning_rate_drop_patience": 5, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + "f1_macro", + "accuracy" + ], + "validation_patience": 10, + "val_every_n_batches": 100, + "log_every_n_batches": 100, + "show_examples": false, + "evaluation_targets": ["valid", "test"], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "TRANSFORMER": "distilbert-base-uncased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/property_classification" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/property_classification.tar.gz", + "subdir": "{MODEL_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/dialogue_nli_cls.tar.gz", + "subdir": "{DOWNLOADS_PATH}/dialogue_nli" + } + ] + } +} diff --git a/annotators/property_extraction/rel_list.txt b/annotators/property_extraction/rel_list.txt new file mode 100644 index 0000000000..890a24ac48 --- /dev/null +++ b/annotators/property_extraction/rel_list.txt @@ -0,0 +1,61 @@ + p +attend_school r +dislike r +employed_by_company r +employed_by_general r +favorite r +favorite_activity r +favorite_animal r +favorite_book r +favorite_color r +favorite_drink r +favorite_food r +favorite_hobby r +favorite_movie r +favorite_music r +favorite_music_artist r +favorite_place r +favorite_season r +favorite_show r +favorite_sport r +gender p +has_ability r +has_age p +has_degree r +has_hobby r +has_profession r +have r +have_chidren r +have_family r +have_pet r +have_sibling r +have_vehicle r +job_status p +like_activity r +like_animal r +like_drink r +like_food r +like_general r +like_goto r +like_movie r +like_music r +like_read r +like_sports r +like_watching r +live_in_citystatecountry r +live_in_general r +marital_status p +member_of r +misc_attribute p +nationality p +not_have r +other p +own r +physical_attribute p +place_origin r +previous_profession r +school_status p +teach r +want r +want_do r +want_job p diff --git a/annotators/property_extraction/requirements.txt b/annotators/property_extraction/requirements.txt new file mode 100644 index 0000000000..710183f05e --- /dev/null +++ b/annotators/property_extraction/requirements.txt @@ -0,0 +1,13 @@ +pyopenssl==22.0.0 +Flask==1.1.1 +itsdangerous==2.0.1 +nltk==3.2.5 +numpy==1.18.0 +gunicorn==19.9.0 +requests==2.27.1 +jinja2<=3.0.3 +Werkzeug<=2.0.3 +sentry-sdk==0.12.3 +spacy==3.2.0 +torch==1.7.1 +transformers==4.10.1 diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py new file mode 100644 index 0000000000..13f69c5734 --- /dev/null +++ b/annotators/property_extraction/server.py @@ -0,0 +1,115 @@ +import logging +import os +import re +import time + +import sentry_sdk +from flask import Flask, jsonify, request + +from deeppavlov import build_model + +sentry_sdk.init(os.getenv("SENTRY_DSN")) + +logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO) +logger = 
logging.getLogger(__name__) +app = Flask(__name__) + +config_name = os.getenv("CONFIG") +rel_cls_flag = int(os.getenv("REL_CLS_FLAG", "0")) + +rel_type_dict = {} +with open("rel_list.txt", "r") as fl: + lines = fl.readlines() + for line in lines: + rel, rel_type = line.strip().split() + if rel_type == "r": + rel_type = "relation" + else: + rel_type = "property" + rel_type_dict[rel.replace("_", " ")] = rel_type + +try: + generative_ie = build_model(config_name, download=True) + logger.info("property extraction model is loaded.") + if rel_cls_flag: + rel_cls = build_model("property_classification_distilbert.json") +except Exception as e: + sentry_sdk.capture_exception(e) + logger.exception(e) + raise e + + +def get_result(request): + st_time = time.time() + uttrs = request.json.get("utterances", []) + entities_with_labels_batch = request.json.get("entities_with_labels", []) + entity_info_batch = request.json.get("entity_info", []) + + triplets_batch = [] + outputs, scores = generative_ie(uttrs) + for output in outputs: + triplet = "" + fnd = re.findall(r" (.*?) (.*?) (.*)", output) + if fnd: + triplet = list(fnd[0]) + if triplet[0] == "i": + triplet[0] = "user" + triplets_batch.append(triplet) + if rel_cls_flag: + rels = rel_cls(uttrs) + logger.info(f"classified relations: {rels}") + filtered_triplets_batch = [] + for triplet, rel in zip(triplets_batch, rels): + rel = rel.replace("_", " ") + if len(triplet) == 3 and triplet[1] == rel: + filtered_triplets_batch.append(triplet) + else: + filtered_triplets_batch.append([]) + triplets_batch = filtered_triplets_batch + + triplets_info_batch = [] + for triplet, uttr, entities_with_labels, entity_info_list in zip( + triplets_batch, uttrs, entities_with_labels_batch, entity_info_batch + ): + uttr = uttr.lower() + entity_substr_dict = {} + formatted_triplet = {} + if len(uttr.split()) > 1: + for entity in entities_with_labels: + if "text" in entity: + entity_substr = entity["text"] + if "offsets" in entity: + start_offset, end_offset = entity["offsets"] + else: + start_offset = uttr.find(entity_substr.lower()) + end_offset = start_offset + len(entity_substr) + offsets = [start_offset, end_offset] + if triplet and entity_substr in [triplet[0], triplet[2]]: + entity_substr_dict[entity_substr] = {"offsets": offsets} + if entity_info_list: + for entity_info in entity_info_list: + if entity_info and "entity_substr" in entity_info and "entity_ids" in entity_info: + entity_substr = entity_info["entity_substr"] + if triplet and entity_substr in [triplet[0], triplet[2]]: + if entity_substr not in entity_substr_dict: + entity_substr_dict[entity_substr] = {} + entity_substr_dict[entity_substr]["entity_ids"] = entity_info["entity_ids"] + entity_substr_dict[entity_substr]["dbpedia_types"] = entity_info.get("dbpedia_types", []) + if triplet: + formatted_triplet = {"subject": triplet[0], rel_type_dict[triplet[1]]: triplet[1], "object": triplet[2]} + triplets_info_batch.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + + total_time = time.time() - st_time + logger.info(f"property extraction exec time: {total_time: .3f}s") + logger.info(f"property extraction, input {uttrs}, output {triplets_info_batch} scores {scores}") + return triplets_info_batch + + +@app.route("/respond", methods=["POST"]) +def respond(): + result = get_result(request) + return jsonify(result) + + +if __name__ == "__main__": + app.run(debug=False, host="0.0.0.0", port=8103) diff --git a/annotators/property_extraction/src/t5_generative_ie.py 
b/annotators/property_extraction/src/t5_generative_ie.py new file mode 100644 index 0000000000..66649555f7 --- /dev/null +++ b/annotators/property_extraction/src/t5_generative_ie.py @@ -0,0 +1,231 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from logging import getLogger +from pathlib import Path +from typing import List, Optional, Dict + +import torch +from overrides import overrides +from transformers import AutoConfig, AutoTokenizer +from transformers import T5ForConditionalGeneration + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.torch_model import TorchModel + +logger = getLogger(__name__) + + +def softmax_mask(val, mask): + inf = 1e30 + return -inf * (1 - mask.to(torch.float32)) + val + + +@register("t5_generative_ie") +class T5GenerativeIE(TorchModel): + def __init__( + self, + pretrained_transformer: str, + attention_probs_keep_prob: Optional[float] = None, + add_special_tokens: List[str] = None, + hidden_keep_prob: Optional[float] = None, + optimizer: str = "AdamW", + optimizer_parameters: Optional[dict] = None, + bert_config_file: Optional[str] = None, + learning_rate_drop_patience: int = 20, + learning_rate_drop_div: float = 2.0, + load_before_drop: bool = True, + clip_norm: Optional[float] = None, + min_learning_rate: float = 1e-06, + generate_max_length: int = 50, + top_n: int = 1, + batch_decode: bool = False, + scores_thres: float = -0.055, + **kwargs, + ) -> None: + + if not optimizer_parameters: + optimizer_parameters = {"lr": 0.01, "weight_decay": 0.01, "betas": (0.9, 0.999), "eps": 1e-6} + self.generate_max_length = generate_max_length + + self.attention_probs_keep_prob = attention_probs_keep_prob + self.hidden_keep_prob = hidden_keep_prob + self.clip_norm = clip_norm + + self.pretrained_transformer = pretrained_transformer + self.bert_config_file = bert_config_file + self.tokenizer = AutoTokenizer.from_pretrained(pretrained_transformer, do_lower_case=False) + special_tokens_dict = {"additional_special_tokens": add_special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + self.replace_tokens = [("", ""), ("", ""), ("", "")] + self.top_n = top_n + self.batch_decode = batch_decode + self.scores_thres = scores_thres + + super().__init__( + optimizer=optimizer, + optimizer_parameters=optimizer_parameters, + learning_rate_drop_patience=learning_rate_drop_patience, + learning_rate_drop_div=learning_rate_drop_div, + load_before_drop=load_before_drop, + min_learning_rate=min_learning_rate, + **kwargs, + ) + + def train_on_batch(self, input_ids_batch, attention_mask_batch, target_ids_batch) -> Dict: + input_ids_batch = torch.LongTensor(input_ids_batch).to(self.device) + attention_mask_batch = torch.LongTensor(attention_mask_batch).to(self.device) + target_ids_batch = torch.LongTensor(target_ids_batch).to(self.device) + input_ = 
{"input_ids": input_ids_batch, "attention_mask": attention_mask_batch, "labels": target_ids_batch} + + self.optimizer.zero_grad() + loss = self.model(**input_)[0] + if self.is_data_parallel: + loss = loss.mean() + loss.backward() + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + if self.clip_norm: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm) + + self.optimizer.step() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + return {"loss": loss.item()} + + @property + def is_data_parallel(self) -> bool: + return isinstance(self.model, torch.nn.DataParallel) + + def __call__(self, input_ids_batch, attention_mask_batch): + model = self.model.module if hasattr(self.model, "module") else self.model + if self.batch_decode: + input_ids_batch = torch.LongTensor(input_ids_batch).to(self.device) + attention_mask_batch = torch.LongTensor(attention_mask_batch).to(self.device) + input_ = { + "input_ids": input_ids_batch, + "attention_mask": attention_mask_batch, + } + with torch.no_grad(): + answer_ids_batch = model.generate(**input_) + init_answers_batch = self.tokenizer.batch_decode(answer_ids_batch, skip_special_tokens=False) + answers_batch = [] + for answer in init_answers_batch: + for old_tok, new_tok in self.replace_tokens: + answer = answer.replace(old_tok, new_tok) + answers_batch.append(answer) + return answers_batch + else: + answers_batch, scores_batch = [], [] + for input_ids in input_ids_batch: + input_ids = torch.LongTensor([input_ids]).to(self.device) + with torch.no_grad(): + outputs = model.generate( + input_ids, + num_beams=5, + num_return_sequences=self.top_n, + return_dict_in_generate=True, + output_scores=True, + ) + sequences = outputs.sequences + scores = outputs.sequences_scores + scores = scores.cpu().numpy().tolist() + answers = [self.tokenizer.decode(output, skip_special_tokens=False) for output in sequences] + processed_answers, processed_scores = [], [] + for answer, score in zip(answers, scores): + if score > self.scores_thres: + for old_tok, new_tok in self.replace_tokens: + answer = answer.replace(old_tok, new_tok) + processed_answers.append(answer) + processed_scores.append(score) + if self.top_n == 1: + answers_batch.append(processed_answers[0]) + scores_batch.append(processed_scores[0]) + else: + answers_batch.append(processed_answers) + scores_batch.append(processed_scores) + return answers_batch, scores_batch + + @overrides + def load(self, fname=None): + if fname is not None: + self.load_path = fname + + if self.pretrained_transformer: + logger.info(f"From pretrained {self.pretrained_transformer}.") + config = AutoConfig.from_pretrained( + self.pretrained_transformer, output_attentions=False, output_hidden_states=False + ) + + self.model = T5ForConditionalGeneration.from_pretrained(self.pretrained_transformer, config=config) + + elif self.bert_config_file and Path(self.bert_config_file).is_file(): + self.bert_config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file))) + + if self.attention_probs_keep_prob is not None: + self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob + if self.hidden_keep_prob is not None: + self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob + self.model = T5ForConditionalGeneration(config=self.bert_config) + else: + raise ConfigError("No pre-trained BERT model is given.") + + if self.device.type == "cuda" and torch.cuda.device_count() > 1: + self.model = 
torch.nn.DataParallel(self.model) + + self.model.to(self.device) + + self.optimizer = getattr(torch.optim, self.optimizer_name)(self.model.parameters(), **self.optimizer_parameters) + + if self.lr_scheduler_name is not None: + self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)( + self.optimizer, **self.lr_scheduler_parameters + ) + + if self.load_path: + logger.info(f"Load path {self.load_path} is given.") + if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir(): + raise ConfigError("Provided load path is incorrect!") + + weights_path = Path(self.load_path.resolve()) + weights_path = weights_path.with_suffix(".pth.tar") + if weights_path.exists(): + logger.info(f"Load path {weights_path} exists.") + logger.info(f"Initializing `{self.__class__.__name__}` from saved.") + + # now load the weights, optimizer from saved + logger.info(f"Loading weights from {weights_path}.") + checkpoint = torch.load(weights_path, map_location=self.device) + model_state = checkpoint["model_state_dict"] + optimizer_state = checkpoint["optimizer_state_dict"] + + # load a multi-gpu model on a single device + if not self.is_data_parallel and "module." in list(model_state.keys())[0]: + tmp_model_state = {} + for key, value in model_state.items(): + tmp_model_state[re.sub("module.", "", key)] = value + model_state = tmp_model_state + + strict_load_flag = bool( + [key for key in checkpoint["model_state_dict"].keys() if key.endswith("embeddings.position_ids")] + ) + self.model.load_state_dict(model_state, strict=strict_load_flag) + self.optimizer.load_state_dict(optimizer_state) + self.epochs_done = checkpoint.get("epochs_done", 0) + else: + logger.info(f"Init from scratch. Load path {weights_path} does not exist.") diff --git a/annotators/property_extraction/src/torch_transformers_preprocessor.py b/annotators/property_extraction/src/torch_transformers_preprocessor.py new file mode 100644 index 0000000000..506b28a20b --- /dev/null +++ b/annotators/property_extraction/src/torch_transformers_preprocessor.py @@ -0,0 +1,79 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
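# Illustrative usage sketch (not part of the patch itself): the module below is the first
# step of the chainer in t5_generative_ie_infer.json -- it turns raw utterances into padded
# input_ids/attention_mask batches consumed by src.t5_generative_ie:T5GenerativeIE. The
# sketch assumes the "t5-base" tokenizer named in that config and uses a placeholder for
# the additional special tokens it lists:
#
#     preproc = T5GenerativeIEPreprocessor(
#         vocab_file="t5-base",
#         max_seq_length=512,
#         add_special_tokens=[...],  # placeholder: the triplet marker tokens from the config
#     )
#     input_ids, attention_mask = preproc(["i live in moscow"])
#     # each sequence is zero-padded to the longest one in the batch, capped at max_seq_length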
+ +from logging import getLogger +from pathlib import Path +from typing import List + +from transformers import AutoTokenizer + +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + +log = getLogger(__name__) + + +@register("t5_generative_ie_preprocessor") +class T5GenerativeIEPreprocessor(Component): + def __init__( + self, + vocab_file: str, + do_lower_case: bool = True, + max_seq_length: int = 512, + return_tokens: bool = False, + add_special_tokens: List[str] = None, + **kwargs, + ) -> None: + self.max_seq_length = max_seq_length + self.return_tokens = return_tokens + if Path(vocab_file).is_file(): + vocab_file = str(expand_path(vocab_file)) + self.tokenizer = AutoTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) + else: + self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case) + special_tokens_dict = {"additional_special_tokens": add_special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + def __call__(self, uttr_batch: List[str], targets_batch: List[str] = None): + input_ids_batch, attention_mask_batch, lengths = [], [], [] + for uttr in uttr_batch: + encoding = self.tokenizer.encode_plus(text=uttr, return_attention_mask=True, truncation=True) + input_ids = encoding["input_ids"] + attention_mask = encoding["attention_mask"] + input_ids_batch.append(input_ids) + attention_mask_batch.append(attention_mask) + lengths.append(len(input_ids)) + max_length = min(max(lengths), self.max_seq_length) + for i in range(len(input_ids_batch)): + for j in range(max_length - len(input_ids_batch[i])): + input_ids_batch[i].append(0) + attention_mask_batch[i].append(0) + + if targets_batch is None: + return input_ids_batch, attention_mask_batch + else: + target_ids_batch, lengths = [], [] + for (subj, rel, obj) in targets_batch: + target = f" {subj} {rel} {obj}" + encoding = self.tokenizer.encode_plus(text=target, return_attention_mask=True, truncation=True) + input_ids = encoding["input_ids"] + target_ids_batch.append(input_ids) + lengths.append(len(input_ids)) + max_length = max(lengths) + for i in range(len(target_ids_batch)): + for j in range(max_length - len(target_ids_batch[i])): + target_ids_batch[i].append(0) + + return input_ids_batch, attention_mask_batch, target_ids_batch diff --git a/annotators/property_extraction/t5_generative_ie_infer.json b/annotators/property_extraction/t5_generative_ie_infer.json new file mode 100644 index 0000000000..9db32603a3 --- /dev/null +++ b/annotators/property_extraction/t5_generative_ie_infer.json @@ -0,0 +1,49 @@ +{ + "chainer": { + "in": ["question"], + "pipe": [ + { + "class_name": "src.torch_transformers_preprocessor:T5GenerativeIEPreprocessor", + "vocab_file": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "max_seq_length": 512, + "in": ["question"], + "out": ["input_ids", "attention_mask"] + }, + { + "class_name": "src.t5_generative_ie:T5GenerativeIE", + "pretrained_transformer": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 3e-05, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "learning_rate_drop_patience": 6, + "learning_rate_drop_div": 1.5, + "in": ["input_ids", "attention_mask"], + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + "metadata": { + "variables": { + "TRANSFORMER": 
"t5-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/t5_base_generative_ie" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/t5_base_generative_ie.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py new file mode 100644 index 0000000000..b043194e16 --- /dev/null +++ b/annotators/property_extraction/test_property_extraction.py @@ -0,0 +1,16 @@ +import requests + + +def main(): + url = "http://0.0.0.0:8126/respond" + + request_data = [{"utterances": ["i live in moscow"]}] + + count = 0 + for data in request_data: + result = requests.post(url, json=data).json() + print(result) + + +if __name__ == "__main__": + main() diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml index 3fb0f8df13..13aa17384e 100755 --- a/assistant_dists/dream/dev.yml +++ b/assistant_dists/dream/dev.yml @@ -406,4 +406,10 @@ services: - "./common:/src/common" ports: - 8120:8120 + property-extraction: + volumes: + - "./annotators/property_extraction:/src" + - "~/.deeppavlov:/root/.deeppavlov" + ports: + - 8126:8126 version: "3.7" diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 171f4a91ed..f59673ea78 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -19,7 +19,7 @@ services: dff-funfact-skill:8104, dff-bot-persona-skill:8105, news-api-annotator:8112, dff-gossip-skill:8109, dff-wiki-skill:8111, dff-gaming-skill:8115, topic-recommendation:8113, user-persona-extractor:8114, wiki-facts:8116, dff-music-skill:8099, entity-detection:8103, dff-art-skill:8117, - midas-predictor:8121, dialogpt:8125, infilling:8122, dff-template-skill:8120" + midas-predictor:8121, dialogpt:8125, infilling:8122, dff-template-skill:8120, property-extraction:8126" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} convers-evaluator-annotator: env_file: [.env] @@ -1203,4 +1203,24 @@ services: memory: 128M reservations: memory: 128M + + property-extraction: + env_file: [.env] + build: + args: + CONFIG: t5_generative_ie_infer.json + PORT: 8126 + SRC_DIR: annotators/property_extraction/ + context: ./ + dockerfile: annotators/property_extraction/Dockerfile + command: flask run -h 0.0.0.0 -p 8126 + environment: + - FLASK_APP=server + - CUDA_VISIBLE_DEVICES=7 + deploy: + resources: + limits: + memory: 2.5G + reservations: + memory: 2.5G version: '3.7' diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index 32a4f15642..056347b15a 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -303,6 +303,19 @@ "annotators.spacy_nounphrases" ] }, + "property_extraction": { + "connector": { + "protocol": "http", + "timeout": 1, + "url": "http://property-extraction:8126/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "state_manager_method": "add_annotation", + "previous_services": [ + "annotators.entity_linking" + ] + }, "wiki_parser": { "connector": { "protocol": "http", diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index fdcac14901..c87c9692d9 100755 --- 
a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -211,6 +211,21 @@ def entity_detection_formatter_dialog(dialog: Dict) -> List[Dict]: return [{"sentences": context}] +def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: + out = open(f"{len(dialog['human_utterances'])}.json", 'w') + json.dump(dialog, out, indent=2) + out.close() + entities_with_labels = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=True) + entity_info_list = dialog["human_utterances"][-1]["annotations"].get("entity_linking", [{}]) + return [ + { + "utterances": [dialog["human_utterances"][-1]["text"]], + "entities_with_labels": [entities_with_labels], + "entity_info": [entity_info_list], + } + ] + + def preproc_last_human_utt_dialog_w_hist(dialog: Dict) -> List[Dict]: # Used by: sentseg over human uttrs last_human_utt = dialog["human_utterances"][-1]["annotations"].get( From edcd39d9166bb4284a9cc2f58918a83d09ccba65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Tue, 27 Sep 2022 14:08:04 +0300 Subject: [PATCH 02/40] fixes --- annotators/property_extraction/src/t5_generative_ie.py | 8 ++++++-- state_formatters/dp_formatters.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/annotators/property_extraction/src/t5_generative_ie.py b/annotators/property_extraction/src/t5_generative_ie.py index 66649555f7..559142b433 100644 --- a/annotators/property_extraction/src/t5_generative_ie.py +++ b/annotators/property_extraction/src/t5_generative_ie.py @@ -154,8 +154,12 @@ def __call__(self, input_ids_batch, attention_mask_batch): processed_answers.append(answer) processed_scores.append(score) if self.top_n == 1: - answers_batch.append(processed_answers[0]) - scores_batch.append(processed_scores[0]) + if processed_answers: + answers_batch.append(processed_answers[0]) + scores_batch.append(processed_scores[0]) + else: + answers_batch.append("") + scores_batch.append(0.0) else: answers_batch.append(processed_answers) scores_batch.append(processed_scores) diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index c87c9692d9..67e03ed1ee 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -212,9 +212,6 @@ def entity_detection_formatter_dialog(dialog: Dict) -> List[Dict]: def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: - out = open(f"{len(dialog['human_utterances'])}.json", 'w') - json.dump(dialog, out, indent=2) - out.close() entities_with_labels = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=True) entity_info_list = dialog["human_utterances"][-1]["annotations"].get("entity_linking", [{}]) return [ From ce378423a3ec3c80696f52879e3cbbea812f9108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Tue, 27 Sep 2022 17:25:24 +0300 Subject: [PATCH 03/40] fixes --- annotators/entity_linking/src/entity_linking.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/annotators/entity_linking/src/entity_linking.py b/annotators/entity_linking/src/entity_linking.py index 5910023c7b..bc66b07edd 100644 --- a/annotators/entity_linking/src/entity_linking.py +++ b/annotators/entity_linking/src/entity_linking.py @@ -59,7 +59,6 @@ def __init__( **kwargs, ) -> None: """ - Args: load_path: path to folder with inverted index files entity_ranker: 
component deeppavlov.models.kbqa.rel_ranking_bert @@ -333,7 +332,7 @@ def find_exact_match(self, entity_substr, tags): cand_ent_init = self.process_cand_ent( cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf ) - if tags and tags[0][0] == "misc" and not cand_ent_init: + if tags and ((tags[0][0] == "misc" and not cand_ent_init) or tags[0][1] < 0.7): for tag in self.cursors: query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr) res = self.cursors[tag].execute(query) @@ -481,8 +480,8 @@ def rank_by_description( ) for entity, score in scores ] - log.info(f"len entities with scores {len(entities_with_scores)}") - if entity_tags and entity_tags[0][0] == "misc": + log.info(f"len entities with scores {len(entities_with_scores)} --- entity_tags {entity_tags}") + if entity_tags and (entity_tags[0][0] == "misc" or entity_tags[0][1] < 0.7): entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[4]), reverse=True) else: entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[4], x[3]), reverse=True) From 26eee510dc60914c94b839e824474280c5427e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Wed, 28 Sep 2022 16:31:23 +0300 Subject: [PATCH 04/40] fixes --- annotators/property_extraction/server.py | 7 ++++--- annotators/property_extraction/src/t5_generative_ie.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 13f69c5734..b7e8226314 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -42,8 +42,8 @@ def get_result(request): st_time = time.time() uttrs = request.json.get("utterances", []) - entities_with_labels_batch = request.json.get("entities_with_labels", []) - entity_info_batch = request.json.get("entity_info", []) + entities_with_labels_batch = request.json.get("entities_with_labels", [[] for _ in uttrs]) + entity_info_batch = request.json.get("entity_info", [[] for _ in uttrs]) triplets_batch = [] outputs, scores = generative_ie(uttrs) @@ -55,6 +55,7 @@ def get_result(request): if triplet[0] == "i": triplet[0] = "user" triplets_batch.append(triplet) + logger.info(f"outputs {outputs} scores {scores} triplets_batch {triplets_batch}") if rel_cls_flag: rels = rel_cls(uttrs) logger.info(f"classified relations: {rels}") @@ -74,7 +75,7 @@ def get_result(request): uttr = uttr.lower() entity_substr_dict = {} formatted_triplet = {} - if len(uttr.split()) > 1: + if len(uttr.split()) > 2: for entity in entities_with_labels: if "text" in entity: entity_substr = entity["text"] diff --git a/annotators/property_extraction/src/t5_generative_ie.py b/annotators/property_extraction/src/t5_generative_ie.py index 559142b433..da4ad10711 100644 --- a/annotators/property_extraction/src/t5_generative_ie.py +++ b/annotators/property_extraction/src/t5_generative_ie.py @@ -54,7 +54,7 @@ def __init__( generate_max_length: int = 50, top_n: int = 1, batch_decode: bool = False, - scores_thres: float = -0.055, + scores_thres: float = -0.17, **kwargs, ) -> None: @@ -146,6 +146,7 @@ def __call__(self, input_ids_batch, attention_mask_batch): scores = outputs.sequences_scores scores = scores.cpu().numpy().tolist() answers = [self.tokenizer.decode(output, skip_special_tokens=False) for output in sequences] + logger.info(f"triplets {answers} scores {scores}") processed_answers, 
processed_scores = [], [] for answer, score in zip(answers, scores): if score > self.scores_thres: From 44ddf4664945fbfbb1d6c31caa2b1dfd8a21f744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 29 Sep 2022 11:09:28 +0300 Subject: [PATCH 05/40] add finegrained_types to property extraction --- .../entity_linking/entity_linking_eng.json | 4 +- annotators/entity_linking/server.py | 21 +++++++-- .../entity_linking/src/entity_linking.py | 43 +++++++++++++------ annotators/property_extraction/server.py | 3 ++ .../test_property_extraction.py | 1 - 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/annotators/entity_linking/entity_linking_eng.json b/annotators/entity_linking/entity_linking_eng.json index c07ab31058..1f6587df4e 100644 --- a/annotators/entity_linking/entity_linking_eng.json +++ b/annotators/entity_linking/entity_linking_eng.json @@ -15,7 +15,7 @@ { "class_name": "src.entity_linking:EntityLinker", "in": ["entity_substr", "entity_tags", "sentences"], - "out": ["entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"], + "out": ["entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"], "load_path": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream", "add_info_filename": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream/add_info.db", "tags_filename": "{MODELS_PATH}/finegrained_tags/tag.dict", @@ -35,7 +35,7 @@ "lang": "en" } ], - "out": ["entity_substr", "entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"] + "out": ["entity_substr", "entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"] }, "metadata": { "variables": { diff --git a/annotators/entity_linking/server.py b/annotators/entity_linking/server.py index 7dc5b34322..3e021a599d 100644 --- a/annotators/entity_linking/server.py +++ b/annotators/entity_linking/server.py @@ -51,6 +51,7 @@ def respond(): entity_substr_batch, entity_ids_batch, conf_batch, + entity_id_tags_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch, @@ -60,21 +61,35 @@ def respond(): entity_substr_list, entity_ids_list, conf_list, + entity_id_tags_list, entity_pages_list, first_pars_list, dbpedia_types_list, ) in zip( - entity_substr_batch, entity_ids_batch, conf_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch + entity_substr_batch, + entity_ids_batch, + conf_batch, + entity_id_tags_batch, + entity_pages_batch, + first_pars_batch, + dbpedia_types_batch, ): entity_info_list = [] - for entity_substr, entity_ids, confs, entity_pages, first_pars, dbpedia_types in zip( - entity_substr_list, entity_ids_list, conf_list, entity_pages_list, first_pars_list, dbpedia_types_list + for entity_substr, entity_ids, confs, entity_id_tags, entity_pages, first_pars, dbpedia_types in zip( + entity_substr_list, + entity_ids_list, + conf_list, + entity_id_tags_list, + entity_pages_list, + first_pars_list, + dbpedia_types_list, ): entity_info = {} entity_info["entity_substr"] = entity_substr entity_info["entity_ids"] = entity_ids entity_info["confidences"] = [float(elem[2]) for elem in confs] entity_info["tokens_match_conf"] = [float(elem[0]) for elem in confs] + entity_info["entity_id_tags"] = entity_id_tags entity_info["pages_titles"] = entity_pages entity_info["first_paragraphs"] = first_pars entity_info["dbpedia_types"] = dbpedia_types diff --git a/annotators/entity_linking/src/entity_linking.py 
b/annotators/entity_linking/src/entity_linking.py index bc66b07edd..a62999eedd 100644 --- a/annotators/entity_linking/src/entity_linking.py +++ b/annotators/entity_linking/src/entity_linking.py @@ -167,11 +167,11 @@ def __call__( entity_offsets_list.append([st_offset, end_offset]) entity_offsets_batch.append(entity_offsets_list) - entity_ids_batch, entity_conf_batch, entity_pages_batch = [], [], [] + entity_ids_batch, entity_conf_batch, entity_pages_batch, entity_id_tags_batch = [], [], [], [] for entity_substr_list, entity_offsets_list, entity_tags_list, sentences_list, sentences_offsets_list in zip( entity_substr_batch, entity_offsets_batch, entity_tags_batch, sentences_batch, sentences_offsets_batch ): - entity_ids_list, entity_conf_list, entity_pages_list = self.link_entities( + entity_ids_list, entity_conf_list, entity_pages_list, entity_id_tags_list = self.link_entities( entity_substr_list, entity_offsets_list, entity_tags_list, @@ -185,9 +185,17 @@ def __call__( entity_pages_list = [entity_pages[: self.num_entities_to_return] for entity_pages in entity_pages_list] entity_ids_batch.append(entity_ids_list) entity_conf_batch.append(entity_conf_list) + entity_id_tags_batch.append(entity_id_tags_list) entity_pages_batch.append(entity_pages_list) first_par_batch, dbpedia_types_batch = self.extract_add_info(entity_pages_batch) - return entity_ids_batch, entity_conf_batch, entity_pages_batch, first_par_batch, dbpedia_types_batch + return ( + entity_ids_batch, + entity_conf_batch, + entity_id_tags_batch, + entity_pages_batch, + first_par_batch, + dbpedia_types_batch, + ) def extract_add_info(self, entity_pages_batch: List[List[List[str]]]): first_par_batch, dbpedia_types_batch = [], [] @@ -226,7 +234,8 @@ def link_entities( f"entity_substr_list {entity_substr_list} entity_tags_list {entity_tags_list} " f"entity_offsets_list {entity_offsets_list}" ) - entity_ids_list, conf_list, pages_list, pages_dict_list, descr_list = [], [], [], [], [] + entity_ids_list, conf_list, pages_list, entity_id_tags_list, descr_list = [], [], [], [], [] + pages_dict_list = [] if entity_substr_list: entities_scores_list = [] cand_ent_scores_list = [] @@ -283,15 +292,22 @@ def link_entities( cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking] cand_ent_scores_list.append(cand_ent_scores) entity_ids = [elem[0] for elem in cand_ent_scores] - pages = [elem[5] for elem in cand_ent_scores] + entity_id_tags = [elem[5] for elem in cand_ent_scores] + pages = [elem[6] for elem in cand_ent_scores] scores = [elem[1:5] for elem in cand_ent_scores] entities_scores_list.append( {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)} ) entity_ids_list.append(entity_ids) + entity_id_tags_list.append(entity_id_tags) pages_list.append(pages) - pages_dict_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)}) - descr_list.append([elem[6] for elem in cand_ent_scores]) + pages_dict_list.append( + { + entity_id: (page, entity_id_tag) + for entity_id, page, entity_id_tag in zip(entity_ids, pages, entity_id_tags) + } + ) + descr_list.append([elem[7] for elem in cand_ent_scores]) if self.use_descriptions: substr_lens = [len(entity_substr.split()) for entity_substr in entity_substr_list] @@ -307,16 +323,19 @@ def link_entities( substr_lens, ) pages_list = [ - [pages_dict.get(entity_id, "") for entity_id in entity_ids] + [pages_dict.get(entity_id, ("", ""))[0] for entity_id in entity_ids] for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list) ] - - return 
entity_ids_list, conf_list, pages_list + entity_id_tags_list = [ + [pages_dict.get(entity_id, ("", ""))[1] for entity_id in entity_ids] + for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list) + ] + return entity_ids_list, conf_list, pages_list, entity_id_tags_list def process_cand_ent(self, cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf): - for entity_title, entity_id, entity_rels, anchor_cnt, _, page, descr in entities_and_ids: + for entity_title, entity_id, entity_rels, anchor_cnt, tag, page, descr in entities_and_ids: substr_score = self.calc_substr_score(entity_title, entity_substr_split) - cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, page, descr)) + cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, tag, page, descr)) return cand_ent_init def find_exact_match(self, entity_substr, tags): diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index b7e8226314..f128513eea 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -96,6 +96,9 @@ def get_result(request): entity_substr_dict[entity_substr] = {} entity_substr_dict[entity_substr]["entity_ids"] = entity_info["entity_ids"] entity_substr_dict[entity_substr]["dbpedia_types"] = entity_info.get("dbpedia_types", []) + entity_substr_dict[entity_substr]["finegrained_types"] = entity_info.get( + "dbpedia_types", [] + ) if triplet: formatted_triplet = {"subject": triplet[0], rel_type_dict[triplet[1]]: triplet[1], "object": triplet[2]} triplets_info_batch.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index b043194e16..e36de4a7c5 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -6,7 +6,6 @@ def main(): request_data = [{"utterances": ["i live in moscow"]}] - count = 0 for data in request_data: result = requests.post(url, json=data).json() print(result) From 17c7534f3123cbbd774b0b54f0e8d1460cf51acd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 29 Sep 2022 12:42:41 +0300 Subject: [PATCH 06/40] update --- .../entity_linking/src/entity_linking.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/annotators/entity_linking/src/entity_linking.py b/annotators/entity_linking/src/entity_linking.py index a62999eedd..6a6e0bcf65 100644 --- a/annotators/entity_linking/src/entity_linking.py +++ b/annotators/entity_linking/src/entity_linking.py @@ -92,6 +92,7 @@ def __init__( self.full_paragraph = full_paragraph self.re_tokenizer = re.compile(r"[\w']+|[^\w ]") self.not_found_str = "not in wiki" + self.stemmer = nltk.PorterStemmer() self.related_tags = { "loc": ["gpe", "country", "city", "us_state", "river"], "gpe": ["loc", "country", "city", "us_state"], @@ -106,6 +107,16 @@ def __init__( "politician": ["per"], "writer": ["per"], } + self.not_named_entities_tags = { + "animal", + "food", + "music_genre", + "misc", + "language", + "occupation", + "type_of_sport", + "product", + } self.word_searcher = None if self.words_dict_filename: self.word_searcher = WordSearcher(self.words_dict_filename, self.ngrams_matrix_filename) @@ -279,6 +290,10 @@ def link_entities( corr_words = 
self.word_searcher(entity_substr_split[0], set(clean_tags + corr_clean_tags)) if corr_words: cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags) + if len(entity_substr_split) == 1 and self.stemmer.stem(entity_substr) != entity_substr: + entity_substr_stemmed = self.stemmer.stem(entity_substr) + stem_cand_ent_init = self.find_exact_match(entity_substr_stemmed, tags) + cand_ent_init = {**cand_ent_init, **stem_cand_ent_init} if not cand_ent_init and len(entity_substr_split) > 1: cand_ent_init = self.find_fuzzy_match(entity_substr_split, tags) @@ -353,13 +368,14 @@ def find_exact_match(self, entity_substr, tags): ) if tags and ((tags[0][0] == "misc" and not cand_ent_init) or tags[0][1] < 0.7): for tag in self.cursors: - query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr) - res = self.cursors[tag].execute(query) - entities_and_ids = res.fetchall() - if entities_and_ids: - cand_ent_init = self.process_cand_ent( - cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf - ) + if (tags[0][0] == "misc" and tag in self.not_named_entities_tags) or tags[0][0] != "misc": + query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr) + res = self.cursors[tag].execute(query) + entities_and_ids = res.fetchall() + if entities_and_ids: + cand_ent_init = self.process_cand_ent( + cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf + ) return cand_ent_init def find_fuzzy_match(self, entity_substr_split, tags): From db192c962d7383c9eb3a3c0caef1b5710b317a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Mon, 3 Oct 2022 14:06:00 +0300 Subject: [PATCH 07/40] fixes --- annotators/property_extraction/server.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index f128513eea..2671578838 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -28,6 +28,13 @@ rel_type = "property" rel_type_dict[rel.replace("_", " ")] = rel_type + +def check_triplet(triplet): + if triplet[0] in {"hi", "hello"} or any([word in triplet[0] for word in {" hi ", " hello "}]): + return False + return True + + try: generative_ie = build_model(config_name, download=True) logger.info("property extraction model is loaded.") @@ -62,7 +69,7 @@ def get_result(request): filtered_triplets_batch = [] for triplet, rel in zip(triplets_batch, rels): rel = rel.replace("_", " ") - if len(triplet) == 3 and triplet[1] == rel: + if len(triplet) == 3 and triplet[1] == rel and check_triplet(triplet): filtered_triplets_batch.append(triplet) else: filtered_triplets_batch.append([]) From 5c0e4079650fcf374695feb25ba89490a629f6cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Tue, 4 Oct 2022 17:09:38 +0300 Subject: [PATCH 08/40] fixes --- annotators/property_extraction/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 2671578838..721dee4a69 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -104,7 +104,7 @@ def get_result(request): entity_substr_dict[entity_substr]["entity_ids"] = entity_info["entity_ids"] 
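# For every substring that occurs in the extracted triplet, the annotator records the
# linker's candidate ids together with two kinds of types: dbpedia_types (ontology types of
# the linked page) and finegrained_types (the entity_id_tags exposed by the entity-linking
# annotator). A sketch of the resulting annotation for "i live in moscow", assuming the
# linker found a match (the exact entity info depends on the linker's output):
#   {"triplet": {"subject": "user", "relation": "live in citystatecountry", "object": "moscow"},
#    "entity_info": {"moscow": {"entity_ids": [...], "dbpedia_types": [...], "finegrained_types": [...]}}}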
entity_substr_dict[entity_substr]["dbpedia_types"] = entity_info.get("dbpedia_types", []) entity_substr_dict[entity_substr]["finegrained_types"] = entity_info.get( - "dbpedia_types", [] + "entity_id_tags", [] ) if triplet: formatted_triplet = {"subject": triplet[0], rel_type_dict[triplet[1]]: triplet[1], "object": triplet[2]} From f6cbeada269f61d66ee52e185e159fdc44f88439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Tue, 4 Oct 2022 18:28:21 +0300 Subject: [PATCH 09/40] fix plural nouns --- annotators/entity_linking/src/entity_linking.py | 6 ++++-- annotators/property_extraction/server.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/annotators/entity_linking/src/entity_linking.py b/annotators/entity_linking/src/entity_linking.py index 6a6e0bcf65..3e52224616 100644 --- a/annotators/entity_linking/src/entity_linking.py +++ b/annotators/entity_linking/src/entity_linking.py @@ -219,8 +219,10 @@ def extract_add_info(self, entity_pages_batch: List[List[List[str]]]): query = "SELECT * FROM entity_additional_info WHERE page_title='{}';".format(entity_page) res = self.add_info_cur.execute(query) fetch_res = res.fetchall() - first_par = fetch_res[0][1] - dbpedia_types_elem = fetch_res[0][2].split() + first_par, dbpedia_types = "", [] + if fetch_res: + first_par = fetch_res[0][1] + dbpedia_types_elem = fetch_res[0][2].split() first_pars.append(first_par) dbpedia_types.append(dbpedia_types_elem) except Exception as e: diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 721dee4a69..40a53fafdd 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -3,6 +3,7 @@ import re import time +import nltk import sentry_sdk from flask import Flask, jsonify, request @@ -14,6 +15,8 @@ logger = logging.getLogger(__name__) app = Flask(__name__) +stemmer = nltk.PorterStemmer() + config_name = os.getenv("CONFIG") rel_cls_flag = int(os.getenv("REL_CLS_FLAG", "0")) @@ -98,7 +101,10 @@ def get_result(request): for entity_info in entity_info_list: if entity_info and "entity_substr" in entity_info and "entity_ids" in entity_info: entity_substr = entity_info["entity_substr"] - if triplet and entity_substr in [triplet[0], triplet[2]]: + if triplet and ( + entity_substr in [triplet[0], triplet[2]] + or stemmer.stem(entity_substr) in [triplet[0], triplet[2]] + ): if entity_substr not in entity_substr_dict: entity_substr_dict[entity_substr] = {} entity_substr_dict[entity_substr]["entity_ids"] = entity_info["entity_ids"] From e389411f17551c0bdacbdbab7c7dc33e21bcead9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Wed, 5 Oct 2022 10:42:33 +0300 Subject: [PATCH 10/40] add triplets --- annotators/property_extraction/server.py | 20 ++++++++++++++++---- state_formatters/dp_formatters.py | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 40a53fafdd..2c19fa3a0a 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -52,6 +52,7 @@ def check_triplet(triplet): def get_result(request): st_time = time.time() uttrs = request.json.get("utterances", []) + named_entities_batch = request.json.get("named_entities", [[] for _ in uttrs]) entities_with_labels_batch = 
request.json.get("entities_with_labels", [[] for _ in uttrs]) entity_info_batch = request.json.get("entity_info", [[] for _ in uttrs]) @@ -79,12 +80,12 @@ def get_result(request): triplets_batch = filtered_triplets_batch triplets_info_batch = [] - for triplet, uttr, entities_with_labels, entity_info_list in zip( - triplets_batch, uttrs, entities_with_labels_batch, entity_info_batch + for triplet, uttr, named_entities, entities_with_labels, entity_info_list in zip( + triplets_batch, uttrs, named_entities_batch, entities_with_labels_batch, entity_info_batch ): uttr = uttr.lower() entity_substr_dict = {} - formatted_triplet = {} + formatted_triplet, per_triplet = {}, {} if len(uttr.split()) > 2: for entity in entities_with_labels: if "text" in entity: @@ -114,8 +115,19 @@ def get_result(request): ) if triplet: formatted_triplet = {"subject": triplet[0], rel_type_dict[triplet[1]]: triplet[1], "object": triplet[2]} - triplets_info_batch.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + named_entities_list = [] + for elem in named_entities: + for entity in elem: + named_entities_list.append(entity) + per_entities = [entity for entity in named_entities_list if entity.get("type", "") == "PER"] + if triplet[1] in {"have pet", "have family", "have sibling", "have chidren"} and per_entities: + per_triplet = {"subject": triplet[2], "property": "name", "object": per_entities[0].get("text", "")} + triplets_info_batch.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + if per_triplet: + triplets_info_batch.append( + {"triplet": per_triplet, "entity_info": {per_triplet["object"]: {"entity_id_tags": ["PER"]}}} + ) total_time = time.time() - st_time logger.info(f"property extraction exec time: {total_time: .3f}s") logger.info(f"property extraction, input {uttrs}, output {triplets_info_batch} scores {scores}") diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index b359777cf3..83773b1ba1 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -214,10 +214,12 @@ def entity_detection_formatter_dialog(dialog: Dict) -> List[Dict]: def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: entities_with_labels = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=True) entity_info_list = dialog["human_utterances"][-1]["annotations"].get("entity_linking", [{}]) + named_entities = dialog["human_utterances"][-1]["annotations"].get("ner", [{}]) return [ { "utterances": [dialog["human_utterances"][-1]["text"]], "entities_with_labels": [entities_with_labels], + "named_entities": [named_entities], "entity_info": [entity_info_list], } ] From 436a075a6be992941db86c81a27a9e01ab372154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 6 Oct 2022 10:42:50 +0300 Subject: [PATCH 11/40] add to yml files --- annotators/property_extraction/Dockerfile | 2 +- annotators/property_extraction/test.sh | 4 ++++ assistant_dists/dream/dev.yml | 4 ++-- .../dream/docker-compose.override.yml | 8 ++++---- assistant_dists/dream/gpu1.yml | 6 ++++++ assistant_dists/dream/proxy.yml | 18 +++++++++--------- assistant_dists/dream/test.yml | 5 +++++ tests/runtests.sh | 2 +- 8 files changed, 32 insertions(+), 17 deletions(-) create mode 100755 annotators/property_extraction/test.sh diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile index c541035263..b178867561 
100644 --- a/annotators/property_extraction/Dockerfile +++ b/annotators/property_extraction/Dockerfile @@ -17,4 +17,4 @@ COPY $SRC_DIR /src WORKDIR /src -CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8126 +CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8129 diff --git a/annotators/property_extraction/test.sh b/annotators/property_extraction/test.sh new file mode 100755 index 0000000000..4088512108 --- /dev/null +++ b/annotators/property_extraction/test.sh @@ -0,0 +1,4 @@ +#!/bin/bash + + +python test_property_extraction.py diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml index 1267e4400c..8c6f440428 100755 --- a/assistant_dists/dream/dev.yml +++ b/assistant_dists/dream/dev.yml @@ -83,7 +83,7 @@ services: ner: volumes: - './annotators/NER_deeppavlov:/src' - - "~/.deeppavlov:/root/.deeppavlov" + - "/archive/evseev/.deeppavlov:/root/.deeppavlov" ports: - 8021:8021 eliza: @@ -426,5 +426,5 @@ services: - "./annotators/property_extraction:/src" - "~/.deeppavlov:/root/.deeppavlov" ports: - - 8126:8126 + - 8129:8129 version: "3.7" diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index b845e9005e..d9876bc567 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -20,7 +20,7 @@ services: dff-gossip-skill:8109, dff-wiki-skill:8111, dff-gaming-skill:8115, topic-recommendation:8113, user-persona-extractor:8114, wiki-facts:8116, dff-music-skill:8099, entity-detection:8103, dff-art-skill:8117, midas-predictor:8121, dialogpt:8125, storygpt:8126, prompt-storygpt:8127, infilling:8122, - property-extraction:8126, dff-template-skill:8120" + property-extraction:8129, dff-template-skill:8120" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} convers-evaluator-annotator: env_file: [.env] @@ -1251,14 +1251,14 @@ services: build: args: CONFIG: t5_generative_ie_infer.json - PORT: 8126 + PORT: 8129 SRC_DIR: annotators/property_extraction/ context: ./ dockerfile: annotators/property_extraction/Dockerfile - command: flask run -h 0.0.0.0 -p 8126 + command: flask run -h 0.0.0.0 -p 8129 environment: - FLASK_APP=server - - CUDA_VISIBLE_DEVICES=7 + - CUDA_VISIBLE_DEVICES=0 deploy: resources: limits: diff --git a/assistant_dists/dream/gpu1.yml b/assistant_dists/dream/gpu1.yml index 2366444953..10dc738996 100644 --- a/assistant_dists/dream/gpu1.yml +++ b/assistant_dists/dream/gpu1.yml @@ -199,4 +199,10 @@ services: - CUDA_VISIBLE_DEVICES=7 dff-template-skill: restart: unless-stopped + property-extraction: + restart: unless-stopped + volumes: + - "~/.deeppavlov:/root/.deeppavlov" + environment: + - CUDA_VISIBLE_DEVICES=8 version: '3.7' diff --git a/assistant_dists/dream/proxy.yml b/assistant_dists/dream/proxy.yml index cc49824567..281acaa34d 100644 --- a/assistant_dists/dream/proxy.yml +++ b/assistant_dists/dream/proxy.yml @@ -278,15 +278,6 @@ services: - PROXY_PASS=dream.deeppavlov.ai:8074 - PORT=8074 - entity-linking: - command: [ "nginx", "-g", "daemon off;" ] - build: - context: dp/proxy/ - dockerfile: Dockerfile - environment: - - PROXY_PASS=dream.deeppavlov.ai:8075 - - PORT=8075 - wiki-parser: command: [ "nginx", "-g", "daemon off;" ] build: @@ -629,4 +620,13 @@ services: environment: - PROXY_PASS=dream.deeppavlov.ai:8127 - PORT=8127 + + property-extraction: + command: [ "nginx", "-g", "daemon off;" ] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=dream.deeppavlov.ai:8129 + - PORT=8129 version: '3.7' diff 
--git a/assistant_dists/dream/test.yml b/assistant_dists/dream/test.yml index 9737806c0d..4de5a7fb04 100644 --- a/assistant_dists/dream/test.yml +++ b/assistant_dists/dream/test.yml @@ -129,4 +129,9 @@ services: environment: - CUDA_VISIBLE_DEVICES=7 dff-template-skill: + property-extraction: + volumes: + - "~/.deeppavlov:/root/.deeppavlov" + environment: + - CUDA_VISIBLE_DEVICES=8 version: '3.7' diff --git a/tests/runtests.sh b/tests/runtests.sh index 888c0bb9e0..5d2ffa09fa 100755 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -149,7 +149,7 @@ if [[ "$MODE" == "test_skills" || "$MODE" == "all" ]]; then dff-gossip-skill dff-wiki-skill topic-recommendation dff-science-skill personal-info-skill \ user-persona-extractor small-talk-skill wiki-facts dff-art-skill dff-funfact-skill \ meta-script-skill spelling-preprocessing dff-gaming-skill dialogpt \ - dff-music-skill dff-bot-persona-skill entity-detection midas-predictor infilling; do + dff-music-skill dff-bot-persona-skill entity-detection midas-predictor infilling property-extraction; do echo "Run tests for $container" dockercompose_cmd exec -T -u $(id -u) $container ./test.sh From a4906aa03cae5cc4d93900a0e1261b9197742ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 6 Oct 2022 11:02:42 +0300 Subject: [PATCH 12/40] fix tests --- .../test_property_extraction.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index e36de4a7c5..25720a1e7e 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -2,14 +2,25 @@ def main(): - url = "http://0.0.0.0:8126/respond" + url = "http://0.0.0.0:8129/respond" request_data = [{"utterances": ["i live in moscow"]}] + gold_results = [ + {"entity_info": {}, "triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}} + ] - for data in request_data: + count = 0 + for data, gold_result in zip(request_data, gold_results): result = requests.post(url, json=data).json() + if result and result[0] == gold_result: + count += 1 + else: + print(f"Got {result}, but expected: {gold_result}") print(result) + assert count == len(request_data) + print("Success") + if __name__ == "__main__": main() From 96485075174e82115cea5fc22a7de8e7bb1b76fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Sat, 8 Oct 2022 20:52:17 +0300 Subject: [PATCH 13/40] entity linking input from property extraction --- annotators/property_extraction/server.py | 17 ++++++++++---- .../test_property_extraction.py | 4 +--- assistant_dists/dream/pipeline_conf.json | 22 ++++++++++--------- state_formatters/dp_formatters.py | 6 +++++ 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 2c19fa3a0a..b4d98bb515 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -19,6 +19,7 @@ config_name = os.getenv("CONFIG") rel_cls_flag = int(os.getenv("REL_CLS_FLAG", "0")) +add_entity_info = int(os.getenv("ADD_ENTITY_INFO", "0")) rel_type_dict = {} with open("rel_list.txt", "r") as fl: @@ -123,11 +124,19 @@ def get_result(request): if triplet[1] in {"have pet", "have 
family", "have sibling", "have chidren"} and per_entities: per_triplet = {"subject": triplet[2], "property": "name", "object": per_entities[0].get("text", "")} - triplets_info_batch.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + triplets_info_list = [] + if add_entity_info: + triplets_info_list.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + else: + triplets_info_list.append({"triplet": formatted_triplet}) if per_triplet: - triplets_info_batch.append( - {"triplet": per_triplet, "entity_info": {per_triplet["object"]: {"entity_id_tags": ["PER"]}}} - ) + if add_entity_info: + triplets_info_list.append( + {"triplet": per_triplet, "entity_info": {per_triplet["object"]: {"entity_id_tags": ["PER"]}}} + ) + else: + triplets_info_list.append({"triplet": per_triplet}) + triplets_info_batch.append(triplets_info_list) total_time = time.time() - st_time logger.info(f"property extraction exec time: {total_time: .3f}s") logger.info(f"property extraction, input {uttrs}, output {triplets_info_batch} scores {scores}") diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index 25720a1e7e..92d729fde4 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -5,9 +5,7 @@ def main(): url = "http://0.0.0.0:8129/respond" request_data = [{"utterances": ["i live in moscow"]}] - gold_results = [ - {"entity_info": {}, "triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}} - ] + gold_results = [{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}] count = 0 for data, gold_result in zip(request_data, gold_results): diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index f6ebf22a85..19785ba9e3 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -301,32 +301,34 @@ "annotators.entity_linking" ] }, - "entity_linking": { + "property_extraction": { "connector": { "protocol": "http", "timeout": 1, - "url": "http://entity-linking:8075/model" + "url": "http://property-extraction:8129/respond" }, - "dialog_formatter": "state_formatters.dp_formatters:el_formatter_dialog", + "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_dialog", "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", "state_manager_method": "add_annotation", "previous_services": [ - "annotators.ner", - "annotators.entity_detection", - "annotators.spacy_nounphrases" + "annotators.spelling_preprocessing", + "annotators.sentseg" ] }, - "property_extraction": { + "entity_linking": { "connector": { "protocol": "http", "timeout": 1, - "url": "http://property-extraction:8126/respond" + "url": "http://entity-linking:8075/model" }, - "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_dialog", + "dialog_formatter": "state_formatters.dp_formatters:el_formatter_dialog", "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", "state_manager_method": "add_annotation", "previous_services": [ - "annotators.entity_linking" + "annotators.ner", + "annotators.entity_detection", + "annotators.spacy_nounphrases", + "annotators.property_extraction" ] }, "wiki_parser": { diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 83773b1ba1..89fb2cee3f 
100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -600,6 +600,12 @@ def el_formatter_dialog(dialog: Dict): entity_tags_list.append([[entity["label"].lower(), 1.0]]) else: entity_tags_list.append([["misc", 1.0]]) + triplets = dialog["human_utterances"][-1]["annotations"].get("property_extraction", [{}]) + for triplet in triplets: + object_entity_substr = triplet.get("object", "") + if object_entity_substr and object_entity_substr not in entity_substr_list: + entity_substr_list.append(object_entity_substr) + entity_tags_list.append([["misc", 1.0]]) dialog = utils.get_last_n_turns(dialog, bot_last_turns=1) dialog = utils.replace_with_annotated_utterances(dialog, mode="punct_sent") context = [[uttr["text"] for uttr in dialog["utterances"][-num_last_utterances:]]] From 55e3c73aef228cb0353bfc6d56ec08ae229649ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 13 Oct 2022 11:56:14 +0300 Subject: [PATCH 14/40] fixes --- tests/runtests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/runtests.sh diff --git a/tests/runtests.sh b/tests/runtests.sh old mode 100644 new mode 100755 From 6df8689d6a5d60b90c027114cab8c6eb5102df97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 13 Oct 2022 14:07:03 +0300 Subject: [PATCH 15/40] change gpu number --- assistant_dists/dream/gpu1.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assistant_dists/dream/gpu1.yml b/assistant_dists/dream/gpu1.yml index 25ea8fd4ab..fde19b481e 100644 --- a/assistant_dists/dream/gpu1.yml +++ b/assistant_dists/dream/gpu1.yml @@ -208,5 +208,5 @@ services: volumes: - "~/.deeppavlov:/root/.deeppavlov" environment: - - CUDA_VISIBLE_DEVICES=8 + - CUDA_VISIBLE_DEVICES=4 version: '3.7' From b2357205ea6d7e7dd97e7f13a5e8685b819dcdbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 13 Oct 2022 14:33:18 +0300 Subject: [PATCH 16/40] model on cpu --- annotators/property_extraction/src/t5_generative_ie.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/annotators/property_extraction/src/t5_generative_ie.py b/annotators/property_extraction/src/t5_generative_ie.py index da4ad10711..1d8c42818c 100644 --- a/annotators/property_extraction/src/t5_generative_ie.py +++ b/annotators/property_extraction/src/t5_generative_ie.py @@ -55,6 +55,7 @@ def __init__( top_n: int = 1, batch_decode: bool = False, scores_thres: float = -0.17, + device: str = "cpu", **kwargs, ) -> None: @@ -77,6 +78,7 @@ def __init__( self.scores_thres = scores_thres super().__init__( + device=device, optimizer=optimizer, optimizer_parameters=optimizer_parameters, learning_rate_drop_patience=learning_rate_drop_patience, @@ -85,6 +87,7 @@ def __init__( min_learning_rate=min_learning_rate, **kwargs, ) + self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu") def train_on_batch(self, input_ids_batch, attention_mask_batch, target_ids_batch) -> Dict: input_ids_batch = torch.LongTensor(input_ids_batch).to(self.device) From 16f7f6d42bba4c1fc4852b7d984d0e58c558ef4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 13 Oct 2022 16:03:40 +0300 Subject: [PATCH 17/40] 
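As a minimal worked example of the el_formatter_dialog change in patch 13 above: objects produced by property extraction are appended as additional entity-linking candidates with a generic "misc" tag. The annotation value below is an assumed example, not real pipeline output.

triplets = [{"subject": "user", "relation": "have pet", "object": "dog"}]  # hypothetical annotation
entity_substr_list, entity_tags_list = [], []
for triplet in triplets:
    object_entity_substr = triplet.get("object", "")
    if object_entity_substr and object_entity_substr not in entity_substr_list:
        entity_substr_list.append(object_entity_substr)
        entity_tags_list.append([["misc", 1.0]])
# entity_substr_list == ["dog"], entity_tags_list == [[["misc", 1.0]]]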
add entity linking to proxy.yml --- assistant_dists/dream/proxy.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/assistant_dists/dream/proxy.yml b/assistant_dists/dream/proxy.yml index 6d2fe2809f..d944631181 100644 --- a/assistant_dists/dream/proxy.yml +++ b/assistant_dists/dream/proxy.yml @@ -277,6 +277,15 @@ services: environment: - PROXY_PASS=dream.deeppavlov.ai:8074 - PORT=8074 + + entity-linking: + command: [ "nginx", "-g", "daemon off;" ] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=dream.deeppavlov.ai:8075 + - PORT=8075 wiki-parser: command: [ "nginx", "-g", "daemon off;" ] From ef744a2b686263676a2bf42ac26905bcd02b975d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 13 Oct 2022 17:09:31 +0300 Subject: [PATCH 18/40] fix tests --- annotators/property_extraction/test_property_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index 92d729fde4..685f07f3bc 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -5,7 +5,7 @@ def main(): url = "http://0.0.0.0:8129/respond" request_data = [{"utterances": ["i live in moscow"]}] - gold_results = [{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}] + gold_results = [[{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}]] count = 0 for data, gold_result in zip(request_data, gold_results): From ce116789428855255527363091e7b82f9928cd05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Fri, 14 Oct 2022 11:50:01 +0300 Subject: [PATCH 19/40] fix dp version and property extraction to cpu --- annotators/property_extraction/Dockerfile | 2 +- assistant_dists/dream/gpu1.yml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile index b178867561..d019eb006e 100644 --- a/annotators/property_extraction/Dockerfile +++ b/annotators/property_extraction/Dockerfile @@ -1,4 +1,4 @@ -FROM deeppavlov/base-gpu +FROM deeppavlov/base-gpu:0.17.6 RUN apt-get update && apt-get install git -y diff --git a/assistant_dists/dream/gpu1.yml b/assistant_dists/dream/gpu1.yml index fde19b481e..2da069e16d 100644 --- a/assistant_dists/dream/gpu1.yml +++ b/assistant_dists/dream/gpu1.yml @@ -207,6 +207,4 @@ services: restart: unless-stopped volumes: - "~/.deeppavlov:/root/.deeppavlov" - environment: - - CUDA_VISIBLE_DEVICES=4 version: '3.7' From 1bbc1e9ff31a634f73fc3943233c413891c696f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Mon, 17 Oct 2022 12:24:54 +0300 Subject: [PATCH 20/40] remove env variables --- annotators/property_extraction/Dockerfile | 3 --- assistant_dists/dream/docker-compose.override.yml | 1 - assistant_dists/dream/test.yml | 2 -- 3 files changed, 6 deletions(-) diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile index d019eb006e..347a18e19c 100644 --- a/annotators/property_extraction/Dockerfile +++ b/annotators/property_extraction/Dockerfile @@ -3,12 +3,9 @@ FROM 
deeppavlov/base-gpu:0.17.6 RUN apt-get update && apt-get install git -y ARG CONFIG -ARG PORT ARG SRC_DIR -ARG SED_ARG=" | " ENV CONFIG=$CONFIG -ENV PORT=$PORT COPY ./annotators/property_extraction/requirements.txt /src/requirements.txt RUN pip install -r /src/requirements.txt diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 194d67f4d1..3bda85ec39 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -1299,7 +1299,6 @@ services: command: flask run -h 0.0.0.0 -p 8129 environment: - FLASK_APP=server - - CUDA_VISIBLE_DEVICES=0 deploy: resources: limits: diff --git a/assistant_dists/dream/test.yml b/assistant_dists/dream/test.yml index c2e0204632..58d5519563 100644 --- a/assistant_dists/dream/test.yml +++ b/assistant_dists/dream/test.yml @@ -135,6 +135,4 @@ services: property-extraction: volumes: - "~/.deeppavlov:/root/.deeppavlov" - environment: - - CUDA_VISIBLE_DEVICES=8 version: '3.7' From 6f36573535bd5196167a00c1d6c6875b6b665fe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Wed, 14 Dec 2022 19:49:23 +0300 Subject: [PATCH 21/40] annotatate bot utterances --- assistant_dists/dream/pipeline_conf.json | 13 +++++++++++++ state_formatters/dp_formatters.py | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index 4f59fd79fe..cd92859ac0 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -111,6 +111,19 @@ ], "state_manager_method": "add_annotation_prev_bot_utt" }, + "property_extraction": { + "connector": { + "protocol": "http", + "url": "http://property-extraction:8129/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_last_bot_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "state_manager_method": "add_annotation_prev_bot_utt", + "previous_services": [ + "annotators.spelling_preprocessing", + "annotators.sentseg" + ] + }, "sentrewrite": { "connector": "connectors.sentrewrite", "dialog_formatter": "state_formatters.dp_formatters:sent_rewrite_formatter_w_o_last_dialog", diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 088fa9d242..524a5725bf 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -225,6 +225,14 @@ def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: ] +def property_extraction_formatter_last_bot_dialog(dialog: Dict) -> List[Dict]: + return [ + { + "utterances": [dialog["bot_utterances"][-1]["text"]], + } + ] + + def preproc_last_human_utt_dialog_w_hist(dialog: Dict) -> List[Dict]: # Used by: sentseg over human uttrs last_human_utt = dialog["human_utterances"][-1]["annotations"].get( From c604088cdd88af0b2b216edeecb56e3395ac52b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 15 Dec 2022 08:58:42 +0300 Subject: [PATCH 22/40] add timeout --- assistant_dists/dream/pipeline_conf.json | 1 + 1 file changed, 1 insertion(+) diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index cd92859ac0..5e7b9ce30c 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ 
b/assistant_dists/dream/pipeline_conf.json @@ -114,6 +114,7 @@ "property_extraction": { "connector": { "protocol": "http", + "timeout": 1, "url": "http://property-extraction:8129/respond" }, "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_last_bot_dialog", From 9c98d35d5575724571ccc5a9cc6406c6d291a859 Mon Sep 17 00:00:00 2001 From: dmitry Date: Mon, 26 Dec 2022 21:21:18 +0300 Subject: [PATCH 23/40] add property extraction to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9857255393..b381155fc8 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,7 @@ Dream Architecture is presented in the following image: | NER | 2.2 GiB RAM, 5 GiB GPU | extracts person names, names of locations, organizations from uncased text | | News API annotator | 80 MiB RAM | extracts the latest news about entities or topics using the GNews API. DeepPavlov Dream deployments utilize our own API key. | | Personality Catcher | 30 MiB RAM | | +| Property Extraction | 6.3 GiB RAM | extracts user attributes from utterances | | Rake keywords | 40 MiB RAM | extracts keywords from utterances with the help of RAKE algorithm | | Relative Persona Extractor | 50 MiB RAM | Annotator utilizing Sentence Ranker to rank persona sentences and selecting `N_SENTENCES_OT_RETURN` the most relevant sentences | | Sentrewrite | 200 MiB RAM | rewrites user's utterances by replacing pronouns with specific names that provide more useful information to downstream components | From c7785593a9d2f08c59a24238dc1b388b0d8a7545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 29 Dec 2022 17:52:32 +0300 Subject: [PATCH 24/40] update --- .../property_extraction/requirements.txt | 3 +- annotators/property_extraction/server.py | 35 ++++++++++++++++--- .../test_property_extraction.py | 2 +- state_formatters/dp_formatters.py | 5 ++- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/annotators/property_extraction/requirements.txt b/annotators/property_extraction/requirements.txt index 710183f05e..f606ea620f 100644 --- a/annotators/property_extraction/requirements.txt +++ b/annotators/property_extraction/requirements.txt @@ -8,6 +8,7 @@ requests==2.27.1 jinja2<=3.0.3 Werkzeug<=2.0.3 sentry-sdk==0.12.3 -spacy==3.2.0 +spacy==2.2.3 +https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 torch==1.7.1 transformers==4.10.1 diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index b4d98bb515..4d3efbc195 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -5,6 +5,7 @@ import nltk import sentry_sdk +import spacy from flask import Flask, jsonify, request from deeppavlov import build_model @@ -16,6 +17,7 @@ app = Flask(__name__) stemmer = nltk.PorterStemmer() +nlp = spacy.load("en_core_web_sm") config_name = os.getenv("CONFIG") rel_cls_flag = int(os.getenv("REL_CLS_FLAG", "0")) @@ -52,10 +54,35 @@ def check_triplet(triplet): def get_result(request): st_time = time.time() - uttrs = request.json.get("utterances", []) - named_entities_batch = request.json.get("named_entities", [[] for _ in uttrs]) - entities_with_labels_batch = request.json.get("entities_with_labels", [[] for _ in uttrs]) - entity_info_batch = request.json.get("entity_info", [[] for _ in uttrs]) + init_uttrs = 
request.json.get("utterances", []) + named_entities_batch = request.json.get("named_entities", [[] for _ in init_uttrs]) + entities_with_labels_batch = request.json.get("entities_with_labels", [[] for _ in init_uttrs]) + entity_info_batch = request.json.get("entity_info", [[] for _ in init_uttrs]) + uttrs = [] + for uttr_list in init_uttrs: + if len(uttr_list) == 1: + uttrs.append(uttr_list[0]) + else: + utt_prev = uttr_list[-2].lower() + utt_cur = uttr_list[-1].lower() + is_question = ( + any([utt_prev.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) + or "?" in utt_prev + ) + + is_sentence = False + parsed_sentence = nlp(utt_cur) + if parsed_sentence: + tokens = [elem.text for elem in parsed_sentence] + tags = [elem.tag_ for elem in parsed_sentence] + found_verbs = any([tag in tags for tag in ["VB", "VBZ", "VBP"]]) + if found_verbs and len(tokens) > 2: + is_sentence = True + + if is_question and not is_sentence: + uttrs.append(f"{utt_prev} {utt_cur}") + else: + uttrs.append(utt_cur) triplets_batch = [] outputs, scores = generative_ie(uttrs) diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index 685f07f3bc..73d311534f 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -4,7 +4,7 @@ def main(): url = "http://0.0.0.0:8129/respond" - request_data = [{"utterances": ["i live in moscow"]}] + request_data = [{"utterances": [["i live in moscow"]]}] gold_results = [[{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}]] count = 0 diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 729d3f05f8..f7518b1102 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -212,12 +212,15 @@ def entity_detection_formatter_dialog(dialog: Dict) -> List[Dict]: def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: + dialog = utils.get_last_n_turns(dialog, bot_last_turns=1) + dialog = utils.replace_with_annotated_utterances(dialog, mode="punct_sent") + dialog_history = [uttr["text"] for uttr in dialog["utterances"][-2:]] entities_with_labels = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=True) entity_info_list = dialog["human_utterances"][-1]["annotations"].get("entity_linking", [{}]) named_entities = dialog["human_utterances"][-1]["annotations"].get("ner", [{}]) return [ { - "utterances": [dialog["human_utterances"][-1]["text"]], + "utterances": [dialog_history], "entities_with_labels": [entities_with_labels], "named_entities": [named_entities], "entity_info": [entity_info_list], From 8b7b92f69e0ac7f107f502327bbf9f009148c914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 29 Dec 2022 18:04:28 +0300 Subject: [PATCH 25/40] codestyle --- annotators/property_extraction/server.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 4d3efbc195..7b1b4e208d 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -66,8 +66,7 @@ def get_result(request): utt_prev = uttr_list[-2].lower() utt_cur = uttr_list[-1].lower() is_question = ( - any([utt_prev.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) - or "?" 
in utt_prev + any([utt_prev.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) or "?" in utt_prev ) is_sentence = False From da6efd252e6cd09c7cffa16dbf9a5f3a40e0b2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Fri, 30 Dec 2022 10:02:31 +0300 Subject: [PATCH 26/40] fix state formatter --- state_formatters/dp_formatters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index f7518b1102..9031fc13af 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -229,9 +229,13 @@ def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]: def property_extraction_formatter_last_bot_dialog(dialog: Dict) -> List[Dict]: + if dialog["bot_utterances"]: + dialog_history = [dialog["bot_utterances"][-1]["text"]] + else: + dialog_history = [""] return [ { - "utterances": [dialog["bot_utterances"][-1]["text"]], + "utterances": [dialog_history], } ] From 02c4dd2841c6d4f77c42c7387b959b419c0b3f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Sun, 15 Jan 2023 17:58:09 +0300 Subject: [PATCH 27/40] update requirements --- annotators/kbqa/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotators/kbqa/requirements.txt b/annotators/kbqa/requirements.txt index 7ad4134d6f..52fc498643 100644 --- a/annotators/kbqa/requirements.txt +++ b/annotators/kbqa/requirements.txt @@ -1,4 +1,4 @@ -pyopenssl==22.0.0 +pyopenssl==22.1.0 sentry-sdk[flask]==0.14.1 flask==1.1.1 itsdangerous==2.0.1 From d697c5d173694acc466eca2cf104baed3123df93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Sun, 15 Jan 2023 18:39:27 +0300 Subject: [PATCH 28/40] fix requirements --- annotators/entity_detection/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotators/entity_detection/requirements.txt b/annotators/entity_detection/requirements.txt index 57c084b17a..b46d6f191d 100644 --- a/annotators/entity_detection/requirements.txt +++ b/annotators/entity_detection/requirements.txt @@ -1,4 +1,4 @@ -pyopenssl==22.0.0 +pyopenssl==22.1.0 Flask==1.1.1 itsdangerous==2.0.1 nltk==3.4.5 From def0c71b72f319c9f29f632b18b68be9e9e87497 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Sun, 15 Jan 2023 18:59:58 +0300 Subject: [PATCH 29/40] fix requirements --- annotators/entity_detection/Dockerfile | 1 - annotators/entity_detection/requirements.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/annotators/entity_detection/Dockerfile b/annotators/entity_detection/Dockerfile index 8e688bc7e3..b1051b84d9 100644 --- a/annotators/entity_detection/Dockerfile +++ b/annotators/entity_detection/Dockerfile @@ -1,5 +1,4 @@ FROM deeppavlov/base-gpu:0.12.1 -RUN pip install --upgrade pip && pip install git+https://github.com/deeppavlov/DeepPavlov.git@0.12.1 RUN apt-get update && apt-get install git -y diff --git a/annotators/entity_detection/requirements.txt b/annotators/entity_detection/requirements.txt index b46d6f191d..e0b14b7aa1 100644 --- a/annotators/entity_detection/requirements.txt +++ b/annotators/entity_detection/requirements.txt @@ -11,5 +11,5 @@ 
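The dialog-history handling added to server.py in patch 24 above can be summarized with the following condensed sketch. The function name merge_turns and the sample turns are illustrative, and the behavior assumes the same en_core_web_sm model the annotator installs.

import spacy

nlp = spacy.load("en_core_web_sm")

def merge_turns(utt_prev: str, utt_cur: str) -> str:
    utt_prev, utt_cur = utt_prev.lower(), utt_cur.lower()
    is_question = utt_prev.startswith(("what ", "who ", "when ", "where ")) or "?" in utt_prev
    tags = [token.tag_ for token in nlp(utt_cur)]
    is_sentence = any(tag in tags for tag in ["VB", "VBZ", "VBP"]) and len(tags) > 2
    # a short, non-sentential answer is merged with the preceding question before extraction
    return f"{utt_prev} {utt_cur}" if is_question and not is_sentence else utt_cur

# merge_turns("what is your favorite food?", "pizza") -> "what is your favorite food? pizza"
# merge_turns("what is your favorite food?", "i like pizza very much") -> "i like pizza very much"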
sentry-sdk==0.12.3 torch==1.6.0 transformers==4.6.0 pydantic==1.3 -deeppavlov==0.17.3 +deeppavlov==0.17.6 spacy==2.2.3 From 401df87fd41b8603ddf91e8107d289f762c1dd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Sun, 15 Jan 2023 19:33:06 +0300 Subject: [PATCH 30/40] update el requirements --- annotators/entity_linking/Dockerfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/annotators/entity_linking/Dockerfile b/annotators/entity_linking/Dockerfile index 27ae58a462..6289834e63 100644 --- a/annotators/entity_linking/Dockerfile +++ b/annotators/entity_linking/Dockerfile @@ -6,12 +6,6 @@ RUN apt-key del 7fa2af80 && \ -o cuda-keyring_1.0-1_all.deb && \ dpkg -i cuda-keyring_1.0-1_all.deb RUN apt-get -y update -RUN apt-get install -y build-essential zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget llvm \ - libncurses5-dev libncursesw5-dev xz-utils libffi-dev liblzma-dev - -RUN apt-get -y update && \ - apt-get install -y software-properties-common && \ - apt-get update && apt-get install git -y RUN apt-get install -y sqlite3 From 6a85675696469ee8dee4c74e2f9e1e9c03274cc5 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Thu, 19 Jan 2023 12:41:29 +0800 Subject: [PATCH 31/40] fix: revert entity detection --- annotators/entity_detection/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/annotators/entity_detection/Dockerfile b/annotators/entity_detection/Dockerfile index 960488ac76..1326d61a1c 100644 --- a/annotators/entity_detection/Dockerfile +++ b/annotators/entity_detection/Dockerfile @@ -1,4 +1,5 @@ FROM deeppavlov/base-gpu:0.12.1 +RUN pip install --upgrade pip && pip install git+https://github.com/deeppavlov/DeepPavlov.git@0.12.1 RUN apt-get update && apt-get install git -y From b3caa3414b930314a403a3c5461aba4e58d0c3d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Mon, 30 Jan 2023 21:31:54 +0300 Subject: [PATCH 32/40] sentence rewrite --- annotators/property_extraction/server.py | 59 +++++- .../src/sentence_answer.py | 181 ++++++++++++++++++ .../src/torch_transformers_preprocessor.py | 4 +- 3 files changed, 233 insertions(+), 11 deletions(-) create mode 100644 annotators/property_extraction/src/sentence_answer.py diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index 7b1b4e208d..cd47325a6e 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -1,3 +1,4 @@ +import copy import logging import os import re @@ -9,6 +10,7 @@ from flask import Flask, jsonify, request from deeppavlov import build_model +from src.sentence_answer import sentence_answer sentry_sdk.init(os.getenv("SENTRY_DSN")) @@ -52,21 +54,49 @@ def check_triplet(triplet): raise e +def sentrewrite(sentence, answer): + answer = answer.strip(".") + if any([sentence.startswith(elem) for elem in ["what's", "what is"]]): + for old_tok, new_tok in [ + ("what's your", f"{answer} is my"), + ("what is your", f"{answer} is my"), + ("what is", "{answer} is"), + ("what's", "{answer} is"), + ]: + sentence = sentence.replace(old_tok, new_tok) + elif any([sentence.startswith(elem) for elem in ["where", "when"]]): + sentence = sentence_answer(sentence, answer) + elif any([sentence.startswith(elem) for elem in ["is there"]]): + for old_tok, new_tok in [("is there any", f"{answer} is"), ("is there", f"{answer} is")]: + sentence = 
sentence.replace(old_tok, new_tok) + return sentence + + def get_result(request): st_time = time.time() init_uttrs = request.json.get("utterances", []) + init_uttrs_cased = request.json.get("utterances_init", []) + if not init_uttrs_cased: + init_uttrs_cased = copy.deepcopy(init_uttrs) named_entities_batch = request.json.get("named_entities", [[] for _ in init_uttrs]) entities_with_labels_batch = request.json.get("entities_with_labels", [[] for _ in init_uttrs]) entity_info_batch = request.json.get("entity_info", [[] for _ in init_uttrs]) - uttrs = [] - for uttr_list in init_uttrs: + logger.info(f"init_uttrs {init_uttrs}") + uttrs, uttrs_cased = [], [] + for uttr_list, uttr_list_cased in zip(init_uttrs, init_uttrs_cased): if len(uttr_list) == 1: uttrs.append(uttr_list[0]) + uttrs_cased.append(uttr_list[0]) else: - utt_prev = uttr_list[-2].lower() - utt_cur = uttr_list[-1].lower() + utt_prev = uttr_list_cased[-2] + utt_prev_sentences = nltk.sent_tokenize(utt_prev) + utt_prev = utt_prev_sentences[-1] + utt_cur = uttr_list_cased[-1] + utt_prev_l = utt_prev.lower() + utt_cur_l = utt_cur.lower() is_question = ( - any([utt_prev.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) or "?" in utt_prev + any([utt_prev_l.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) + or "?" in utt_prev_l ) is_sentence = False @@ -74,24 +104,35 @@ def get_result(request): if parsed_sentence: tokens = [elem.text for elem in parsed_sentence] tags = [elem.tag_ for elem in parsed_sentence] - found_verbs = any([tag in tags for tag in ["VB", "VBZ", "VBP"]]) + found_verbs = any([tag in tags for tag in ["VB", "VBZ", "VBP", "VBD"]]) if found_verbs and len(tokens) > 2: is_sentence = True + logger.info(f"is_question: {is_question} --- is_sentence: {is_sentence}") if is_question and not is_sentence: - uttrs.append(f"{utt_prev} {utt_cur}") + if len(utt_cur_l.split()) <= 2: + uttrs.append(sentrewrite(utt_prev_l, utt_cur_l)) + uttrs_cased.append(sentrewrite(utt_prev, utt_cur)) + else: + uttrs.append(f"{utt_prev_l} {utt_cur_l}") + uttrs_cased.append(f"{utt_prev} {utt_cur}") else: - uttrs.append(utt_cur) + uttrs.append(utt_cur_l) + uttrs_cased.append(utt_cur) + logger.info(f"input utterances: {uttrs}") triplets_batch = [] outputs, scores = generative_ie(uttrs) - for output in outputs: + for output, uttr in zip(outputs, uttrs_cased): triplet = "" fnd = re.findall(r" (.*?) (.*?) (.*)", output) if fnd: triplet = list(fnd[0]) if triplet[0] == "i": triplet[0] = "user" + obj = triplet[2] + if obj.islower() and obj.capitalize() in uttr: + triplet[2] = obj.capitalize() triplets_batch.append(triplet) logger.info(f"outputs {outputs} scores {scores} triplets_batch {triplets_batch}") if rel_cls_flag: diff --git a/annotators/property_extraction/src/sentence_answer.py b/annotators/property_extraction/src/sentence_answer.py new file mode 100644 index 0000000000..1fcad7b2a7 --- /dev/null +++ b/annotators/property_extraction/src/sentence_answer.py @@ -0,0 +1,181 @@ +import importlib +import re +from logging import getLogger + +import pkg_resources +import spacy + +log = getLogger(__name__) + +# en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests. +# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized with interpreter, +# sot it doesn't see en_core_web_sm installed after interpreter initialization, so we use importlib.reload below. 
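+# Roughly, sentence_answer(question, entity_title) rewrites a wh-question together with
+# an answer entity into a declarative sentence, e.g. "who wrote hamlet?" + "shakespeare"
+# is expected to become "shakespeare wrote hamlet." (illustrative: the exact output
+# depends on the spaCy parse of the question).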
+ +if "en-core-web-sm" not in pkg_resources.working_set.by_key.keys(): + importlib.reload(pkg_resources) + +# TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_infer and revise en_core_web_sm requirement, +# TODO: make proper downloading with spacy.cli.download +nlp = spacy.load("en_core_web_sm") + +pronouns = ["who", "what", "when", "where", "how"] + + +def find_tokens(tokens, node, not_inc_node): + if node != not_inc_node: + tokens.append(node.text) + for elem in node.children: + tokens = find_tokens(tokens, elem, not_inc_node) + return tokens + + +def find_inflect_dict(sent_nodes): + inflect_dict = {} + for node in sent_nodes: + if node.dep_ == "aux" and node.tag_ == "VBD" and (node.head.tag_ == "VBP" or node.head.tag_ == "VB"): + new_verb = node.head._.inflect("VBD") + inflect_dict[node.head.text] = new_verb + inflect_dict[node.text] = "" + if node.dep_ == "aux" and node.tag_ == "VBZ" and node.head.tag_ == "VB": + new_verb = node.head._.inflect("VBZ") + inflect_dict[node.head.text] = new_verb + inflect_dict[node.text] = "" + return inflect_dict + + +def find_wh_node(sent_nodes): + wh_node = "" + main_head = "" + wh_node_head = "" + for node in sent_nodes: + if node.text.lower() in pronouns: + wh_node = node + break + + if wh_node: + wh_node_head = wh_node.head + if wh_node_head.dep_ == "ccomp": + main_head = wh_node_head.head + + return wh_node, wh_node_head, main_head + + +def find_tokens_to_replace(wh_node_head, main_head, question_tokens, question): + redundant_tokens_to_replace = [] + question_tokens_to_replace = [] + + if main_head: + redundant_tokens_to_replace = find_tokens([], main_head, wh_node_head) + what_tokens_fnd = re.findall("what (.*) (is|was|does|did) (.*)", question, re.IGNORECASE) + if what_tokens_fnd: + what_tokens = what_tokens_fnd[0][0].split() + if len(what_tokens) <= 2: + redundant_tokens_to_replace += what_tokens + + wh_node_head_desc = [] + if wh_node_head: + wh_node_head_desc = [node for node in wh_node_head.children if node.text != "?"] + wh_node_head_dep = [ + node.dep_ + for node in wh_node_head.children + if (node.text != "?" 
and node.dep_ not in ["aux", "prep"] and node.text.lower() not in pronouns) + ] + for node in wh_node_head_desc: + if node.dep_ == "nsubj" and len(wh_node_head_dep) > 1 or node.text.lower() in pronouns or node.dep_ == "aux": + question_tokens_to_replace.append(node.text) + for elem in node.subtree: + question_tokens_to_replace.append(elem.text) + + question_tokens_to_replace = list(set(question_tokens_to_replace)) + + redundant_replace_substr = [] + for token in question_tokens: + if token in redundant_tokens_to_replace: + redundant_replace_substr.append(token) + else: + if redundant_replace_substr: + break + + redundant_replace_substr = " ".join(redundant_replace_substr) + + question_replace_substr = [] + + for token in question_tokens: + if token in question_tokens_to_replace: + question_replace_substr.append(token) + else: + if question_replace_substr: + break + + question_replace_substr = " ".join(question_replace_substr) + + return redundant_replace_substr, question_replace_substr + + +def sentence_answer(question, entity_title, entities=None, template_answer=None): + log.debug(f"question {question} entity_title {entity_title} entities {entities} template_answer {template_answer}") + sent_nodes = nlp(question) + reverse = False + if sent_nodes[-2].tag_ == "IN": + reverse = True + question_tokens = [elem.text for elem in sent_nodes] + log.debug(f"spacy tags: {[(elem.text, elem.tag_, elem.dep_, elem.head.text) for elem in sent_nodes]}") + + inflect_dict = find_inflect_dict(sent_nodes) + wh_node, wh_node_head, main_head = find_wh_node(sent_nodes) + redundant_replace_substr, question_replace_substr = find_tokens_to_replace( + wh_node_head, main_head, question_tokens, question + ) + log.debug(f"redundant_replace_substr {redundant_replace_substr} question_replace_substr {question_replace_substr}") + if redundant_replace_substr: + answer = question.replace(redundant_replace_substr, "") + else: + answer = question + + if answer.endswith("?"): + answer = answer.replace("?", "").strip() + + if question_replace_substr: + if template_answer and entities: + answer = template_answer.replace("[ent]", entities[0]).replace("[ans]", entity_title) + elif wh_node.text.lower() in ["what", "who", "how"]: + fnd_date = re.findall(rf"what (day|year) (.*)\?", question, re.IGNORECASE) + fnd_wh = re.findall(r"what (is|was) the name of (.*) (which|that) (.*)\?", question, re.IGNORECASE) + fnd_name = re.findall(r"what (is|was) the name (.*)\?", question, re.IGNORECASE) + if fnd_date: + fnd_date_aux = re.findall(rf"what (day|year) (is|was) ({entities[0]}) (.*)\?", question, re.IGNORECASE) + if fnd_date_aux: + answer = f"{entities[0]} {fnd_date_aux[0][1]} {fnd_date_aux[0][3]} on {entity_title}" + else: + answer = f"{fnd_date[0][1]} on {entity_title}" + elif fnd_wh: + answer = f"{entity_title} {fnd_wh[0][3]}" + elif fnd_name: + aux_verb, sent_cut = fnd_name[0] + if sent_cut.startswith("of "): + sent_cut = sent_cut[3:] + answer = f"{entity_title} {aux_verb} {sent_cut}" + else: + if reverse: + answer = answer.replace(question_replace_substr, "") + answer = f"{answer} {entity_title}" + else: + answer = answer.replace(question_replace_substr, entity_title) + elif wh_node.text.lower() in ["when", "where"] and entities: + sent_cut = re.findall(rf"(when|where) (was|is) {entities[0]} (.*)\?", question, re.IGNORECASE) + if sent_cut: + if sent_cut[0][0].lower() == "when": + answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} on {entity_title}" + else: + answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} in 
{entity_title}" + else: + answer = answer.replace(question_replace_substr, "") + answer = f"{answer} in {entity_title}" + + for old_tok, new_tok in inflect_dict.items(): + answer = answer.replace(old_tok, new_tok) + answer = re.sub(r"\s+", " ", answer).strip() + + answer = answer + "." + + return answer diff --git a/annotators/property_extraction/src/torch_transformers_preprocessor.py b/annotators/property_extraction/src/torch_transformers_preprocessor.py index 506b28a20b..804a56e29a 100644 --- a/annotators/property_extraction/src/torch_transformers_preprocessor.py +++ b/annotators/property_extraction/src/torch_transformers_preprocessor.py @@ -57,7 +57,7 @@ def __call__(self, uttr_batch: List[str], targets_batch: List[str] = None): lengths.append(len(input_ids)) max_length = min(max(lengths), self.max_seq_length) for i in range(len(input_ids_batch)): - for j in range(max_length - len(input_ids_batch[i])): + for _ in range(max_length - len(input_ids_batch[i])): input_ids_batch[i].append(0) attention_mask_batch[i].append(0) @@ -73,7 +73,7 @@ def __call__(self, uttr_batch: List[str], targets_batch: List[str] = None): lengths.append(len(input_ids)) max_length = max(lengths) for i in range(len(target_ids_batch)): - for j in range(max_length - len(target_ids_batch[i])): + for _ in range(max_length - len(target_ids_batch[i])): target_ids_batch[i].append(0) return input_ids_batch, attention_mask_batch, target_ids_batch From 54d61a052a9248023090ecca2eec5d0628785774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 2 Feb 2023 13:56:06 +0300 Subject: [PATCH 33/40] update --- annotators/property_extraction/server.py | 12 +++-- .../src/sentence_answer.py | 6 +-- .../t5_generative_ie_lite_infer.json | 49 +++++++++++++++++++ .../dream/docker-compose.override.yml | 2 +- 4 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 annotators/property_extraction/t5_generative_ie_lite_infer.json diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py index cd47325a6e..274e019fbc 100644 --- a/annotators/property_extraction/server.py +++ b/annotators/property_extraction/server.py @@ -54,8 +54,8 @@ def check_triplet(triplet): raise e -def sentrewrite(sentence, answer): - answer = answer.strip(".") +def sentrewrite(sentence, init_answer): + answer = init_answer.strip(".") if any([sentence.startswith(elem) for elem in ["what's", "what is"]]): for old_tok, new_tok in [ ("what's your", f"{answer} is my"), @@ -69,6 +69,8 @@ def sentrewrite(sentence, answer): elif any([sentence.startswith(elem) for elem in ["is there"]]): for old_tok, new_tok in [("is there any", f"{answer} is"), ("is there", f"{answer} is")]: sentence = sentence.replace(old_tok, new_tok) + else: + sentence = f"{sentence} {init_answer}" return sentence @@ -94,7 +96,7 @@ def get_result(request): utt_cur = uttr_list_cased[-1] utt_prev_l = utt_prev.lower() utt_cur_l = utt_cur.lower() - is_question = ( + is_q = ( any([utt_prev_l.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) or "?" 
in utt_prev_l ) @@ -108,8 +110,8 @@ def get_result(request): if found_verbs and len(tokens) > 2: is_sentence = True - logger.info(f"is_question: {is_question} --- is_sentence: {is_sentence}") - if is_question and not is_sentence: + logger.info(f"is_q: {is_q} --- is_s: {is_sentence} --- utt_prev: {utt_prev_l} --- utt_cur: {utt_cur_l}") + if is_q and not is_sentence: if len(utt_cur_l.split()) <= 2: uttrs.append(sentrewrite(utt_prev_l, utt_cur_l)) uttrs_cased.append(sentrewrite(utt_prev, utt_cur)) diff --git a/annotators/property_extraction/src/sentence_answer.py b/annotators/property_extraction/src/sentence_answer.py index 1fcad7b2a7..44490272a1 100644 --- a/annotators/property_extraction/src/sentence_answer.py +++ b/annotators/property_extraction/src/sentence_answer.py @@ -33,12 +33,8 @@ def find_inflect_dict(sent_nodes): inflect_dict = {} for node in sent_nodes: if node.dep_ == "aux" and node.tag_ == "VBD" and (node.head.tag_ == "VBP" or node.head.tag_ == "VB"): - new_verb = node.head._.inflect("VBD") - inflect_dict[node.head.text] = new_verb inflect_dict[node.text] = "" if node.dep_ == "aux" and node.tag_ == "VBZ" and node.head.tag_ == "VB": - new_verb = node.head._.inflect("VBZ") - inflect_dict[node.head.text] = new_verb inflect_dict[node.text] = "" return inflect_dict @@ -139,7 +135,7 @@ def sentence_answer(question, entity_title, entities=None, template_answer=None) if template_answer and entities: answer = template_answer.replace("[ent]", entities[0]).replace("[ans]", entity_title) elif wh_node.text.lower() in ["what", "who", "how"]: - fnd_date = re.findall(rf"what (day|year) (.*)\?", question, re.IGNORECASE) + fnd_date = re.findall(r"what (day|year) (.*)\?", question, re.IGNORECASE) fnd_wh = re.findall(r"what (is|was) the name of (.*) (which|that) (.*)\?", question, re.IGNORECASE) fnd_name = re.findall(r"what (is|was) the name (.*)\?", question, re.IGNORECASE) if fnd_date: diff --git a/annotators/property_extraction/t5_generative_ie_lite_infer.json b/annotators/property_extraction/t5_generative_ie_lite_infer.json new file mode 100644 index 0000000000..43540361b3 --- /dev/null +++ b/annotators/property_extraction/t5_generative_ie_lite_infer.json @@ -0,0 +1,49 @@ +{ + "chainer": { + "in": ["question"], + "pipe": [ + { + "class_name": "src.torch_transformers_preprocessor:T5GenerativeIEPreprocessor", + "vocab_file": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "max_seq_length": 512, + "in": ["question"], + "out": ["input_ids", "attention_mask"] + }, + { + "class_name": "src.t5_generative_ie:T5GenerativeIE", + "pretrained_transformer": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 3e-05, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "learning_rate_drop_patience": 6, + "learning_rate_drop_div": 1.5, + "in": ["input_ids", "attention_mask"], + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + "metadata": { + "variables": { + "TRANSFORMER": "t5-small", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/t5_small_generative_ie" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/tmp/t5_small_generative_ie.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 
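Assuming the sentrewrite helper as defined in the patch 33 server.py changes above is in scope, the replacement table yields deterministic rewrites; the two calls below are illustrative inputs, not tests from the repository.

assert sentrewrite("what is your favorite food?", "pizza") == "pizza is my favorite food?"
# patch 33 adds a fallback that simply concatenates question and short answer:
assert sentrewrite("do you have pets?", "a dog") == "do you have pets? a dog"
# note: the bare "what is" / "what's" entries in the table are plain strings, not
# f-strings, so inputs matching only those entries would get the literal text "{answer}".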
3ef800dd9a..6a31b290c4 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -1306,7 +1306,7 @@ services: env_file: [.env] build: args: - CONFIG: t5_generative_ie_infer.json + CONFIG: t5_generative_ie_lite_infer.json PORT: 8129 SRC_DIR: annotators/property_extraction/ context: ./ From dbc1604220f212667d4bfab9a0f448dfb01fb701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Fri, 3 Feb 2023 10:49:37 +0300 Subject: [PATCH 34/40] fix typo --- assistant_dists/dream/docker-compose.override.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 7fbdfbeef9..9de20a9dbf 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -1085,9 +1085,6 @@ services: context: ./ dockerfile: annotators/entity_detection/Dockerfile command: flask run -h 0.0.0.0 -p 8103 - environment:black --line-length=120 --check annotators/entity_detection - проверка - -black --line-length=120 annotators/entity_detection - форматирование - FLASK_APP=server - CUDA_VISIBLE_DEVICES=0 deploy: From 05fe4af58d501f35a04315e0f579cdaa396c2555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Fri, 3 Feb 2023 12:47:10 +0300 Subject: [PATCH 35/40] fix requirements --- annotators/spacy_nounphrases/Dockerfile | 2 +- annotators/spacy_nounphrases/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/annotators/spacy_nounphrases/Dockerfile b/annotators/spacy_nounphrases/Dockerfile index daa7bc3643..9e6b92f493 100644 --- a/annotators/spacy_nounphrases/Dockerfile +++ b/annotators/spacy_nounphrases/Dockerfile @@ -4,7 +4,7 @@ RUN mkdir /src COPY ./annotators/spacy_nounphrases/requirements.txt /src/requirements.txt RUN pip install -r /src/requirements.txt -RUN pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz +RUN pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl COPY ./annotators/spacy_nounphrases/ /src/ COPY ./common/ /src/common/ diff --git a/annotators/spacy_nounphrases/requirements.txt b/annotators/spacy_nounphrases/requirements.txt index c7720ef3c7..b7fac0c490 100644 --- a/annotators/spacy_nounphrases/requirements.txt +++ b/annotators/spacy_nounphrases/requirements.txt @@ -3,6 +3,6 @@ itsdangerous==2.0.1 gunicorn==20.0.4 sentry-sdk==0.13.4 requests==2.22.0 -spacy==2.2.0 +spacy==3.2.3 jinja2<=3.0.3 Werkzeug<=2.0.3 From 896b1538fcff0822d7bfa153fb3c852d55a05529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Wed, 15 Feb 2023 22:15:07 +0300 Subject: [PATCH 36/40] fix typo --- .../entity_linking/src/entity_linking.py | 548 ------------------ 1 file changed, 548 deletions(-) diff --git a/annotators/entity_linking/src/entity_linking.py b/annotators/entity_linking/src/entity_linking.py index e8ea408d79..89da9f00e9 100644 --- a/annotators/entity_linking/src/entity_linking.py +++ b/annotators/entity_linking/src/entity_linking.py @@ -62,69 +62,6 @@ def __init__( ) -> None: """ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under 
the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import sqlite3 -import time -from logging import getLogger -from typing import List, Dict, Tuple -from collections import defaultdict - -import nltk -from nltk.corpus import stopwords -from rapidfuzz import fuzz - -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component -from deeppavlov.core.models.serializable import Serializable -from deeppavlov.core.commands.utils import expand_path -from src.find_word import WordSearcher - -log = getLogger(__name__) -nltk.download("stopwords") - - -@register("entity_linker") -class EntityLinker(Component, Serializable): - """ - Class for linking of entity substrings in the document to entities in Wikidata - """ - - def __init__( - self, - load_path: str, - tags_filename: str, - add_info_filename: str, - words_dict_filename: str = None, - ngrams_matrix_filename: str = None, - entity_ranker=None, - num_entities_for_bert_ranking: int = 50, - num_entities_to_return: int = 10, - max_text_len: int = 300, - max_paragraph_len: int = 150, - lang: str = "ru", - use_descriptions: bool = True, - use_tags: bool = False, - use_related_tags: bool = False, - lemmatize: bool = False, - full_paragraph: bool = False, - use_connections: bool = False, - **kwargs, - ) -> None: - """ Args: load_path: path to folder with inverted index files entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert @@ -610,488 +547,3 @@ def rank_by_description( entity_ids_list.append(top_entities[: self.num_entities_to_return]) conf_list.append(top_conf[: self.num_entities_to_return]) return entity_ids_list, conf_list - Args: - load_path: path to folder with inverted index files - entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert - num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context - ngram_range: char ngrams range for TfidfVectorizer - num_entities_to_return: number of candidate entities for the substring which are returned - lang: russian or english - use_description: whether to perform entity ranking by context and description - lemmatize: whether to lemmatize tokens - **kwargs: - """ - super().__init__(save_path=None, load_path=load_path) - self.lemmatize = lemmatize - self.tags_filename = tags_filename - self.add_info_filename = add_info_filename - self.words_dict_filename = words_dict_filename - self.ngrams_matrix_filename = ngrams_matrix_filename - self.num_entities_for_bert_ranking = num_entities_for_bert_ranking - self.entity_ranker = entity_ranker - self.num_entities_to_return = num_entities_to_return - self.max_text_len = max_text_len - self.max_paragraph_len = max_paragraph_len - self.lang = f"@{lang}" - if self.lang == "@en": - self.stopwords = set(stopwords.words("english")) - elif self.lang == "@ru": - self.stopwords = set(stopwords.words("russian")) - self.use_descriptions = use_descriptions - self.use_connections = use_connections - self.use_tags = use_tags - self.use_related_tags = use_related_tags - 
self.full_paragraph = full_paragraph - self.re_tokenizer = re.compile(r"[\w']+|[^\w ]") - self.not_found_str = "not in wiki" - self.related_tags = { - "loc": ["gpe", "country", "city", "us_state", "river", "county"], - "product": ["work_of_art"], - "org": ["fac", "business", "norp"], - "per": ["actor", "athlete", "musician", "politician", "writer"], - "event": ["championship", "sports_season", "sports_event"], - "film": ["work_of_art", "road"], - "misc": ["animal", "language", "law", "food", "nation"], - "sport_team": ["association_football_club", "sports_league", "national_sports_team"], - } - self.word_searcher = None - if self.words_dict_filename: - self.word_searcher = WordSearcher(self.words_dict_filename, self.ngrams_matrix_filename) - self.load() - - def load(self) -> None: - with open(str(expand_path(self.tags_filename)), "r") as fl: - lines = fl.readlines() - tags = [] - for line in lines: - tag_str = line.strip().split()[:-1] - tags.append("_".join(tag_str)) - if "O" in tags: - tags.remove("O") - self.cursors = {} - for tag in tags: - conn = sqlite3.connect(f"{self.load_path}/{tag.lower()}.db", check_same_thread=False) - cur = conn.cursor() - self.cursors[tag.lower()] = cur - conn = sqlite3.connect(str(expand_path(self.add_info_filename)), check_same_thread=False) - self.add_info_cur = conn.cursor() - - def save(self) -> None: - pass - - def __call__( - self, - entity_substr_batch: List[List[str]], - entity_tags_batch: List[List[str]] = None, - sentences_batch: List[List[str]] = None, - entity_offsets_batch: List[List[List[int]]] = None, - sentences_offsets_batch: List[List[Tuple[int, int]]] = None, - ): - if sentences_offsets_batch is None and sentences_batch is not None: - sentences_offsets_batch = [] - for sentences_list in sentences_batch: - sentences_offsets_list = [] - start = 0 - for sentence in sentences_list: - end = start + len(sentence) - sentences_offsets_list.append([start, end]) - start = end + 1 - sentences_offsets_batch.append(sentences_offsets_list) - - if sentences_batch is None: - sentences_batch = [[] for _ in entity_substr_batch] - sentences_offsets_batch = [[] for _ in entity_substr_batch] - - log.info(f"sentences_batch {sentences_batch}") - if entity_offsets_batch is None and sentences_batch is not None: - entity_offsets_batch = [] - for entity_substr_list, sentences_list in zip(entity_substr_batch, sentences_batch): - text = " ".join(sentences_list).lower() - log.info(f"text {text}") - entity_offsets_list = [] - for entity_substr in entity_substr_list: - st_offset = text.find(entity_substr.lower()) - end_offset = st_offset + len(entity_substr) - entity_offsets_list.append([st_offset, end_offset]) - entity_offsets_batch.append(entity_offsets_list) - - entity_ids_batch, entity_conf_batch, entity_pages_batch = [], [], [] - for entity_substr_list, entity_offsets_list, entity_tags_list, sentences_list, sentences_offsets_list in zip( - entity_substr_batch, entity_offsets_batch, entity_tags_batch, sentences_batch, sentences_offsets_batch - ): - entity_ids_list, entity_conf_list, entity_pages_list = self.link_entities( - entity_substr_list, - entity_offsets_list, - entity_tags_list, - sentences_list, - sentences_offsets_list, - ) - log.info(f"entity_ids_list {entity_ids_list} entity_conf_list {entity_conf_list}") - if self.num_entities_to_return == 1: - entity_pages_list = [entity_pages[0] for entity_pages in entity_pages_list] - else: - entity_pages_list = [entity_pages[: self.num_entities_to_return] for entity_pages in entity_pages_list] - 
entity_ids_batch.append(entity_ids_list) - entity_conf_batch.append(entity_conf_list) - entity_pages_batch.append(entity_pages_list) - first_par_batch, dbpedia_types_batch = self.extract_add_info(entity_pages_batch) - return entity_ids_batch, entity_conf_batch, entity_pages_batch, first_par_batch, dbpedia_types_batch - - def extract_add_info(self, entity_pages_batch: List[List[List[str]]]): - first_par_batch, dbpedia_types_batch = [], [] - for entity_pages_list in entity_pages_batch: - first_par_list, dbpedia_types_list = [], [] - for entity_pages in entity_pages_list: - first_pars, dbpedia_types = [], [] - for entity_page in entity_pages: - try: - query = "SELECT * FROM entity_additional_info WHERE page_title='{}';".format(entity_page) - res = self.add_info_cur.execute(query) - fetch_res = res.fetchall() - first_par = fetch_res[0][1] - dbpedia_types_elem = fetch_res[0][2].split() - first_pars.append(first_par) - dbpedia_types.append(dbpedia_types_elem) - except Exception as e: - first_pars.append("") - dbpedia_types.append([]) - log.info(f"error {e}") - first_par_list.append(first_pars) - dbpedia_types_list.append(dbpedia_types) - first_par_batch.append(first_par_list) - dbpedia_types_batch.append(dbpedia_types_list) - return first_par_batch, dbpedia_types_batch - - def link_entities( - self, - entity_substr_list: List[str], - entity_offsets_list: List[List[int]], - entity_tags_list: List[str], - sentences_list: List[str], - sentences_offsets_list: List[List[int]], - ) -> List[List[str]]: - log.info( - f"entity_substr_list {entity_substr_list} entity_tags_list {entity_tags_list} " - f"entity_offsets_list {entity_offsets_list}" - ) - entity_ids_list, conf_list, pages_list, pages_dict_list, descr_list = [], [], [], [], [] - if entity_substr_list: - entities_scores_list = [] - cand_ent_scores_list = [] - tm_st = time.time() - for entity_substr, tags in zip(entity_substr_list, entity_tags_list): - for symb_old, symb_new in [("'", "''"), ("-", " "), ("@", ""), (".", ""), (" ", " ")]: - entity_substr = entity_substr.replace(symb_old, symb_new) - cand_ent_init = defaultdict(set) - if len(entity_substr) > 1: - if tags and isinstance(tags[0], str): - tags = [tags] - cand_ent_init = self.find_exact_match(entity_substr, tags) - all_low_conf = True - for entity_id in cand_ent_init: - entity_info_set = cand_ent_init[entity_id] - for entity_info in entity_info_set: - if entity_info[0] == 1.0: - all_low_conf = False - break - if not all_low_conf: - break - clean_tags = [tag for tag, conf in tags] - corr_tags, corr_clean_tags = [], [] - for tag, conf in tags: - if tag in self.related_tags: - corr_tag_list = self.related_tags[tag] - for corr_tag in corr_tag_list: - if corr_tag not in clean_tags and corr_tag not in corr_clean_tags: - corr_tags.append([corr_tag, conf]) - corr_clean_tags.append(corr_tag) - - if (not cand_ent_init or all_low_conf or self.use_related_tags) and corr_tags: - corr_cand_ent_init = self.find_exact_match(entity_substr, corr_tags) - cand_ent_init = {**cand_ent_init, **corr_cand_ent_init} - entity_substr_split = [ - word for word in entity_substr.split(" ") if word not in self.stopwords and len(word) > 0 - ] - if ( - not cand_ent_init - and len(entity_substr_split) == 1 - and self.word_searcher - and all([letter.isalpha() for letter in entity_substr_split[0]]) - ): - corr_words = self.word_searcher(entity_substr_split[0], set(clean_tags + corr_clean_tags)) - if corr_words: - cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags) - - if not cand_ent_init and 
len(entity_substr_split) > 1: - cand_ent_init = self.find_fuzzy_match(entity_substr_split, tags) - - cand_ent_scores = [] - for entity in cand_ent_init: - entities_scores = list(cand_ent_init[entity]) - entities_scores = sorted(entities_scores, key=lambda x: (x[0], x[3], x[2]), reverse=True) - cand_ent_scores.append(([entity] + list(entities_scores[0]))) - - cand_ent_scores = sorted(cand_ent_scores, key=lambda x: (x[1], x[4], x[3]), reverse=True) - cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking] - cand_ent_scores_list.append(cand_ent_scores) - entity_ids = [elem[0] for elem in cand_ent_scores] - pages = [elem[5] for elem in cand_ent_scores] - scores = [elem[1:5] for elem in cand_ent_scores] - entities_scores_list.append( - {entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)} - ) - entity_ids_list.append(entity_ids) - pages_list.append(pages) - pages_dict_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)}) - descr_list.append([elem[6] for elem in cand_ent_scores]) - log.info(f"get candidate entities time: {time.time() - tm_st}") - - if self.use_descriptions: - tm_st = time.time() - substr_lens = [len(entity_substr.split()) for entity_substr in entity_substr_list] - entity_ids_list, conf_list = self.rank_by_description( - entity_substr_list, - entity_tags_list, - entity_offsets_list, - entity_ids_list, - descr_list, - entities_scores_list, - sentences_list, - sentences_offsets_list, - substr_lens, - ) - pages_list = [ - [pages_dict.get(entity_id, "") for entity_id in entity_ids] - for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list) - ] - log.info(f"get descriptions time: {time.time() - tm_st}") - - return entity_ids_list, conf_list, pages_list - - def process_cand_ent(self, cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf): - for entity_title, entity_id, entity_rels, anchor_cnt, _, page, descr in entities_and_ids: - substr_score = self.calc_substr_score(entity_title, entity_substr_split) - cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, page, descr)) - return cand_ent_init - - def find_exact_match(self, entity_substr, tags): - entity_substr = entity_substr.lower() - entity_substr_split = entity_substr.split() - cand_ent_init = defaultdict(set) - for tag, tag_conf in tags: - if tag.lower() in self.cursors: - query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr) - res = self.cursors[tag.lower()].execute(query) - entities_and_ids = res.fetchall() - if entities_and_ids: - cand_ent_init = self.process_cand_ent( - cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf - ) - if tags and tags[0][0] == "misc" and not cand_ent_init: - for tag in self.cursors: - if tag not in {"actor", "athlete", "musician", "per", "politician", "writer"}: - query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr) - res = self.cursors[tag].execute(query) - entities_and_ids = res.fetchall() - if entities_and_ids: - cand_ent_init = self.process_cand_ent( - cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf - ) - return cand_ent_init - - def find_fuzzy_match(self, entity_substr_split, tags): - entity_substr_split = [word.lower() for word in entity_substr_split] - cand_ent_init = defaultdict(set) - for tag, tag_conf in tags: - if tag.lower() in self.cursors: - for word in entity_substr_split: - query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(word) - res = 
self.cursors[tag.lower()].execute(query) - part_entities_and_ids = res.fetchall() - cand_ent_init = self.process_cand_ent( - cand_ent_init, part_entities_and_ids, entity_substr_split, tag, tag_conf - ) - return cand_ent_init - - def calc_substr_score(self, entity_title, entity_substr_split): - label_tokens = entity_title.split() - cnt = 0.0 - for ent_tok in entity_substr_split: - found = False - for label_tok in label_tokens: - if label_tok == ent_tok: - found = True - break - if found: - cnt += 1.0 - else: - for label_tok in label_tokens: - if label_tok[:2] == ent_tok[:2]: - fuzz_score = fuzz.ratio(label_tok, ent_tok) - if fuzz_score >= 80.0 and not found: - cnt += fuzz_score * 0.01 - break - substr_score = round(cnt / max(len(label_tokens), len(entity_substr_split)), 3) - if len(label_tokens) == 2 and len(entity_substr_split) == 1: - if entity_substr_split[0] == label_tokens[1]: - substr_score = 0.5 - elif entity_substr_split[0] == label_tokens[0]: - substr_score = 0.3 - return substr_score - - def rank_by_description( - self, - entity_substr_list: List[str], - entity_tags_list: List[List[Tuple[str, int]]], - entity_offsets_list: List[List[int]], - cand_ent_list: List[List[str]], - cand_ent_descr_list: List[List[str]], - entities_scores_list: List[Dict[str, Tuple[int, float]]], - sentences_list: List[str], - sentences_offsets_list: List[Tuple[int, int]], - substr_lens: List[int], - ) -> List[List[str]]: - entity_ids_list = [] - conf_list = [] - contexts = [] - for entity_start_offset, entity_end_offset in entity_offsets_list: - sentence = "" - rel_start_offset = 0 - rel_end_offset = 0 - found_sentence_num = 0 - for num, (sent, (sent_start_offset, sent_end_offset)) in enumerate( - zip(sentences_list, sentences_offsets_list) - ): - if entity_start_offset >= sent_start_offset and entity_end_offset <= sent_end_offset: - sentence = sent - found_sentence_num = num - rel_start_offset = entity_start_offset - sent_start_offset - rel_end_offset = entity_end_offset - sent_start_offset - break - context = "" - if sentence: - start_of_sentence = 0 - end_of_sentence = len(sentence) - if len(sentence) > self.max_text_len: - start_of_sentence = max(rel_start_offset - self.max_text_len // 2, 0) - end_of_sentence = min(rel_end_offset + self.max_text_len // 2, len(sentence)) - text_before = sentence[start_of_sentence:rel_start_offset] - text_after = sentence[rel_end_offset:end_of_sentence] - context = text_before + "[ent]" + text_after - if self.full_paragraph: - cur_sent_len = len(re.findall(self.re_tokenizer, context)) - first_sentence_num = found_sentence_num - last_sentence_num = found_sentence_num - context = [context] - while True: - added = False - if last_sentence_num < len(sentences_list) - 1: - sentence_tokens = re.findall(self.re_tokenizer, sentences_list[last_sentence_num + 1]) - last_sentence_len = len(sentence_tokens) - if cur_sent_len + last_sentence_len < self.max_paragraph_len: - context.append(sentences_list[last_sentence_num + 1]) - cur_sent_len += last_sentence_len - last_sentence_num += 1 - added = True - if first_sentence_num > 0: - sentence_tokens = re.findall(self.re_tokenizer, sentences_list[first_sentence_num - 1]) - first_sentence_len = len(sentence_tokens) - if cur_sent_len + first_sentence_len < self.max_paragraph_len: - context = [sentences_list[first_sentence_num - 1]] + context - cur_sent_len += first_sentence_len - first_sentence_num -= 1 - added = True - if not added: - break - context = " ".join(context) - - log.info(f"rank, context: {context}") - contexts.append(context) 
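# A minimal standalone sketch of the context-windowing step built in the loop above,
# assuming rel_start_offset/rel_end_offset are character offsets of the mention inside
# `sentence` (illustrative only; the method above additionally handles the full-paragraph mode):
def build_context(sentence, rel_start_offset, rel_end_offset, max_text_len=300):
    start, end = 0, len(sentence)
    if len(sentence) > max_text_len:
        # keep roughly max_text_len characters centred on the mention
        start = max(rel_start_offset - max_text_len // 2, 0)
        end = min(rel_end_offset + max_text_len // 2, len(sentence))
    # the mention itself is replaced by the "[ent]" marker expected by the description ranker
    return sentence[start:rel_start_offset] + "[ent]" + sentence[rel_end_offset:end]

# build_context("the bridge in san francisco was built", 14, 27)
# -> "the bridge in [ent] was built"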
- - scores_list = self.entity_ranker(contexts, cand_ent_list, cand_ent_descr_list) - - for context, entity_tags, candidate_entities, substr_len, entities_scores, scores in zip( - contexts, entity_tags_list, cand_ent_list, substr_lens, entities_scores_list, scores_list - ): - log.info(f"len candidate entities {len(candidate_entities)}") - if len(context.split()) < 4: - entities_with_scores = [ - ( - entity, - round(entities_scores.get(entity, (0.0, 0, 0))[0], 2), - entities_scores.get(entity, (0.0, 0, 0))[1], - entities_scores.get(entity, (0.0, 0, 0))[2], - 0.95, - ) - for entity, score in scores - ] - else: - entities_with_scores = [ - ( - entity, - round(entities_scores.get(entity, (0.0, 0, 0))[0], 2), - entities_scores.get(entity, (0.0, 0, 0))[1], - entities_scores.get(entity, (0.0, 0, 0))[2], - round(score, 2), - ) - for entity, score in scores - ] - log.info(f"len entities with scores {len(entities_with_scores)}") - if entity_tags and entity_tags[0][0] == "misc": - entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[4]), reverse=True) - else: - entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[4], x[3]), reverse=True) - log.info(f"--- entities_with_scores {entities_with_scores}") - - if not entities_with_scores: - top_entities = [self.not_found_str] - top_conf = [(0.0, 0, 0, 0.0)] - elif entities_with_scores and substr_len == 1 and entities_with_scores[0][1] < 1.0: - top_entities = [self.not_found_str] - top_conf = [(0.0, 0, 0, 0.0)] - elif entities_with_scores and ( - entities_with_scores[0][1] < 0.3 - or (entities_with_scores[0][4] < 0.13 and entities_with_scores[0][3] < 20) - or (entities_with_scores[0][4] < 0.3 and entities_with_scores[0][3] < 4) - or entities_with_scores[0][1] < 0.6 - ): - top_entities = [self.not_found_str] - top_conf = [(0.0, 0, 0, 0.0)] - else: - top_entities = [score[0] for score in entities_with_scores] - top_conf = [score[1:] for score in entities_with_scores] - - log.info(f"--- top_entities {top_entities} top_conf {top_conf}") - - high_conf_entities = [] - high_conf_nums = [] - for elem_num, (entity, conf) in enumerate(zip(top_entities, top_conf)): - if len(conf) == 3 and conf[0] == 1.0 and conf[2] > 50 and conf[3] > 0.3: - new_conf = list(conf) - if new_conf[2] > 55: - new_conf[3] = 1.0 - new_conf = tuple(new_conf) - high_conf_entities.append((entity,) + new_conf) - high_conf_nums.append(elem_num) - - high_conf_entities = sorted(high_conf_entities, key=lambda x: (x[1], x[4], x[3]), reverse=True) - for n, elem_num in enumerate(high_conf_nums): - if elem_num - n >= 0 and elem_num - n < len(top_entities): - del top_entities[elem_num - n] - del top_conf[elem_num - n] - - log.info(f"top entities {top_entities} top_conf {top_conf}") - log.info(f"high_conf_entities {high_conf_entities}") - - top_entities = [elem[0] for elem in high_conf_entities] + top_entities - top_conf = [elem[1:] for elem in high_conf_entities] + top_conf - - log.info(f"top entities {top_entities} top_conf {top_conf}") - - if self.num_entities_to_return == 1 and top_entities: - entity_ids_list.append(top_entities[0]) - conf_list.append(top_conf[0]) - else: - entity_ids_list.append(top_entities[: self.num_entities_to_return]) - conf_list.append(top_conf[: self.num_entities_to_return]) - return entity_ids_list, conf_list \ No newline at end of file From 01c0222b8f1c2ce83d311a69af69c2b315f5a800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Wed, 
15 Feb 2023 22:34:54 +0300 Subject: [PATCH 37/40] change port --- annotators/property_extraction/Dockerfile | 2 +- assistant_dists/dream/dev.yml | 2 +- assistant_dists/dream/docker-compose.override.yml | 4 ++-- assistant_dists/dream/pipeline_conf.json | 6 +++--- assistant_dists/dream/proxy.yml | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile index 347a18e19c..79b3ae7be7 100644 --- a/annotators/property_extraction/Dockerfile +++ b/annotators/property_extraction/Dockerfile @@ -14,4 +14,4 @@ COPY $SRC_DIR /src WORKDIR /src -CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8129 +CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8136 diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml index 27a46cc4bd..a793b63429 100644 --- a/assistant_dists/dream/dev.yml +++ b/assistant_dists/dream/dev.yml @@ -452,5 +452,5 @@ services: - "./annotators/property_extraction:/src" - "~/.deeppavlov:/root/.deeppavlov" ports: - - 8129:8129 + - 8136:8136 version: "3.7" diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index cfc180cae7..d361fa14ee 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -1308,11 +1308,11 @@ services: build: args: CONFIG: t5_generative_ie_lite_infer.json - PORT: 8129 + PORT: 8136 SRC_DIR: annotators/property_extraction/ context: ./ dockerfile: annotators/property_extraction/Dockerfile - command: flask run -h 0.0.0.0 -p 8129 + command: flask run -h 0.0.0.0 -p 8136 environment: - FLASK_APP=server deploy: diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index c0720a3141..c9014af108 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -115,7 +115,7 @@ "connector": { "protocol": "http", "timeout": 1, - "url": "http://property-extraction:8129/respond" + "url": "http://property-extraction:8136/respond" }, "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_last_bot_dialog", "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", @@ -319,7 +319,7 @@ "connector": { "protocol": "http", "timeout": 1, - "url": "http://property-extraction:8129/respond" + "url": "http://property-extraction:8136/respond" }, "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_dialog", "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", @@ -1126,4 +1126,4 @@ "gpu_usage": "50 GB", "disk_usage": "50 GB" } -} \ No newline at end of file +} diff --git a/assistant_dists/dream/proxy.yml b/assistant_dists/dream/proxy.yml index 52b23431f7..6ef2091b4b 100644 --- a/assistant_dists/dream/proxy.yml +++ b/assistant_dists/dream/proxy.yml @@ -654,6 +654,6 @@ services: context: dp/proxy/ dockerfile: Dockerfile environment: - - PROXY_PASS=dream.deeppavlov.ai:8129 - - PORT=8129 + - PROXY_PASS=dream.deeppavlov.ai:8136 + - PORT=8136 version: '3.7' From 7e8e68eb30ff8cb738b39c790ee7c27f8b9b4852 Mon Sep 17 00:00:00 2001 From: dmitry Date: Thu, 16 Feb 2023 10:22:33 +0300 Subject: [PATCH 38/40] update el --- annotators/entity_linking/Dockerfile | 6 ++++++ .../entity_linking/entity_linking_eng.json | 4 ++-- annotators/entity_linking/server.py | 21 +++---------------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git 
a/annotators/entity_linking/Dockerfile b/annotators/entity_linking/Dockerfile index 6289834e63..27ae58a462 100644 --- a/annotators/entity_linking/Dockerfile +++ b/annotators/entity_linking/Dockerfile @@ -6,6 +6,12 @@ RUN apt-key del 7fa2af80 && \ -o cuda-keyring_1.0-1_all.deb && \ dpkg -i cuda-keyring_1.0-1_all.deb RUN apt-get -y update +RUN apt-get install -y build-essential zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget llvm \ + libncurses5-dev libncursesw5-dev xz-utils libffi-dev liblzma-dev + +RUN apt-get -y update && \ + apt-get install -y software-properties-common && \ + apt-get update && apt-get install git -y RUN apt-get install -y sqlite3 diff --git a/annotators/entity_linking/entity_linking_eng.json b/annotators/entity_linking/entity_linking_eng.json index 96bef11a5a..a1e7c77d35 100644 --- a/annotators/entity_linking/entity_linking_eng.json +++ b/annotators/entity_linking/entity_linking_eng.json @@ -15,7 +15,7 @@ { "class_name": "src.entity_linking:EntityLinker", "in": ["entity_substr", "entity_tags", "sentences"], - "out": ["entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"], + "out": ["entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"], "load_path": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream", "add_info_filename": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream/add_info.db", "tags_filename": "{MODELS_PATH}/finegrained_tags/tag.dict", @@ -36,7 +36,7 @@ "lang": "en" } ], - "out": ["entity_substr", "entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"] + "out": ["entity_substr", "entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"] }, "metadata": { "variables": { diff --git a/annotators/entity_linking/server.py b/annotators/entity_linking/server.py index 3e021a599d..7dc5b34322 100644 --- a/annotators/entity_linking/server.py +++ b/annotators/entity_linking/server.py @@ -51,7 +51,6 @@ def respond(): entity_substr_batch, entity_ids_batch, conf_batch, - entity_id_tags_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch, @@ -61,35 +60,21 @@ def respond(): entity_substr_list, entity_ids_list, conf_list, - entity_id_tags_list, entity_pages_list, first_pars_list, dbpedia_types_list, ) in zip( - entity_substr_batch, - entity_ids_batch, - conf_batch, - entity_id_tags_batch, - entity_pages_batch, - first_pars_batch, - dbpedia_types_batch, + entity_substr_batch, entity_ids_batch, conf_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch ): entity_info_list = [] - for entity_substr, entity_ids, confs, entity_id_tags, entity_pages, first_pars, dbpedia_types in zip( - entity_substr_list, - entity_ids_list, - conf_list, - entity_id_tags_list, - entity_pages_list, - first_pars_list, - dbpedia_types_list, + for entity_substr, entity_ids, confs, entity_pages, first_pars, dbpedia_types in zip( + entity_substr_list, entity_ids_list, conf_list, entity_pages_list, first_pars_list, dbpedia_types_list ): entity_info = {} entity_info["entity_substr"] = entity_substr entity_info["entity_ids"] = entity_ids entity_info["confidences"] = [float(elem[2]) for elem in confs] entity_info["tokens_match_conf"] = [float(elem[0]) for elem in confs] - entity_info["entity_id_tags"] = entity_id_tags entity_info["pages_titles"] = entity_pages entity_info["first_paragraphs"] = first_pars entity_info["dbpedia_types"] = dbpedia_types From f58f7ead81c54b5e58e2c8b3886fddf052f2a244 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9=20=D0=95=D0=B2?= =?UTF-8?q?=D1=81=D0=B5=D0=B5=D0=B2?= Date: Thu, 16 Feb 2023 15:30:44 +0300 Subject: [PATCH 39/40] fix tests --- annotators/property_extraction/test_property_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotators/property_extraction/test_property_extraction.py b/annotators/property_extraction/test_property_extraction.py index 73d311534f..806ee6c9f7 100644 --- a/annotators/property_extraction/test_property_extraction.py +++ b/annotators/property_extraction/test_property_extraction.py @@ -2,7 +2,7 @@ def main(): - url = "http://0.0.0.0:8129/respond" + url = "http://0.0.0.0:8136/respond" request_data = [{"utterances": [["i live in moscow"]]}] gold_results = [[{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}]] From 1db56ba8bf01d712523b2121a7a0fa8925acf6a4 Mon Sep 17 00:00:00 2001 From: dilyararimovna Date: Tue, 21 Feb 2023 08:32:11 +0300 Subject: [PATCH 40/40] fix: readme and paths --- README.md | 51 ------------------- assistant_dists/dream/dev.yml | 2 +- .../dream/docker-compose.override.yml | 6 +-- assistant_dists/dream/pipeline_conf.json | 2 +- assistant_dists/dream/proxy.yml | 2 +- 5 files changed, 6 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 0269522a1c..fbb754a6ba 100644 --- a/README.md +++ b/README.md @@ -325,57 +325,6 @@ Dream Architecture is presented in the following image: | DFF Weather Skill | 1.4 GB RAM | **[New DFF version]** uses the OpenWeatherMap service to get the forecast for the user's location | | DFF Wiki Skill | 150 MB RAM | used for making scenarios with the extraction of entities, slot filling, facts insertion, and acknowledgements | -# Components Russian Version - -Dream Architecture is presented in the following image: -![DREAM](RussianDREAM.png) - -## Annotators - -| Name | Requirements | Description | -|------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Badlisted words | 50 MB RAM | detects obscene Russian words from the badlist | -| Entity detection | 3 GB RAM | extracts entities and their types from utterances | -| Entity linking | 500 MB RAM, ?? 
GB GPU | finds Wikidata entity ids for the entities detected with Entity Detection | -| Intent catcher | 900 MB RAM | classifies user utterances into a number of predefined intents which are trained on a set of phrases and regexps | -| NER | 1.7 GB RAM, 4.9 GB GPU | extracts person names, names of locations, organizations from uncased text using ruBert-based (pyTorch) model | -| Sentseg | 2.4 GB RAM, 4.9 GB GPU | recovers punctuation using ruBert-based (pyTorch) model and splits into sentences | -| Spacy Annotator | 250 MB RAM | token-wise annotations by Spacy | -| Spelling preprocessing | 8 GB RAM | Russian Levenshtein correction model | -| Wiki parser | 100 MB RAM | extracts Wikidata triplets for the entities detected with Entity Linking | -| DialogRPT | 3.8 GB RAM, 2 GB GPU | DialogRPT model which is based on [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2) and fine-tuned on Russian Pikabu Comment sequences | - -## Skills & Services -| Name | Requirements | Description | -|------------------------|---------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| DialoGPT | 2.8 GB RAM, 2 GB GPU | [Russian DialoGPT by DeepPavlov](https://huggingface.co/DeepPavlov/rudialogpt3_medium_based_on_gpt2_v2) | -| Dummy Skill | a part of agent container | a fallback skill with multiple non-toxic candidate responses and random Russian questions | -| Personal Info skill | 40 MB RAM | queries and stores user's name, birthplace, and location | -| DFF Generative skill | 50 MB RAM | **[New DFF version]** generative skill which uses DialoGPT service to generate 3 different hypotheses | -| DFF Intent Responder | 50 MB RAM | provides template-based replies for some of the intents detected by Intent Catcher annotator | -| DFF Program Y skill | 80 MB RAM | **[New DFF version]** Chatbot Program Y (https://github.com/keiffster/program-y) adapted for Dream socialbot | -| DFF Friendship skill | 70 MB RAM | **[New DFF version]** DFF-based skill to greet the user in the beginning of the dialog, and forward the user to some scripted skill | -| DFF Wiki skill | 150 MB RAM | used for making scenarios with the extraction of entities, slot filling, facts insertion, and acknowledgements | - - -# Components Multilingual Version - -Dream Architecture is presented in the following image: -![DREAM](multilingualDREAM.png) - -## Annotators - -| Name | Requirements | Description | -|--------------------------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Sentiment Classification | 2 GB RAM, 2 GB GPU | classifies sentiment to positive, negative and neutral classes | -| Toxic Classification | 3 GB RAM, 2 GB GPU | classifies toxicity: identity_attack, insult, obscene, severe_toxicity, sexual_explicit, threat, toxicity | -| Sentence Ranker | 2.5 GB RAM, 1.8 GB GPU | for a pair of sentences predicts a floating point value. 
For multilingual version, return cosine similarity between embeddings from multilingual sentence BERT | - -## Skills & Services -| Name | Requirements | Description | -|----------------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| -| gpt2-generator | 5 GB RAM, 6.5 GB GPU | GPT2-based generative model. For Multilingual distribution we propose mgpt by Sberbank [from HugginFace](https://huggingface.co/sberbank-ai/mGPT) | - # Papers ### Alexa Prize 3 diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml index a793b63429..f4522c8e11 100644 --- a/assistant_dists/dream/dev.yml +++ b/assistant_dists/dream/dev.yml @@ -84,7 +84,7 @@ services: ner: volumes: - './annotators/NER_deeppavlov:/src' - - "/archive/evseev/.deeppavlov:/root/.deeppavlov" + - "~/.deeppavlov:/root/.deeppavlov" ports: - 8021:8021 eliza: diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index d361fa14ee..23b933a271 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -20,7 +20,7 @@ services: dff-gossip-skill:8109, dff-wiki-skill:8111, dff-gaming-skill:8115, topic-recommendation:8113, user-persona-extractor:8114, wiki-facts:8116, dff-music-skill:8099, entity-detection:8103, dff-art-skill:8117, midas-predictor:8121, dialogpt:8125, storygpt:8126, prompt-storygpt:8127, seq2seq-persona-based:8140, sentence-ranker:8128, - dff-template-skill:8120, property-extraction:8136" + property-extraction:8136, dff-template-skill:8120" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} HIGH_PRIORITY_INTENTS: 1 RESTRICTION_FOR_SENSITIVE_CASE: 1 @@ -1318,9 +1318,9 @@ services: deploy: resources: limits: - memory: 2.5G + memory: 7G reservations: - memory: 2.5G + memory: 7G dff-template-skill: env_file: [ .env ] diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index c9014af108..27a760cdcd 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -1126,4 +1126,4 @@ "gpu_usage": "50 GB", "disk_usage": "50 GB" } -} +} \ No newline at end of file diff --git a/assistant_dists/dream/proxy.yml b/assistant_dists/dream/proxy.yml index 6ef2091b4b..41669ec7a8 100644 --- a/assistant_dists/dream/proxy.yml +++ b/assistant_dists/dream/proxy.yml @@ -277,7 +277,7 @@ services: environment: - PROXY_PASS=dream.deeppavlov.ai:8074 - PORT=8074 - + entity-linking: command: [ "nginx", "-g", "daemon off;" ] build:
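With the annotator moved to port 8136 in PATCH 37 and the test updated in PATCH 39, a quick manual smoke check of the property-extraction service can mirror test_property_extraction.py. The payload and expected triplet below are taken from that test; the host URL assumes a locally running container with the updated port, and the exact request call is a sketch rather than the test's verbatim code.

import requests

# Request/gold values copied from annotators/property_extraction/test_property_extraction.py;
# http://0.0.0.0:8136 assumes the container is running locally after the port change.
url = "http://0.0.0.0:8136/respond"
request_data = [{"utterances": [["i live in moscow"]]}]
gold_results = [[{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}]]

for data, gold in zip(request_data, gold_results):
    result = requests.post(url, json=data).json()
    print("OK" if result == gold else "MISMATCH", result)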
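PATCH 38 removes entity_id_tags from the Entity Linking annotator output, leaving only the fields assembled in server.py. A sketch of how a downstream consumer might read one linked-entity record after this change; the helper function is hypothetical, and only the dictionary keys come from the patched server.py.

# Hypothetical helper; the keys match the entity_info dict built in
# annotators/entity_linking/server.py after entity_id_tags was dropped.
def summarize_entity(entity_info: dict) -> str:
    ids = entity_info["entity_ids"]
    confs = entity_info["confidences"]              # per-candidate confidence values
    match_confs = entity_info["tokens_match_conf"]  # per-candidate substring-match scores
    pages = entity_info["pages_titles"]
    lines = [
        f"{entity_info['entity_substr']} -> {i} ({p}): conf={c}, match={m}"
        for i, p, c, m in zip(ids, pages, confs, match_confs)
    ]
    return "\n".join(lines)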