diff --git a/README.md b/README.md index 21208d38e0..fbb754a6ba 100644 --- a/README.md +++ b/README.md @@ -244,6 +244,7 @@ Dream Architecture is presented in the following image: | News API Annotator | 80 MB RAM | extracts the latest news about entities or topics using the GNews API. DeepPavlov Dream deployments utilize our own API key. | | Personality Catcher | 30 MB RAM | | | Prompt Selector | 50 MB RAM | Annotator utilizing Sentence Ranker to rank prompts and selecting `N_SENTENCES_TO_RETURN` most relevant prompts (based on questions provided in prompts) | +| Property Extraction | 6.3 GiB RAM | extracts user attributes from utterances | | Rake Keywords | 40 MB RAM | extracts keywords from utterances with the help of RAKE algorithm | | Relative Persona Extractor | 50 MB RAM | Annotator utilizing Sentence Ranker to rank persona sentences and selecting `N_SENTENCES_TO_RETURN` the most relevant sentences | | Sentrewrite | 200 MB RAM | rewrites user's utterances by replacing pronouns with specific names that provide more useful information to downstream components | diff --git a/annotators/property_extraction/Dockerfile b/annotators/property_extraction/Dockerfile new file mode 100644 index 0000000000..79b3ae7be7 --- /dev/null +++ b/annotators/property_extraction/Dockerfile @@ -0,0 +1,17 @@ +FROM deeppavlov/base-gpu:0.17.6 + +RUN apt-get update && apt-get install git -y + +ARG CONFIG +ARG SRC_DIR + +ENV CONFIG=$CONFIG + +COPY ./annotators/property_extraction/requirements.txt /src/requirements.txt +RUN pip install -r /src/requirements.txt + +COPY $SRC_DIR /src + +WORKDIR /src + +CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8136 diff --git a/annotators/property_extraction/property_classification_distilbert.json b/annotators/property_extraction/property_classification_distilbert.json new file mode 100644 index 0000000000..a9db83a238 --- /dev/null +++ b/annotators/property_extraction/property_classification_distilbert.json @@ -0,0 +1,100 @@ +{ + 
"dataset_reader": { + "class_name": "sq_reader", + "data_path": "{DOWNLOADS_PATH}/dialogue_nli/dialogue_nli_cls.json" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": false, + "max_seq_length": 64, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{TRANSFORMER}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": {"lr": 1e-05}, + "learning_rate_drop_patience": 5, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + "f1_macro", + "accuracy" + ], + "validation_patience": 10, + "val_every_n_batches": 100, + "log_every_n_batches": 100, + "show_examples": false, + "evaluation_targets": ["valid", "test"], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "TRANSFORMER": "distilbert-base-uncased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": 
"{MODELS_PATH}/classifiers/property_classification" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/property_classification.tar.gz", + "subdir": "{MODEL_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/dialogue_nli_cls.tar.gz", + "subdir": "{DOWNLOADS_PATH}/dialogue_nli" + } + ] + } +} diff --git a/annotators/property_extraction/rel_list.txt b/annotators/property_extraction/rel_list.txt new file mode 100644 index 0000000000..890a24ac48 --- /dev/null +++ b/annotators/property_extraction/rel_list.txt @@ -0,0 +1,61 @@ + p +attend_school r +dislike r +employed_by_company r +employed_by_general r +favorite r +favorite_activity r +favorite_animal r +favorite_book r +favorite_color r +favorite_drink r +favorite_food r +favorite_hobby r +favorite_movie r +favorite_music r +favorite_music_artist r +favorite_place r +favorite_season r +favorite_show r +favorite_sport r +gender p +has_ability r +has_age p +has_degree r +has_hobby r +has_profession r +have r +have_chidren r +have_family r +have_pet r +have_sibling r +have_vehicle r +job_status p +like_activity r +like_animal r +like_drink r +like_food r +like_general r +like_goto r +like_movie r +like_music r +like_read r +like_sports r +like_watching r +live_in_citystatecountry r +live_in_general r +marital_status p +member_of r +misc_attribute p +nationality p +not_have r +other p +own r +physical_attribute p +place_origin r +previous_profession r +school_status p +teach r +want r +want_do r +want_job p diff --git a/annotators/property_extraction/requirements.txt b/annotators/property_extraction/requirements.txt new file mode 100644 index 0000000000..f606ea620f --- /dev/null +++ b/annotators/property_extraction/requirements.txt @@ -0,0 +1,14 @@ +pyopenssl==22.0.0 +Flask==1.1.1 +itsdangerous==2.0.1 +nltk==3.2.5 +numpy==1.18.0 +gunicorn==19.9.0 +requests==2.27.1 +jinja2<=3.0.3 +Werkzeug<=2.0.3 +sentry-sdk==0.12.3 +spacy==2.2.3 
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 +torch==1.7.1 +transformers==4.10.1 diff --git a/annotators/property_extraction/server.py b/annotators/property_extraction/server.py new file mode 100644 index 0000000000..274e019fbc --- /dev/null +++ b/annotators/property_extraction/server.py @@ -0,0 +1,222 @@ +import copy +import logging +import os +import re +import time + +import nltk +import sentry_sdk +import spacy +from flask import Flask, jsonify, request + +from deeppavlov import build_model +from src.sentence_answer import sentence_answer + +sentry_sdk.init(os.getenv("SENTRY_DSN")) + +logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO) +logger = logging.getLogger(__name__) +app = Flask(__name__) + +stemmer = nltk.PorterStemmer() +nlp = spacy.load("en_core_web_sm") + +config_name = os.getenv("CONFIG") +rel_cls_flag = int(os.getenv("REL_CLS_FLAG", "0")) +add_entity_info = int(os.getenv("ADD_ENTITY_INFO", "0")) + +rel_type_dict = {} +with open("rel_list.txt", "r") as fl: + lines = fl.readlines() + for line in lines: + rel, rel_type = line.strip().split() + if rel_type == "r": + rel_type = "relation" + else: + rel_type = "property" + rel_type_dict[rel.replace("_", " ")] = rel_type + + +def check_triplet(triplet): + if triplet[0] in {"hi", "hello"} or any([word in triplet[0] for word in {" hi ", " hello "}]): + return False + return True + + +try: + generative_ie = build_model(config_name, download=True) + logger.info("property extraction model is loaded.") + if rel_cls_flag: + rel_cls = build_model("property_classification_distilbert.json") +except Exception as e: + sentry_sdk.capture_exception(e) + logger.exception(e) + raise e + + +def sentrewrite(sentence, init_answer): + answer = init_answer.strip(".") + if any([sentence.startswith(elem) for elem in ["what's", "what is"]]): + for old_tok, new_tok in [ + ("what's your", 
f"{answer} is my"), + ("what is your", f"{answer} is my"), + ("what is", f"{answer} is"), + ("what's", f"{answer} is"), + ]: + sentence = sentence.replace(old_tok, new_tok) + elif any([sentence.startswith(elem) for elem in ["where", "when"]]): + sentence = sentence_answer(sentence, answer) + elif any([sentence.startswith(elem) for elem in ["is there"]]): + for old_tok, new_tok in [("is there any", f"{answer} is"), ("is there", f"{answer} is")]: + sentence = sentence.replace(old_tok, new_tok) + else: + sentence = f"{sentence} {init_answer}" + return sentence + + +def get_result(request): + st_time = time.time() + init_uttrs = request.json.get("utterances", []) + init_uttrs_cased = request.json.get("utterances_init", []) + if not init_uttrs_cased: + init_uttrs_cased = copy.deepcopy(init_uttrs) + named_entities_batch = request.json.get("named_entities", [[] for _ in init_uttrs]) + entities_with_labels_batch = request.json.get("entities_with_labels", [[] for _ in init_uttrs]) + entity_info_batch = request.json.get("entity_info", [[] for _ in init_uttrs]) + logger.info(f"init_uttrs {init_uttrs}") + uttrs, uttrs_cased = [], [] + for uttr_list, uttr_list_cased in zip(init_uttrs, init_uttrs_cased): + if len(uttr_list) == 1: + uttrs.append(uttr_list[0]) + uttrs_cased.append(uttr_list_cased[0]) + else: + utt_prev = uttr_list_cased[-2] + utt_prev_sentences = nltk.sent_tokenize(utt_prev) + utt_prev = utt_prev_sentences[-1] + utt_cur = uttr_list_cased[-1] + utt_prev_l = utt_prev.lower() + utt_cur_l = utt_cur.lower() + is_q = ( + any([utt_prev_l.startswith(q_word) for q_word in ["what ", "who ", "when ", "where "]]) + or "?"
in utt_prev_l + ) + + is_sentence = False + parsed_sentence = nlp(utt_cur) + if parsed_sentence: + tokens = [elem.text for elem in parsed_sentence] + tags = [elem.tag_ for elem in parsed_sentence] + found_verbs = any([tag in tags for tag in ["VB", "VBZ", "VBP", "VBD"]]) + if found_verbs and len(tokens) > 2: + is_sentence = True + + logger.info(f"is_q: {is_q} --- is_s: {is_sentence} --- utt_prev: {utt_prev_l} --- utt_cur: {utt_cur_l}") + if is_q and not is_sentence: + if len(utt_cur_l.split()) <= 2: + uttrs.append(sentrewrite(utt_prev_l, utt_cur_l)) + uttrs_cased.append(sentrewrite(utt_prev, utt_cur)) + else: + uttrs.append(f"{utt_prev_l} {utt_cur_l}") + uttrs_cased.append(f"{utt_prev} {utt_cur}") + else: + uttrs.append(utt_cur_l) + uttrs_cased.append(utt_cur) + + logger.info(f"input utterances: {uttrs}") + triplets_batch = [] + outputs, scores = generative_ie(uttrs) + for output, uttr in zip(outputs, uttrs_cased): + triplet = "" + fnd = re.findall(r" (.*?) (.*?) (.*)", output) + if fnd: + triplet = list(fnd[0]) + if triplet[0] == "i": + triplet[0] = "user" + obj = triplet[2] + if obj.islower() and obj.capitalize() in uttr: + triplet[2] = obj.capitalize() + triplets_batch.append(triplet) + logger.info(f"outputs {outputs} scores {scores} triplets_batch {triplets_batch}") + if rel_cls_flag: + rels = rel_cls(uttrs) + logger.info(f"classified relations: {rels}") + filtered_triplets_batch = [] + for triplet, rel in zip(triplets_batch, rels): + rel = rel.replace("_", " ") + if len(triplet) == 3 and triplet[1] == rel and check_triplet(triplet): + filtered_triplets_batch.append(triplet) + else: + filtered_triplets_batch.append([]) + triplets_batch = filtered_triplets_batch + + triplets_info_batch = [] + for triplet, uttr, named_entities, entities_with_labels, entity_info_list in zip( + triplets_batch, uttrs, named_entities_batch, entities_with_labels_batch, entity_info_batch + ): + uttr = uttr.lower() + entity_substr_dict = {} + formatted_triplet, per_triplet = {}, {} + if 
len(uttr.split()) > 2: + for entity in entities_with_labels: + if "text" in entity: + entity_substr = entity["text"] + if "offsets" in entity: + start_offset, end_offset = entity["offsets"] + else: + start_offset = uttr.find(entity_substr.lower()) + end_offset = start_offset + len(entity_substr) + offsets = [start_offset, end_offset] + if triplet and entity_substr in [triplet[0], triplet[2]]: + entity_substr_dict[entity_substr] = {"offsets": offsets} + if entity_info_list: + for entity_info in entity_info_list: + if entity_info and "entity_substr" in entity_info and "entity_ids" in entity_info: + entity_substr = entity_info["entity_substr"] + if triplet and ( + entity_substr in [triplet[0], triplet[2]] + or stemmer.stem(entity_substr) in [triplet[0], triplet[2]] + ): + if entity_substr not in entity_substr_dict: + entity_substr_dict[entity_substr] = {} + entity_substr_dict[entity_substr]["entity_ids"] = entity_info["entity_ids"] + entity_substr_dict[entity_substr]["dbpedia_types"] = entity_info.get("dbpedia_types", []) + entity_substr_dict[entity_substr]["finegrained_types"] = entity_info.get( + "entity_id_tags", [] + ) + if triplet: + formatted_triplet = {"subject": triplet[0], rel_type_dict[triplet[1]]: triplet[1], "object": triplet[2]} + named_entities_list = [] + for elem in named_entities: + for entity in elem: + named_entities_list.append(entity) + per_entities = [entity for entity in named_entities_list if entity.get("type", "") == "PER"] + if triplet[1] in {"have pet", "have family", "have sibling", "have chidren"} and per_entities: + per_triplet = {"subject": triplet[2], "property": "name", "object": per_entities[0].get("text", "")} + + triplets_info_list = [] + if add_entity_info: + triplets_info_list.append({"triplet": formatted_triplet, "entity_info": entity_substr_dict}) + else: + triplets_info_list.append({"triplet": formatted_triplet}) + if per_triplet: + if add_entity_info: + triplets_info_list.append( + {"triplet": per_triplet, "entity_info": 
{per_triplet["object"]: {"entity_id_tags": ["PER"]}}} + ) + else: + triplets_info_list.append({"triplet": per_triplet}) + triplets_info_batch.append(triplets_info_list) + total_time = time.time() - st_time + logger.info(f"property extraction exec time: {total_time: .3f}s") + logger.info(f"property extraction, input {uttrs}, output {triplets_info_batch} scores {scores}") + return triplets_info_batch + + +@app.route("/respond", methods=["POST"]) +def respond(): + result = get_result(request) + return jsonify(result) + + +if __name__ == "__main__": + app.run(debug=False, host="0.0.0.0", port=8103) diff --git a/annotators/property_extraction/src/sentence_answer.py b/annotators/property_extraction/src/sentence_answer.py new file mode 100644 index 0000000000..44490272a1 --- /dev/null +++ b/annotators/property_extraction/src/sentence_answer.py @@ -0,0 +1,177 @@ +import importlib +import re +from logging import getLogger + +import pkg_resources +import spacy + +log = getLogger(__name__) + +# en_core_web_sm is installed and used by test_inferring_pretrained_model in the same interpreter session during tests. +# Spacy checks en_core_web_sm package presence with pkg_resources, but pkg_resources is initialized with interpreter, +# so it doesn't see en_core_web_sm installed after interpreter initialization, so we use importlib.reload below.
+ +if "en-core-web-sm" not in pkg_resources.working_set.by_key.keys(): + importlib.reload(pkg_resources) + +# TODO: move nlp to sentence_answer, sentence_answer to rel_ranking_infer and revise en_core_web_sm requirement, +# TODO: make proper downloading with spacy.cli.download +nlp = spacy.load("en_core_web_sm") + +pronouns = ["who", "what", "when", "where", "how"] + + +def find_tokens(tokens, node, not_inc_node): + if node != not_inc_node: + tokens.append(node.text) + for elem in node.children: + tokens = find_tokens(tokens, elem, not_inc_node) + return tokens + + +def find_inflect_dict(sent_nodes): + inflect_dict = {} + for node in sent_nodes: + if node.dep_ == "aux" and node.tag_ == "VBD" and (node.head.tag_ == "VBP" or node.head.tag_ == "VB"): + inflect_dict[node.text] = "" + if node.dep_ == "aux" and node.tag_ == "VBZ" and node.head.tag_ == "VB": + inflect_dict[node.text] = "" + return inflect_dict + + +def find_wh_node(sent_nodes): + wh_node = "" + main_head = "" + wh_node_head = "" + for node in sent_nodes: + if node.text.lower() in pronouns: + wh_node = node + break + + if wh_node: + wh_node_head = wh_node.head + if wh_node_head.dep_ == "ccomp": + main_head = wh_node_head.head + + return wh_node, wh_node_head, main_head + + +def find_tokens_to_replace(wh_node_head, main_head, question_tokens, question): + redundant_tokens_to_replace = [] + question_tokens_to_replace = [] + + if main_head: + redundant_tokens_to_replace = find_tokens([], main_head, wh_node_head) + what_tokens_fnd = re.findall("what (.*) (is|was|does|did) (.*)", question, re.IGNORECASE) + if what_tokens_fnd: + what_tokens = what_tokens_fnd[0][0].split() + if len(what_tokens) <= 2: + redundant_tokens_to_replace += what_tokens + + wh_node_head_desc = [] + if wh_node_head: + wh_node_head_desc = [node for node in wh_node_head.children if node.text != "?"] + wh_node_head_dep = [ + node.dep_ + for node in wh_node_head.children + if (node.text != "?" 
and node.dep_ not in ["aux", "prep"] and node.text.lower() not in pronouns) + ] + for node in wh_node_head_desc: + if node.dep_ == "nsubj" and len(wh_node_head_dep) > 1 or node.text.lower() in pronouns or node.dep_ == "aux": + question_tokens_to_replace.append(node.text) + for elem in node.subtree: + question_tokens_to_replace.append(elem.text) + + question_tokens_to_replace = list(set(question_tokens_to_replace)) + + redundant_replace_substr = [] + for token in question_tokens: + if token in redundant_tokens_to_replace: + redundant_replace_substr.append(token) + else: + if redundant_replace_substr: + break + + redundant_replace_substr = " ".join(redundant_replace_substr) + + question_replace_substr = [] + + for token in question_tokens: + if token in question_tokens_to_replace: + question_replace_substr.append(token) + else: + if question_replace_substr: + break + + question_replace_substr = " ".join(question_replace_substr) + + return redundant_replace_substr, question_replace_substr + + +def sentence_answer(question, entity_title, entities=None, template_answer=None): + log.debug(f"question {question} entity_title {entity_title} entities {entities} template_answer {template_answer}") + sent_nodes = nlp(question) + reverse = False + if sent_nodes[-2].tag_ == "IN": + reverse = True + question_tokens = [elem.text for elem in sent_nodes] + log.debug(f"spacy tags: {[(elem.text, elem.tag_, elem.dep_, elem.head.text) for elem in sent_nodes]}") + + inflect_dict = find_inflect_dict(sent_nodes) + wh_node, wh_node_head, main_head = find_wh_node(sent_nodes) + redundant_replace_substr, question_replace_substr = find_tokens_to_replace( + wh_node_head, main_head, question_tokens, question + ) + log.debug(f"redundant_replace_substr {redundant_replace_substr} question_replace_substr {question_replace_substr}") + if redundant_replace_substr: + answer = question.replace(redundant_replace_substr, "") + else: + answer = question + + if answer.endswith("?"): + answer = 
answer.replace("?", "").strip() + + if question_replace_substr: + if template_answer and entities: + answer = template_answer.replace("[ent]", entities[0]).replace("[ans]", entity_title) + elif wh_node.text.lower() in ["what", "who", "how"]: + fnd_date = re.findall(r"what (day|year) (.*)\?", question, re.IGNORECASE) + fnd_wh = re.findall(r"what (is|was) the name of (.*) (which|that) (.*)\?", question, re.IGNORECASE) + fnd_name = re.findall(r"what (is|was) the name (.*)\?", question, re.IGNORECASE) + if fnd_date: + fnd_date_aux = re.findall(rf"what (day|year) (is|was) ({entities[0]}) (.*)\?", question, re.IGNORECASE) + if fnd_date_aux: + answer = f"{entities[0]} {fnd_date_aux[0][1]} {fnd_date_aux[0][3]} on {entity_title}" + else: + answer = f"{fnd_date[0][1]} on {entity_title}" + elif fnd_wh: + answer = f"{entity_title} {fnd_wh[0][3]}" + elif fnd_name: + aux_verb, sent_cut = fnd_name[0] + if sent_cut.startswith("of "): + sent_cut = sent_cut[3:] + answer = f"{entity_title} {aux_verb} {sent_cut}" + else: + if reverse: + answer = answer.replace(question_replace_substr, "") + answer = f"{answer} {entity_title}" + else: + answer = answer.replace(question_replace_substr, entity_title) + elif wh_node.text.lower() in ["when", "where"] and entities: + sent_cut = re.findall(rf"(when|where) (was|is) {entities[0]} (.*)\?", question, re.IGNORECASE) + if sent_cut: + if sent_cut[0][0].lower() == "when": + answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} on {entity_title}" + else: + answer = f"{entities[0]} {sent_cut[0][1]} {sent_cut[0][2]} in {entity_title}" + else: + answer = answer.replace(question_replace_substr, "") + answer = f"{answer} in {entity_title}" + + for old_tok, new_tok in inflect_dict.items(): + answer = answer.replace(old_tok, new_tok) + answer = re.sub(r"\s+", " ", answer).strip() + + answer = answer + "." 
+ + return answer diff --git a/annotators/property_extraction/src/t5_generative_ie.py b/annotators/property_extraction/src/t5_generative_ie.py new file mode 100644 index 0000000000..1d8c42818c --- /dev/null +++ b/annotators/property_extraction/src/t5_generative_ie.py @@ -0,0 +1,239 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from logging import getLogger +from pathlib import Path +from typing import List, Optional, Dict + +import torch +from overrides import overrides +from transformers import AutoConfig, AutoTokenizer +from transformers import T5ForConditionalGeneration + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.torch_model import TorchModel + +logger = getLogger(__name__) + + +def softmax_mask(val, mask): + inf = 1e30 + return -inf * (1 - mask.to(torch.float32)) + val + + +@register("t5_generative_ie") +class T5GenerativeIE(TorchModel): + def __init__( + self, + pretrained_transformer: str, + attention_probs_keep_prob: Optional[float] = None, + add_special_tokens: List[str] = None, + hidden_keep_prob: Optional[float] = None, + optimizer: str = "AdamW", + optimizer_parameters: Optional[dict] = None, + bert_config_file: Optional[str] = None, + learning_rate_drop_patience: int = 20, + learning_rate_drop_div: float = 2.0, + 
load_before_drop: bool = True, + clip_norm: Optional[float] = None, + min_learning_rate: float = 1e-06, + generate_max_length: int = 50, + top_n: int = 1, + batch_decode: bool = False, + scores_thres: float = -0.17, + device: str = "cpu", + **kwargs, + ) -> None: + + if not optimizer_parameters: + optimizer_parameters = {"lr": 0.01, "weight_decay": 0.01, "betas": (0.9, 0.999), "eps": 1e-6} + self.generate_max_length = generate_max_length + + self.attention_probs_keep_prob = attention_probs_keep_prob + self.hidden_keep_prob = hidden_keep_prob + self.clip_norm = clip_norm + + self.pretrained_transformer = pretrained_transformer + self.bert_config_file = bert_config_file + self.tokenizer = AutoTokenizer.from_pretrained(pretrained_transformer, do_lower_case=False) + special_tokens_dict = {"additional_special_tokens": add_special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + self.replace_tokens = [("", ""), ("", ""), ("", "")] + self.top_n = top_n + self.batch_decode = batch_decode + self.scores_thres = scores_thres + + super().__init__( + device=device, + optimizer=optimizer, + optimizer_parameters=optimizer_parameters, + learning_rate_drop_patience=learning_rate_drop_patience, + learning_rate_drop_div=learning_rate_drop_div, + load_before_drop=load_before_drop, + min_learning_rate=min_learning_rate, + **kwargs, + ) + self.device = torch.device("cuda" if torch.cuda.is_available() and device == "gpu" else "cpu") + + def train_on_batch(self, input_ids_batch, attention_mask_batch, target_ids_batch) -> Dict: + input_ids_batch = torch.LongTensor(input_ids_batch).to(self.device) + attention_mask_batch = torch.LongTensor(attention_mask_batch).to(self.device) + target_ids_batch = torch.LongTensor(target_ids_batch).to(self.device) + input_ = {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch, "labels": target_ids_batch} + + self.optimizer.zero_grad() + loss = self.model(**input_)[0] + if self.is_data_parallel: + loss = loss.mean() + 
loss.backward() + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + if self.clip_norm: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm) + + self.optimizer.step() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + return {"loss": loss.item()} + + @property + def is_data_parallel(self) -> bool: + return isinstance(self.model, torch.nn.DataParallel) + + def __call__(self, input_ids_batch, attention_mask_batch): + model = self.model.module if hasattr(self.model, "module") else self.model + if self.batch_decode: + input_ids_batch = torch.LongTensor(input_ids_batch).to(self.device) + attention_mask_batch = torch.LongTensor(attention_mask_batch).to(self.device) + input_ = { + "input_ids": input_ids_batch, + "attention_mask": attention_mask_batch, + } + with torch.no_grad(): + answer_ids_batch = model.generate(**input_) + init_answers_batch = self.tokenizer.batch_decode(answer_ids_batch, skip_special_tokens=False) + answers_batch = [] + for answer in init_answers_batch: + for old_tok, new_tok in self.replace_tokens: + answer = answer.replace(old_tok, new_tok) + answers_batch.append(answer) + return answers_batch + else: + answers_batch, scores_batch = [], [] + for input_ids in input_ids_batch: + input_ids = torch.LongTensor([input_ids]).to(self.device) + with torch.no_grad(): + outputs = model.generate( + input_ids, + num_beams=5, + num_return_sequences=self.top_n, + return_dict_in_generate=True, + output_scores=True, + ) + sequences = outputs.sequences + scores = outputs.sequences_scores + scores = scores.cpu().numpy().tolist() + answers = [self.tokenizer.decode(output, skip_special_tokens=False) for output in sequences] + logger.info(f"triplets {answers} scores {scores}") + processed_answers, processed_scores = [], [] + for answer, score in zip(answers, scores): + if score > self.scores_thres: + for old_tok, new_tok in self.replace_tokens: + answer = answer.replace(old_tok, 
new_tok) + processed_answers.append(answer) + processed_scores.append(score) + if self.top_n == 1: + if processed_answers: + answers_batch.append(processed_answers[0]) + scores_batch.append(processed_scores[0]) + else: + answers_batch.append("") + scores_batch.append(0.0) + else: + answers_batch.append(processed_answers) + scores_batch.append(processed_scores) + return answers_batch, scores_batch + + @overrides + def load(self, fname=None): + if fname is not None: + self.load_path = fname + + if self.pretrained_transformer: + logger.info(f"From pretrained {self.pretrained_transformer}.") + config = AutoConfig.from_pretrained( + self.pretrained_transformer, output_attentions=False, output_hidden_states=False + ) + + self.model = T5ForConditionalGeneration.from_pretrained(self.pretrained_transformer, config=config) + + elif self.bert_config_file and Path(self.bert_config_file).is_file(): + self.bert_config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file))) + + if self.attention_probs_keep_prob is not None: + self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob + if self.hidden_keep_prob is not None: + self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob + self.model = T5ForConditionalGeneration(config=self.bert_config) + else: + raise ConfigError("No pre-trained BERT model is given.") + + if self.device.type == "cuda" and torch.cuda.device_count() > 1: + self.model = torch.nn.DataParallel(self.model) + + self.model.to(self.device) + + self.optimizer = getattr(torch.optim, self.optimizer_name)(self.model.parameters(), **self.optimizer_parameters) + + if self.lr_scheduler_name is not None: + self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)( + self.optimizer, **self.lr_scheduler_parameters + ) + + if self.load_path: + logger.info(f"Load path {self.load_path} is given.") + if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir(): + raise ConfigError("Provided 
load path is incorrect!") + + weights_path = Path(self.load_path.resolve()) + weights_path = weights_path.with_suffix(".pth.tar") + if weights_path.exists(): + logger.info(f"Load path {weights_path} exists.") + logger.info(f"Initializing `{self.__class__.__name__}` from saved.") + + # now load the weights, optimizer from saved + logger.info(f"Loading weights from {weights_path}.") + checkpoint = torch.load(weights_path, map_location=self.device) + model_state = checkpoint["model_state_dict"] + optimizer_state = checkpoint["optimizer_state_dict"] + + # load a multi-gpu model on a single device + if not self.is_data_parallel and "module." in list(model_state.keys())[0]: + tmp_model_state = {} + for key, value in model_state.items(): + tmp_model_state[re.sub(r"^module\.", "", key)] = value + model_state = tmp_model_state + + strict_load_flag = bool( + [key for key in checkpoint["model_state_dict"].keys() if key.endswith("embeddings.position_ids")] + ) + self.model.load_state_dict(model_state, strict=strict_load_flag) + self.optimizer.load_state_dict(optimizer_state) + self.epochs_done = checkpoint.get("epochs_done", 0) + else: + logger.info(f"Init from scratch. Load path {weights_path} does not exist.") diff --git a/annotators/property_extraction/src/torch_transformers_preprocessor.py b/annotators/property_extraction/src/torch_transformers_preprocessor.py new file mode 100644 index 0000000000..804a56e29a --- /dev/null +++ b/annotators/property_extraction/src/torch_transformers_preprocessor.py @@ -0,0 +1,79 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
@register("t5_generative_ie_preprocessor")
class T5GenerativeIEPreprocessor(Component):
    """Turn utterances (and, optionally, target triplets) into padded token-id batches.

    Args:
        vocab_file: path to a tokenizer file, or a name accepted by
            ``AutoTokenizer.from_pretrained``.
        do_lower_case: forwarded to the tokenizer constructor.
        max_seq_length: upper bound on the tokenized length of every input utterance.
        return_tokens: stored on the instance; not read anywhere in this class.
        add_special_tokens: extra special tokens to register with the tokenizer.
            ``None`` (the default) registers nothing.
        **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        vocab_file: str,
        do_lower_case: bool = True,
        max_seq_length: int = 512,
        return_tokens: bool = False,
        add_special_tokens: List[str] = None,
        **kwargs,
    ) -> None:
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # NOTE(review): the transformers docs state that AutoTokenizer cannot be
            # instantiated directly, so this branch presumably raises -- confirm and
            # switch to a concrete tokenizer class if local vocab files must be supported.
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
        # Forwarding the default ``None`` would hand the tokenizer a non-list value;
        # only register tokens that were actually supplied.
        if add_special_tokens is not None:
            self.tokenizer.add_special_tokens({"additional_special_tokens": add_special_tokens})

    @staticmethod
    def _pad_in_place(sequences: List[List[int]], target_length: int, pad_value: int = 0) -> None:
        """Right-pad every sequence shorter than ``target_length`` with ``pad_value``."""
        for sequence in sequences:
            # A non-positive difference multiplies to an empty list, i.e. no padding.
            sequence.extend([pad_value] * (target_length - len(sequence)))

    def __call__(self, uttr_batch: List[str], targets_batch: List[List[str]] = None):
        """Tokenize and pad a batch.

        Args:
            uttr_batch: input utterances.
            targets_batch: optional ``(subject, relation, object)`` triplets,
                one per utterance.

        Returns:
            ``(input_ids_batch, attention_mask_batch)`` when ``targets_batch`` is
            ``None``; otherwise ``(input_ids_batch, attention_mask_batch,
            target_ids_batch)``. Within each returned list every row is padded
            with zeros to the same length.
        """
        input_ids_batch, attention_mask_batch = [], []
        for uttr in uttr_batch:
            # Pass max_length explicitly: with truncation=True alone the tokenizer
            # truncates to its own limit, so a smaller max_seq_length would leave
            # over-long rows unpadded and the batch ragged.
            encoding = self.tokenizer.encode_plus(
                text=uttr,
                return_attention_mask=True,
                truncation=True,
                max_length=self.max_seq_length,
            )
            input_ids_batch.append(encoding["input_ids"])
            attention_mask_batch.append(encoding["attention_mask"])

        # default=0 keeps an empty batch from raising inside max().
        longest_input = max((len(ids) for ids in input_ids_batch), default=0)
        max_length = min(longest_input, self.max_seq_length)
        # 0 is used as the padding id -- presumably the tokenizer's pad token id; confirm.
        self._pad_in_place(input_ids_batch, max_length)
        self._pad_in_place(attention_mask_batch, max_length)

        if targets_batch is None:
            return input_ids_batch, attention_mask_batch

        target_ids_batch = []
        for (subj, rel, obj) in targets_batch:
            # NOTE(review): the template has a leading space and no role markers, while
            # the configs register three special tokens -- check whether marker tokens
            # were meant to precede each slot.
            target = f" {subj} {rel} {obj}"
            encoding = self.tokenizer.encode_plus(text=target, return_attention_mask=True, truncation=True)
            target_ids_batch.append(encoding["input_ids"])
        longest_target = max((len(ids) for ids in target_ids_batch), default=0)
        self._pad_in_place(target_ids_batch, longest_target)

        return input_ids_batch, attention_mask_batch, target_ids_batch
"ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/t5_base_generative_ie" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/t5_base_generative_ie.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/annotators/property_extraction/t5_generative_ie_lite_infer.json b/annotators/property_extraction/t5_generative_ie_lite_infer.json new file mode 100644 index 0000000000..43540361b3 --- /dev/null +++ b/annotators/property_extraction/t5_generative_ie_lite_infer.json @@ -0,0 +1,49 @@ +{ + "chainer": { + "in": ["question"], + "pipe": [ + { + "class_name": "src.torch_transformers_preprocessor:T5GenerativeIEPreprocessor", + "vocab_file": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "max_seq_length": 512, + "in": ["question"], + "out": ["input_ids", "attention_mask"] + }, + { + "class_name": "src.t5_generative_ie:T5GenerativeIE", + "pretrained_transformer": "{TRANSFORMER}", + "add_special_tokens": ["", "", ""], + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 3e-05, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "learning_rate_drop_patience": 6, + "learning_rate_drop_div": 1.5, + "in": ["input_ids", "attention_mask"], + "out": ["answer", "score"] + } + ], + "out": ["answer", "score"] + }, + "metadata": { + "variables": { + "TRANSFORMER": "t5-small", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/t5_small_generative_ie" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/tmp/t5_small_generative_ie.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/annotators/property_extraction/test.sh b/annotators/property_extraction/test.sh new file mode 100755 index 0000000000..4088512108 --- /dev/null +++ 
def main():
    """Smoke-test the property extraction service over HTTP.

    Posts each request payload to the ``/respond`` endpoint and compares the
    first element of the JSON response with the expected triplets.

    Raises:
        requests.RequestException: if the request times out, the connection
            fails, or the service answers with an HTTP error status.
        AssertionError: if any response differs from its expected result.
    """
    url = "http://0.0.0.0:8136/respond"
    # Without a timeout a hung service would block the test run indefinitely.
    request_timeout = 60  # seconds

    request_data = [{"utterances": [["i live in moscow"]]}]
    gold_results = [[{"triplet": {"object": "moscow", "relation": "live in citystatecountry", "subject": "user"}}]]

    count = 0
    for data, gold_result in zip(request_data, gold_results):
        response = requests.post(url, json=data, timeout=request_timeout)
        # Fail with the HTTP status instead of an opaque JSON decoding error.
        response.raise_for_status()
        result = response.json()
        if result and result[0] == gold_result:
            count += 1
        else:
            print(f"Got {result}, but expected: {gold_result}")

    assert count == len(request_data), f"{count} of {len(request_data)} requests matched the expected output"
    print("Success")
sentence-ranker:8128, - dff-template-skill:8120" + property-extraction:8136, dff-template-skill:8120" WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-480} HIGH_PRIORITY_INTENTS: 1 RESTRICTION_FOR_SENSITIVE_CASE: 1 @@ -1303,6 +1303,25 @@ services: reservations: memory: 10G + property-extraction: + env_file: [.env] + build: + args: + CONFIG: t5_generative_ie_lite_infer.json + PORT: 8136 + SRC_DIR: annotators/property_extraction/ + context: ./ + dockerfile: annotators/property_extraction/Dockerfile + command: flask run -h 0.0.0.0 -p 8136 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 7G + reservations: + memory: 7G + dff-template-skill: env_file: [ .env ] build: diff --git a/assistant_dists/dream/gpu1.yml b/assistant_dists/dream/gpu1.yml index 7186dca974..9c3b21c7e6 100644 --- a/assistant_dists/dream/gpu1.yml +++ b/assistant_dists/dream/gpu1.yml @@ -203,4 +203,8 @@ services: - CUDA_VISIBLE_DEVICES=9 dff-template-skill: restart: unless-stopped + property-extraction: + restart: unless-stopped + volumes: + - "~/.deeppavlov:/root/.deeppavlov" version: '3.7' diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index 975a725e71..27a760cdcd 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -111,6 +111,20 @@ ], "state_manager_method": "add_annotation_prev_bot_utt" }, + "property_extraction": { + "connector": { + "protocol": "http", + "timeout": 1, + "url": "http://property-extraction:8136/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_last_bot_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "state_manager_method": "add_annotation_prev_bot_utt", + "previous_services": [ + "annotators.spelling_preprocessing", + "annotators.sentseg" + ] + }, "sentrewrite": { "connector": "connectors.sentrewrite", "dialog_formatter": 
"state_formatters.dp_formatters:sent_rewrite_formatter_w_o_last_dialog", @@ -301,6 +315,20 @@ "annotators.entity_linking" ] }, + "property_extraction": { + "connector": { + "protocol": "http", + "timeout": 1, + "url": "http://property-extraction:8136/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:property_extraction_formatter_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "state_manager_method": "add_annotation", + "previous_services": [ + "annotators.spelling_preprocessing", + "annotators.sentseg" + ] + }, "entity_linking": { "connector": { "protocol": "http", @@ -313,7 +341,8 @@ "previous_services": [ "annotators.ner", "annotators.entity_detection", - "annotators.spacy_nounphrases" + "annotators.spacy_nounphrases", + "annotators.property_extraction" ] }, "wiki_parser": { diff --git a/assistant_dists/dream/proxy.yml b/assistant_dists/dream/proxy.yml index 8dea31424f..41669ec7a8 100644 --- a/assistant_dists/dream/proxy.yml +++ b/assistant_dists/dream/proxy.yml @@ -647,4 +647,13 @@ services: environment: - PROXY_PASS=dream.deeppavlov.ai:8127 - PORT=8127 + + property-extraction: + command: [ "nginx", "-g", "daemon off;" ] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=dream.deeppavlov.ai:8136 + - PORT=8136 version: '3.7' diff --git a/assistant_dists/dream/test.yml b/assistant_dists/dream/test.yml index 9fd6835c08..210054babe 100644 --- a/assistant_dists/dream/test.yml +++ b/assistant_dists/dream/test.yml @@ -134,4 +134,7 @@ services: environment: - CUDA_VISIBLE_DEVICES=9 dff-template-skill: + property-extraction: + volumes: + - "~/.deeppavlov:/root/.deeppavlov" version: '3.7' diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 889c1cd3f7..2a84ddf7fe 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -242,6 +242,35 @@ def entity_detection_formatter_dialog(dialog: Dict) -> List[Dict]: 
def property_extraction_formatter_dialog(dialog: Dict) -> List[Dict]:
    """Build the property extraction request for the latest human utterance.

    The dialog is cut to the last bot turn and its utterances are replaced with
    their annotated ("punct_sent") versions. The payload carries the texts of the
    last two utterances, the entities found in the last human utterance, and that
    utterance's "ner" and "entity_linking" annotations (``[{}]`` when absent).
    Every value is wrapped in a one-element list.
    """
    recent = utils.get_last_n_turns(dialog, bot_last_turns=1)
    recent = utils.replace_with_annotated_utterances(recent, mode="punct_sent")

    history_texts = []
    for utterance in recent["utterances"][-2:]:
        history_texts.append(utterance["text"])

    last_human = recent["human_utterances"][-1]
    labelled_entities = get_entities(last_human, only_named=False, with_labels=True)
    annotations = last_human["annotations"]
    linked_entities = annotations.get("entity_linking", [{}])
    ner_entities = annotations.get("ner", [{}])

    payload = {
        "utterances": [history_texts],
        "entities_with_labels": [labelled_entities],
        "named_entities": [ner_entities],
        "entity_info": [linked_entities],
    }
    return [payload]


def property_extraction_formatter_last_bot_dialog(dialog: Dict) -> List[Dict]:
    """Build the property extraction request for the latest bot utterance.

    The payload holds the text of the last bot utterance, or an empty string
    when the bot has not spoken yet, wrapped as a one-utterance history.
    """
    bot_utterances = dialog["bot_utterances"]
    last_bot_text = bot_utterances[-1]["text"] if bot_utterances else ""
    return [{"utterances": [[last_bot_text]]}]
1a76fa9205..532378d7e0 100755 --- a/tests/runtests.sh +++ b/tests/runtests.sh @@ -150,7 +150,7 @@ if [[ "$MODE" == "test_skills" || "$MODE" == "all" ]]; then user-persona-extractor small-talk-skill wiki-facts dff-art-skill dff-funfact-skill \ meta-script-skill spelling-preprocessing dff-gaming-skill dialogpt \ dff-music-skill dff-bot-persona-skill entity-detection midas-predictor \ - sentence-ranker relative-persona-extractor seq2seq-persona-based; do + sentence-ranker relative-persona-extractor seq2seq-persona-based property-extraction; do echo "Run tests for $container" dockercompose_cmd exec -T -u $(id -u) $container ./test.sh