diff --git a/annotators/entity_detection/requirements.txt b/annotators/entity_detection/requirements.txt
index 9f6f40c586..5414666494 100644
--- a/annotators/entity_detection/requirements.txt
+++ b/annotators/entity_detection/requirements.txt
@@ -1,9 +1,10 @@
+pyopenssl==22.0.0
 Flask==1.1.1
 itsdangerous==2.0.1
 nltk==3.2.5
 numpy==1.15.4
 gunicorn==19.9.0
-requests==2.22.0
+requests==2.27.1
 jinja2<=3.0.3
 Werkzeug<=2.0.3
 sentry-sdk==0.12.3
diff --git a/annotators/kbqa/kbqa_cq_mt_bert_lite.json b/annotators/kbqa/kbqa_cq_mt_bert_lite.json
index 4d85043098..fb3032a37f 100644
--- a/annotators/kbqa/kbqa_cq_mt_bert_lite.json
+++ b/annotators/kbqa/kbqa_cq_mt_bert_lite.json
@@ -38,7 +38,7 @@
     {
       "class_name": "rel_ranking_infer",
       "id": "rel_r_inf",
-      "ranker": {"config_path": "{CONFIGS_PATH}/classifiers/rel_ranking_bert_lite.json"},
+      "ranker": {"config_path": "/src/rel_ranking_bert_en.json"},
       "load_path": "{DOWNLOADS_PATH}/wikidata_eng",
       "rel_q2name_filename": "wiki_dict_properties.pickle",
       "rels_to_leave": 40
diff --git a/annotators/kbqa/rel_ranking_bert_en.json b/annotators/kbqa/rel_ranking_bert_en.json
new file mode 100644
index 0000000000..0d53023321
--- /dev/null
+++ b/annotators/kbqa/rel_ranking_bert_en.json
@@ -0,0 +1,115 @@
+{
+  "dataset_reader": {
+    "class_name": "sq_reader",
+    "data_path": "{DOWNLOADS_PATH}/rel_ranking_eng/lcquad_rel_ranking.pickle"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42
+  },
+  "chainer": {
+    "in": ["question", "rel_list"],
+    "in_y": ["y"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor:RelRankingPreprocessor",
+        "vocab_file": "{TRANSFORMER}",
+        "do_lower_case": true,
+        "max_seq_length": 64,
+        "add_special_tokens": ["", "", ""],
+        "in": ["question", "rel_list"],
+        "out": ["bert_features"]
+      },
+      {
+        "id": "classes_vocab",
+        "class_name": "simple_vocab",
+        "fit_on": ["y"],
+        "save_path": "{MODEL_PATH}/classes.dict",
+        "load_path": "{MODEL_PATH}/classes.dict",
+        "in": ["y"],
+        "out": ["y_ids"]
+      },
+      {
+        "in": ["y_ids"],
+        "out": ["y_onehot"],
+        "class_name": "one_hotter",
+        "depth": "#classes_vocab.len",
+        "single_vector": true
+      },
+      {
+        "class_name": "torch_transformers_classifier:TorchTransformersClassifierModel",
+        "n_classes": "#classes_vocab.len",
+        "return_probas": true,
+        "num_special_tokens": 3,
+        "pretrained_bert": "{TRANSFORMER}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 1e-05
+        },
+        "learning_rate_drop_patience": 5,
+        "learning_rate_drop_div": 2.0,
+        "in": ["bert_features"],
+        "in_y": ["y_ids"],
+        "out": ["y_pred_probas"]
+      },
+      {
+        "in": ["y_pred_probas"],
+        "out": ["y_pred_ids"],
+        "class_name": "proba2labels",
+        "max_proba": true
+      },
+      {
+        "in": ["y_pred_ids"],
+        "out": ["y_pred_labels"],
+        "ref": "classes_vocab"
+      }
+    ],
+    "out": ["y_pred_probas"]
+  },
+  "train": {
+    "epochs": 3,
+    "batch_size": 30,
+    "metrics": [
+      {
+        "name": "roc_auc",
+        "inputs": [
+          "y_onehot",
+          "y_pred_probas"
+        ]
+      },
+      "accuracy",
+      "f1_macro"
+    ],
+    "validation_patience": 5,
+    "val_every_n_batches": 100,
+    "log_every_n_batches": 100,
+    "show_examples": false,
+    "evaluation_targets": [
+      "train",
+      "valid",
+      "test"
+    ],
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "TRANSFORMER": "haisongzhang/roberta-tiny-cased",
+      "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_bert_eng_torch"
"{MODELS_PATH}/classifiers/rel_ranking_bert_eng_torch" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/kbqa/wikidata/rel_ranking_bert_eng_torch.tar.gz", + "subdir": "{MODELS_PATH}/classifiers/rel_ranking_bert_eng_torch" + }, + { + "url": "http://files.deeppavlov.ai/kbqa/wikidata/lcquad_rel_ranking.pickle", + "subdir": "{DOWNLOADS_PATH}/rel_ranking_eng" + } + ] + } +} diff --git a/annotators/kbqa/requirements.txt b/annotators/kbqa/requirements.txt index 8955157e0d..8ff7ea67b3 100644 --- a/annotators/kbqa/requirements.txt +++ b/annotators/kbqa/requirements.txt @@ -1,8 +1,11 @@ +pyopenssl==22.0.0 sentry-sdk[flask]==0.14.1 flask==1.1.1 itsdangerous==2.0.1 gunicorn==19.9.0 -requests==2.22.0 +requests==2.27.1 jinja2<=3.0.3 Werkzeug<=2.0.3 click==7.1.2 +torch==1.6.0 +transformers==4.6.0 diff --git a/annotators/kbqa/server.py b/annotators/kbqa/server.py index c400558643..121557e877 100644 --- a/annotators/kbqa/server.py +++ b/annotators/kbqa/server.py @@ -54,6 +54,7 @@ def respond(): kbqa_input = [sanitized_questions, sanitized_questions, template_types, sanitized_entities, entity_types] else: kbqa_input = [sanitized_questions] + logger.info(f"kbqa_input: {kbqa_input}") default_resp = {"qa_system": "kbqa", "answer": "", "confidence": 0.0} out_res = [default_resp for _ in questions] try: @@ -71,7 +72,7 @@ def respond(): answer, conf = res[cnt_fnd] out_res.append({"qa_system": "kbqa", "answer": answer, "confidence": float(conf)}) cnt_fnd += 1 - logger.info(f"kbqa exec time: {time.time() - st_time}") + logger.info(f"kbqa exec time: {time.time() - st_time} out_res {out_res}") except Exception as e: sentry_sdk.capture_exception(e) logger.exception(e) diff --git a/annotators/kbqa/torch_transformers_classifier.py b/annotators/kbqa/torch_transformers_classifier.py new file mode 100644 index 0000000000..560e6b7e1d --- /dev/null +++ b/annotators/kbqa/torch_transformers_classifier.py @@ -0,0 +1,377 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from logging import getLogger +from pathlib import Path +from typing import List, Dict, Union, Optional, Tuple + +import numpy as np +import torch +from overrides import overrides +from torch.nn import BCEWithLogitsLoss +from transformers import AutoModelForSequenceClassification, AutoConfig, AutoModel, AutoTokenizer +from transformers.modeling_outputs import SequenceClassifierOutput + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.torch_model import TorchModel + +log = getLogger(__name__) + + +@register("torch_transformers_classifier") +class TorchTransformersClassifierModel(TorchModel): + """Bert-based model for text classification on PyTorch. + + It uses output from [CLS] token and predicts labels using linear transformation. 
+
+    Args:
+        n_classes: number of classes
+        pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased")
+        one_hot_labels: set True if one-hot encoding for labels is used
+        multilabel: set True if it is multi-label classification
+        return_probas: set True to return class probabilities instead of the most probable label
+        attention_probs_keep_prob: keep_prob for Bert self-attention layers
+        hidden_keep_prob: keep_prob for Bert hidden layers
+        optimizer: optimizer name from `torch.optim`
+        optimizer_parameters: dictionary with optimizer's parameters,
+            e.g. {'lr': 0.1, 'weight_decay': 0.001, 'momentum': 0.9}
+        clip_norm: clip gradients by norm coefficient
+        bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title)
+        is_binary: whether the classification task is binary or multi-class
+        num_special_tokens: number of special tokens used by the classification model
+    """
+
+    def __init__(
+        self,
+        n_classes,
+        pretrained_bert,
+        one_hot_labels: bool = False,
+        multilabel: bool = False,
+        return_probas: bool = False,
+        attention_probs_keep_prob: Optional[float] = None,
+        hidden_keep_prob: Optional[float] = None,
+        optimizer: str = "AdamW",
+        optimizer_parameters: Optional[dict] = None,
+        clip_norm: Optional[float] = None,
+        bert_config_file: Optional[str] = None,
+        is_binary: Optional[bool] = False,
+        num_special_tokens: Optional[int] = None,
+        **kwargs,
+    ) -> None:
+
+        if not optimizer_parameters:
+            optimizer_parameters = {"lr": 1e-3, "weight_decay": 0.01, "betas": (0.9, 0.999), "eps": 1e-6}
+
+        self.return_probas = return_probas
+        self.one_hot_labels = one_hot_labels
+        self.multilabel = multilabel
+        self.pretrained_bert = pretrained_bert
+        self.bert_config_file = bert_config_file
+        self.attention_probs_keep_prob = attention_probs_keep_prob
+        self.hidden_keep_prob = hidden_keep_prob
+        self.n_classes = n_classes
+        self.clip_norm = clip_norm
+        self.is_binary = is_binary
+        self.bert_config = None
+        self.num_special_tokens = num_special_tokens
+
+        if self.multilabel and not self.one_hot_labels:
+            raise RuntimeError("Use one-hot encoded labels for multilabel classification!")
+
+        if self.multilabel and not self.return_probas:
+            raise RuntimeError("Set return_probas to True for multilabel classification!")
+
+        if self.return_probas and self.n_classes == 1:
+            raise RuntimeError("Set return_probas to False for regression task!")
+
+        super().__init__(optimizer=optimizer, optimizer_parameters=optimizer_parameters, **kwargs)
+
+    def train_on_batch(self, features: Dict[str, torch.Tensor], y: Union[List[int], List[List[int]]]) -> Dict:
+        """Train model on given batch.
+        This method calls train_op using features and y (labels).
+
+        Args:
+            features: batch of InputFeatures
+            y: batch of labels (class id or one-hot encoding)
+
+        Returns:
+            dict with loss and learning_rate values
+        """
+
+        _input = {key: value.to(self.device) for key, value in features.items()}
+
+        if self.n_classes > 1 and not self.is_binary:
+            _input["labels"] = torch.from_numpy(np.array(y)).to(self.device)
+
+        # regression
+        else:
+            _input["labels"] = torch.from_numpy(np.array(y, dtype=np.float32)).unsqueeze(1).to(self.device)
+
+        self.optimizer.zero_grad()
+
+        # keep only the keys that the model's forward() actually accepts
+        tokenized = {key: value for (key, value) in _input.items() if key in self.accepted_keys}
+
+        loss = self.model(**tokenized).loss
+        if self.is_data_parallel:
+            loss = loss.mean()
+        loss.backward()
+        # Clip the norm of the gradients to `clip_norm`.
+        # This is to help prevent the "exploding gradients" problem.
+        if self.clip_norm:
+            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm)
+
+        self.optimizer.step()
+        if self.lr_scheduler is not None:
+            self.lr_scheduler.step()
+
+        return {"loss": loss.item()}
+
+    def __call__(self, features: Dict[str, torch.Tensor]) -> Union[List[int], List[List[float]]]:
+        """Make prediction for given features (texts).
+
+        Args:
+            features: batch of InputFeatures
+
+        Returns:
+            predicted classes or probabilities of each class
+        """
+
+        _input = {key: value.to(self.device) for key, value in features.items()}
+
+        with torch.no_grad():
+            tokenized = {key: value for (key, value) in _input.items() if key in self.accepted_keys}
+
+            # Forward pass, calculate logit predictions
+            logits = self.model(**tokenized)
+            logits = logits[0]
+
+        if self.return_probas:
+            if self.is_binary:
+                pred = torch.sigmoid(logits).squeeze(1)
+            elif not self.multilabel:
+                pred = torch.nn.functional.softmax(logits, dim=-1)
+            else:
+                pred = torch.sigmoid(logits)
+            pred = pred.detach().cpu().numpy()
+        elif self.n_classes > 1:
+            logits = logits.detach().cpu().numpy()
+            pred = np.argmax(logits, axis=1)
+        # regression
+        else:
+            pred = logits.squeeze(-1).detach().cpu().numpy()
+
+        return pred
+
+    # TODO move to the super class
+    @property
+    def accepted_keys(self) -> Tuple[str, ...]:
+        # names of the arguments accepted by the underlying model's forward()
+        if self.is_data_parallel:
+            accepted_keys = self.model.module.forward.__code__.co_varnames
+        else:
+            accepted_keys = self.model.forward.__code__.co_varnames
+        return accepted_keys
+
+    # TODO move to the super class
+    @property
+    def is_data_parallel(self) -> bool:
+        return isinstance(self.model, torch.nn.DataParallel)
+
+    # TODO this method requires massive refactoring
+    @overrides
+    def load(self, fname=None):
+        if fname is not None:
+            self.load_path = fname
+
+        if self.pretrained_bert:
+            log.info(f"From pretrained {self.pretrained_bert}.")
+            config = AutoConfig.from_pretrained(
+                self.pretrained_bert,
+                # num_labels=self.n_classes,
+                output_attentions=False,
+                output_hidden_states=False,
+            )
+
+            if self.is_binary:
+                config.add_pooling_layer = False
+                self.model = AutoModelForBinaryClassification(self.pretrained_bert, config)
+            else:
+                self.model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_bert, config=config)
+
+            # TODO need a better solution here and at
+            # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load
+            try:
+                hidden_size = self.model.classifier.out_proj.in_features
+
+                if self.n_classes != self.model.num_labels:
+                    self.model.classifier.out_proj.weight = torch.nn.Parameter(
+                        torch.randn(self.n_classes, hidden_size)
+                    )
+                    self.model.classifier.out_proj.bias = torch.nn.Parameter(torch.randn(self.n_classes))
+                    self.model.classifier.out_proj.out_features = self.n_classes
+                    self.model.num_labels = self.n_classes
+
+            except AttributeError:
+                hidden_size = self.model.classifier.in_features
+
+                if self.n_classes != self.model.num_labels:
+                    self.model.classifier.weight = torch.nn.Parameter(torch.randn(self.n_classes, hidden_size))
+                    self.model.classifier.bias = torch.nn.Parameter(torch.randn(self.n_classes))
+                    self.model.classifier.out_features = self.n_classes
+                    self.model.num_labels = self.n_classes
+
+        elif self.bert_config_file and Path(self.bert_config_file).is_file():
+            self.bert_config = AutoConfig.from_pretrained(str(expand_path(self.bert_config_file)))
+            if self.attention_probs_keep_prob is not None:
+                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
+            if self.hidden_keep_prob is not None:
+                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
+            self.model = AutoModelForSequenceClassification.from_config(config=self.bert_config)
+        else:
+            raise ConfigError("No pre-trained BERT model is given.")
+
+        tokenizer = AutoTokenizer.from_pretrained(self.pretrained_bert)
+        if self.num_special_tokens:
+            self.model.resize_token_embeddings(len(tokenizer) + self.num_special_tokens)
+
+        # TODO that should probably be parametrized in config
+        if self.device.type == "cuda" and torch.cuda.device_count() > 1:
+            self.model = torch.nn.DataParallel(self.model)
+
+        self.model.to(self.device)
+
+        self.optimizer = getattr(torch.optim, self.optimizer_name)(self.model.parameters(), **self.optimizer_parameters)
+        if self.lr_scheduler_name is not None:
+            self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
+                self.optimizer, **self.lr_scheduler_parameters
+            )
+
+        if self.load_path:
+            log.info(f"Load path {self.load_path} is given.")
+            if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
+                raise ConfigError("Provided load path is incorrect!")
+
+            weights_path = Path(self.load_path.resolve())
+            weights_path = weights_path.with_suffix(".pth.tar")
+            if weights_path.exists():
+                log.info(f"Load path {weights_path} exists.")
+                log.info(f"Initializing `{self.__class__.__name__}` from saved.")
+
+                # now load the weights, optimizer from saved
+                log.info(f"Loading weights from {weights_path}.")
+                checkpoint = torch.load(weights_path, map_location=self.device)
+                model_state = checkpoint["model_state_dict"]
+                optimizer_state = checkpoint["optimizer_state_dict"]
+
+                # load a multi-gpu model on a single device
+                if not self.is_data_parallel and "module." in list(model_state.keys())[0]:
+                    tmp_model_state = {}
+                    for key, value in model_state.items():
+                        tmp_model_state[re.sub("module.", "", key)] = value
+                    model_state = tmp_model_state
+
+                # set strict flag to False if position_ids are missing
+                # this is needed to load models trained on older versions
+                # of transformers library
+                strict_load_flag = bool(
+                    [key for key in checkpoint["model_state_dict"].keys() if key.endswith("embeddings.position_ids")]
+                )
+                self.model.load_state_dict(model_state, strict=strict_load_flag)
+                self.optimizer.load_state_dict(optimizer_state)
+                self.epochs_done = checkpoint.get("epochs_done", 0)
+            else:
+                log.info(f"Init from scratch. Load path {weights_path} does not exist.")
+
+
+class AutoModelForBinaryClassification(torch.nn.Module):
+    def __init__(self, pretrained_bert, config):
+        super().__init__()
+        self.pretrained_bert = pretrained_bert
+        self.config = config
+
+        self.model = AutoModel.from_pretrained(self.pretrained_bert, config=self.config)
+        self.classifier = BinaryClassificationHead(config)
+
+        self.classifier.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = BCEWithLogitsLoss()
+            loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+class BinaryClassificationHead(torch.nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = torch.nn.Linear(config.hidden_size, 1)
+
+    def init_weights(self):
+        self.dense.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        if self.dense.bias is not None:
+            self.dense.bias.data.zero_()
+
+    def forward(self, features, **kwargs):
+        # take the representation of the first ([CLS]) token
+        x = features[:, 0, :]
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
diff --git a/annotators/kbqa/torch_transformers_preprocessor.py b/annotators/kbqa/torch_transformers_preprocessor.py
new file mode 100644
index 0000000000..39c95c8655
--- /dev/null
+++ b/annotators/kbqa/torch_transformers_preprocessor.py
@@ -0,0 +1,82 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
+from typing import List, Dict
+
+import torch
+from transformers import AutoTokenizer
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.models.component import Component
+
+log = getLogger(__name__)
+
+
+@register("rel_ranking_preprocessor")
+class RelRankingPreprocessor(Component):
+    def __init__(
+        self,
+        vocab_file: str,
+        add_special_tokens: List[str],
+        do_lower_case: bool = True,
+        max_seq_length: int = 512,
+        return_tokens: bool = False,
+        **kwargs,
+    ) -> None:
+        self.max_seq_length = max_seq_length
+        self.return_tokens = return_tokens
+        self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
+        self.add_special_tokens = add_special_tokens
+
+    def __call__(self, questions_batch: List[str], rels_batch: List[List[str]] = None) -> Dict[str, torch.Tensor]:
+        # first pass: encode each (question, relations) pair to find the longest
+        # sequence, so the batch can be padded to its own maximum length
+        lengths = []
+        for question, rels_list in zip(questions_batch, rels_batch):
+            if isinstance(rels_list, list):
+                rels_str = self.add_special_tokens[2].join(rels_list)
+            else:
+                rels_str = rels_list
+            text_input = f"{self.add_special_tokens[0]} {question} {self.add_special_tokens[1]} {rels_str}"
+            encoding = self.tokenizer.encode_plus(
+                text=text_input, return_attention_mask=True, add_special_tokens=True, truncation=True
+            )
+            lengths.append(len(encoding["input_ids"]))
+        max_len = max(lengths)
+        # second pass: encode again, padding every sample to max_len
+        input_ids_batch = []
+        attention_mask_batch = []
+        token_type_ids_batch = []
+        for question, rels_list in zip(questions_batch, rels_batch):
+            if isinstance(rels_list, list):
+                rels_str = self.add_special_tokens[2].join(rels_list)
+            else:
+                rels_str = rels_list
+            text_input = f"{self.add_special_tokens[0]} {question} {self.add_special_tokens[1]} {rels_str}"
+            encoding = self.tokenizer.encode_plus(
+                text=text_input, truncation=True, max_length=max_len, pad_to_max_length=True, return_attention_mask=True
+            )
+            input_ids_batch.append(encoding["input_ids"])
+            attention_mask_batch.append(encoding["attention_mask"])
+            if "token_type_ids" in encoding:
+                token_type_ids_batch.append(encoding["token_type_ids"])
+            else:
+                token_type_ids_batch.append([0])
+
+        input_features = {
+            "input_ids": torch.LongTensor(input_ids_batch),
+            "attention_mask": torch.LongTensor(attention_mask_batch),
+            "token_type_ids": torch.LongTensor(token_type_ids_batch),
+        }
+
+        return input_features
diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml
index f2f0af226a..29fddd5b3f 100644
--- a/assistant_dists/dream/docker-compose.override.yml
+++ b/assistant_dists/dream/docker-compose.override.yml
@@ -550,7 +550,7 @@ services:
         CONFIG: kbqa_cq_mt_bert_lite.json
         PORT: 8072
         SRC_DIR: annotators/kbqa/
-        COMMIT: 83fa40d7f600137910c663bda9a4d81fa425fe12
+        COMMIT: 47adffa799cc8b04fa9af7b8842e9f5d76bfe2dc
      context: ./
      dockerfile: annotators/kbqa/Dockerfile
    command: flask run -h 0.0.0.0 -p 8072
@@ -1041,7 +1041,7 @@ services:
         CONFIG: dialog_entity_detection.json
         PORT: 8103
         SRC_DIR: annotators/entity_detection/
-        COMMIT: 5d27dca3dfa0cf481324facd73f2e02f579f66b3
+        COMMIT: 10f6e39e71f0623ec4ad9fb27ae8739d8624be85
      context: ./
      dockerfile: annotators/entity_detection/Dockerfile_new
    command: flask run -h 0.0.0.0 -p 8103
diff --git a/skill_selectors/rule_based_selector/connector.py b/skill_selectors/rule_based_selector/connector.py
index 2f80dd668e..c6abc455f5 100644
--- a/skill_selectors/rule_based_selector/connector.py
+++ b/skill_selectors/rule_based_selector/connector.py
@@ -13,7 +13,14 @@ from common.sensitive import is_sensitive_topic_and_request
 from common.skills_turn_on_topics_and_patterns import turn_on_skills
 from common.universal_templates import if_chat_about_particular_topic, if_choose_topic, GREETING_QUESTIONS_TEXTS
-from common.utils import high_priority_intents, low_priority_intents, get_topics, get_intents, get_named_locations
+from common.utils import (
+    high_priority_intents,
+    low_priority_intents,
+    get_topics,
+    get_intents,
+    get_named_locations,
+    get_factoid,
+)
 from common.weather import if_special_weather_turn_on
 from common.wiki_skill import if_switch_wiki_skill, switch_wiki_skill_on_news, if_switch_test_skill
 from common.response_selection import UNPREDICTABLE_SKILLS
@@ -48,8 +55,8 @@ async def send(self, payload: Dict, callback: Callable):
         cobot_dialogact_topics = set(get_topics(user_uttr, which="cobot_dialogact_topics"))
         cobot_topics = set(get_topics(user_uttr, which="cobot_topics"))
 
-        is_factoid = user_uttr_annotations.get("factoid_classification", {}).get("factoid", 0.0) > 0.9
-
+        factoid_conf = get_factoid(user_uttr)
+        is_factoid = factoid_conf.get("is_factoid", 0.0) > 0.96
         is_celebrity_mentioned = check_is_celebrity_mentioned(user_uttr)
 
         prev_user_uttr_hyp = (
diff --git a/skills/factoid_qa/requirements.txt b/skills/factoid_qa/requirements.txt
index ae59d3c27e..2d0bc56221 100644
--- a/skills/factoid_qa/requirements.txt
+++ b/skills/factoid_qa/requirements.txt
@@ -1,10 +1,11 @@
+pyopenssl==22.0.0
 flask==1.1.1
 itsdangerous==2.0.1
 gunicorn==19.9.0
-requests==2.22.0
+requests==2.27.1
 numpy==1.17.2
 sentry-sdk==0.14.2
 spacy==2.2.3
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5
 jinja2<=3.0.3
-Werkzeug<=2.0.3
\ No newline at end of file
+Werkzeug<=2.0.3
diff --git a/skills/factoid_qa/server.py b/skills/factoid_qa/server.py
index c8d2776b43..c81d44e3ca 100644
--- a/skills/factoid_qa/server.py
+++ b/skills/factoid_qa/server.py
@@ -14,7 +14,7 @@
 
 from common.factoid import DONT_KNOW_ANSWER, FACTOID_NOTSURE_CONFIDENCE
 from common.universal_templates import if_chat_about_particular_topic
-from common.utils import get_entities
+from common.utils import get_entities, get_factoid
 
 sentry_sdk.init(getenv("SENTRY_DSN"))
@@ -34,7 +34,6 @@
 
 fact_dict = json.load(open("fact_dict.json", "r"))
 use_random_facts = False
-decrease_coef = 0.95
 
 nlp = spacy.load("en_core_web_sm")
@@ -250,8 +249,9 @@ def respond():
             names = [j for j in names + probable_subjects if j in fact_dict.keys()]
             names = list(set(names))
             nounphrases = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=False)
-            is_factoid_class = uttr["annotations"].get("factoid_classification", {}).get("factoid", 0)
-            is_factoid = is_factoid_class and (names or nounphrases) and check_factoid(last_phrase)
+            factoid_conf = get_factoid(uttr)
+            is_factoid_cls = factoid_conf.get("is_factoid", 0.0) > 0.9
+            is_factoid = is_factoid_cls and (names or nounphrases) and check_factoid(last_phrase)
             is_factoid_sents.append(is_factoid)
             ner_outputs_to_classify.append(names)
@@ -289,9 +289,9 @@ def respond():
         text_qa_response_batch = [{"answer": "", "answer_sentence": "", "confidence": 0.0} for _ in dialogs_batch]
         resp = requests.post(TEXT_QA_URL, json={"question_raw": questions_batch, "top_facts": facts_batch}, timeout=0.5)
         if resp.status_code != 200:
-            logger.info(f"API Error: Text QA inaccessible")
+            logger.info("API Error: Text QA inaccessible")
         else:
-            logger.info(f"Query against Text QA succeeded")
+            logger.info("Query against Text QA succeeded")
             text_qa_resp = resp.json()
             text_qa_response_batch = []
             cnt_fnd = 0
@@ -325,6 +325,7 @@ def respond():
         else:
             curr_uttr_rewritten = curr_ann_uttr["text"]
         is_question = "?" in curr_uttr_rewritten
+        logger.info(f"is_factoid {is_factoid} tell_me_about {tell_me_about_intent} is_question {is_question}")
         if is_factoid and (tell_me_about_intent or is_question):
             logger.info("Question is classified as factoid. Querying KBQA and ODQA.")
             print("Question is classified as factoid. Querying KBQA and ODQA...", flush=True)
@@ -363,7 +364,8 @@
             response = ""
             confidence = 0.0
 
-        confidence = confidence * decrease_coef
+        if confidence == 1.0:
+            confidence = 0.99
         responses.append(response)
         confidences.append(confidence)
         attributes.append(attr)
diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py
index e61bb3d43a..dfa2c46815 100755
--- a/state_formatters/dp_formatters.py
+++ b/state_formatters/dp_formatters.py
@@ -557,16 +557,12 @@ def kbqa_formatter_dialog(dialog: Dict):
     entity_substr = get_entities(dialog["human_utterances"][-1], only_named=True, with_labels=False)
     nounphrases = get_entities(dialog["human_utterances"][-1], only_named=False, with_labels=False)
     entities = []
-    for n, entities_list in enumerate(entity_substr):
-        if entities_list:
-            entities.append([entities_list[0]])
-        elif nounphrases and len(nounphrases) > n:
-            entities.append(nounphrases[n])
-        else:
-            entities.append([])
-    if not entities:
-        entities = [[] for _ in sentences]
-    entities = entities[: len(sentences)]
+    if entity_substr:
+        entities = [entity_substr]
+    elif nounphrases:
+        entities = [nounphrases]
+    else:
+        entities = [[]]
 
     return [{"x_init": sentences, "entities": entities}]
diff --git a/utils/analyze_downloads.py b/utils/analyze_downloads.py
index 776f357808..8c214de5da 100644
--- a/utils/analyze_downloads.py
+++ b/utils/analyze_downloads.py
@@ -27,6 +27,11 @@
         repo.git.checkout(commit)
         config_path = Path(service_args["build"]["args"]["SRC_DIR"]) / service_args["build"]["args"]["CONFIG"]
         try:
+            if service_name == "kbqa":
+                with open(config_path) as fin:
+                    lines = fin.readlines()
+                with open(config_path, "w") as fout:
+                    fout.writelines([line.replace('"/src/', '"annotators/kbqa/') for line in lines])
             config_downloads = dict(get_configs_downloads(config_path))
             for url, paths in config_downloads.items():
                 md5_url = path_set_md5(url)