diff --git a/annotators/ConveRTBasedNLI/Dockerfile b/annotators/ConveRTBasedNLI/Dockerfile new file mode 100644 index 0000000000..8257b6e737 --- /dev/null +++ b/annotators/ConveRTBasedNLI/Dockerfile @@ -0,0 +1,25 @@ +FROM python:3.9.16-slim + +ARG CONVERT_URL=http://files.deeppavlov.ai/tmp/convert_model.tar.gz +ARG NLI_URL=http://files.deeppavlov.ai/tmp/nli_model.tar.gz +ARG TRAINED_MODEL_PATH +ARG SERVICE_PORT + +ENV TRAINED_MODEL_PATH ${TRAINED_MODEL_PATH} +ENV SERVICE_PORT ${SERVICE_PORT} + +RUN apt-get update && \ + apt-get install -y --allow-unauthenticated wget && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt /src/requirements.txt +RUN pip install -r /src/requirements.txt +COPY . /src +WORKDIR /src + +RUN mkdir /cache /data /data/nli_model/ /data/convert_model/ +RUN wget -c -q $NLI_URL -P /tmp/ && \ + tar -xf /tmp/nli_model.tar.gz -C /data/nli_model/ && \ + wget -c -q $CONVERT_URL -P /tmp/ && \ + tar -xf /tmp/convert_model.tar.gz -C /data/convert_model/ && \ + rm -rf /tmp/ \ No newline at end of file diff --git a/annotators/ConveRTBasedNLI/README.md b/annotators/ConveRTBasedNLI/README.md new file mode 100644 index 0000000000..5fa6bf60e4 --- /dev/null +++ b/annotators/ConveRTBasedNLI/README.md @@ -0,0 +1,11 @@ +This model is designed to solve the Natural Language Inference problem. + +It consists of two parts: +* [ConveRT model](https://arxiv.org/abs/1911.03688) that vectorizes the data +* Custom model consisting of 4 linear layers + +The model was trained on the **Stanford Natural Language Inference** (SNLI) corpus, which contains human-written English sentence pairs with the labels entailment, contradiction, and neutral. + +A pre-trained model is available [here](http://files.deeppavlov.ai/tmp/nli_model.tar.gz). + +To train the model from scratch, omit the TRAINED_MODEL_PATH build argument or set it to _None_. 
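For reference, a minimal usage sketch (not part of the diff): assuming the annotator container is running locally on its default port 8150, the endpoint, payload keys, and response shape below follow the `server.py` and `test.py` files added later in this PR.

```python
# Illustrative only: query the convert-based-nli annotator over HTTP.
# Endpoint, port, and payload keys follow annotators/ConveRTBasedNLI/server.py and test.py.
import requests

payload = {
    # candidate bot responses (hypotheses) to be checked
    "sentences": ["I love dogs", "It's going to be sunny today"],
    # for every candidate, the previous bot utterances it is compared against
    "last_bot_utterances": [["I hate dogs"], []],
}

response = requests.post("http://0.0.0.0:8150/batch_model", json=payload, timeout=10)
# The service replies with [{"batch": [...]}]: one dict per candidate containing
# a "decision" plus probabilities for entailment, neutral, and contradiction.
for candidate_result in response.json()[0]["batch"]:
    print(candidate_result["decision"], candidate_result)
```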
diff --git a/annotators/ConveRTBasedNLI/convert_annotator.py b/annotators/ConveRTBasedNLI/convert_annotator.py new file mode 100644 index 0000000000..b65724919a --- /dev/null +++ b/annotators/ConveRTBasedNLI/convert_annotator.py @@ -0,0 +1,243 @@ +import os +import logging +import numpy as np +import random + +from encoder import Encoder +import tensorflow as tf +import tensorflow_datasets as tfds + + +seed = 1 +os.environ["PYTHONHASHSEED"] = str(seed) +random.seed(seed) +tf.random.set_seed(seed) +np.random.seed(seed) + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) +logger = logging.getLogger(__name__) + +TRAINED_MODEL_PATH = os.environ.get("TRAINED_MODEL_PATH", None) + + +def data_generation(file_path): + premise = np.load(file_path)["arr_0"][0] + hypothesis = np.load(file_path)["arr_0"][1] + label = np.load(file_path)["arr_1"] + label = label.reshape((len(label), 1)) + return premise, hypothesis, label + + +class DataGenerator(tf.compat.v2.keras.utils.Sequence): + def __init__(self, list_examples, shuffle=False): + self.list_examples = list_examples + self.shuffle = shuffle + self.indexes = None + self.on_epoch_end() + + def __len__(self): + return len(self.list_examples) + + def __getitem__(self, index): + pos = self.indexes[index] + premise, hypothesis, label = data_generation(self.list_examples[pos]) + + return [premise, hypothesis], label + + def on_epoch_end(self): + self.indexes = np.arange(len(self.list_examples)) + if self.shuffle: + np.random.shuffle(self.indexes) + + +class ConveRTAnnotator: + def __init__(self): + self.encoder = Encoder() + self.model = None + + if TRAINED_MODEL_PATH: + self.model_path = TRAINED_MODEL_PATH + "/model.h5" + else: + self.batch_size = 1024 + self.__prepare_data() + self.__create_model() + self.__train_model() + + def __prepare_data(self): + logger.info("The download of SNLI dataset has begun.") + snli_dataset = tfds.text.Snli() + snli_dataset.download_and_prepare(download_dir="/cache") + + datasets = snli_dataset.as_dataset() + train_dataset, test_dataset, val_dataset = ( + datasets["train"], + datasets["test"], + datasets["validation"], + ) + val_dataset = val_dataset.batch(self.batch_size).prefetch( + tf.data.experimental.AUTOTUNE + ) + test_dataset = test_dataset.batch(self.batch_size).prefetch( + tf.data.experimental.AUTOTUNE + ) + train_dataset = train_dataset.batch(self.batch_size).prefetch( + tf.data.experimental.AUTOTUNE + ) + + logger.info("Dataset downloaded.") + + common_path = "/cache/data" + val_path = common_path + "/validation/" + test_path = common_path + "/test/" + train_path = common_path + "/train/" + if not os.path.exists(val_path): + os.makedirs(val_path) + if not os.path.exists(test_path): + os.makedirs(test_path) + if not os.path.exists(train_path): + os.makedirs(train_path) + + logger.info("Started making validation dataset.") + self.__vectorize_data(val_path + "val_", val_dataset) + logger.info("Started making test dataset.") + self.__vectorize_data(test_path + "test_", test_dataset) + logger.info("Started making train dataset.") + self.__vectorize_data(train_path + "train_", train_dataset) + + train_examples = os.listdir(train_path) + train_examples = [train_path + f_name for f_name in train_examples] + test_examples = os.listdir(test_path) + test_examples = [test_path + f_name for f_name in test_examples] + val_examples = os.listdir(val_path) + val_examples = [val_path + f_name for f_name in val_examples] + + self.train_generator = 
DataGenerator(train_examples) + self.test_generator = DataGenerator(test_examples) + self.val_generator = DataGenerator(val_examples) + + logger.info("All datasets have been created.") + + def __vectorize_data(self, data_path, dataset): + counter = 0 + for example in tfds.as_numpy(dataset): + counter += 1 + premise, hypothesis, label = ( + example["premise"], + example["hypothesis"], + example["label"], + ) + + useless_pos = np.where(label == -1)[0] + premise = np.delete(premise, useless_pos) + hypothesis = np.delete(hypothesis, useless_pos) + label = np.delete(label, useless_pos) + + premise_encoded = self.encoder.encode_sentences(premise) + hypothesis_encoded = self.encoder.encode_sentences(hypothesis) + np.savez( + data_path + str(counter), [premise_encoded, hypothesis_encoded], label + ) + + if counter % 10 == 0: + logger.info(f"Prepared {counter} files.") + logger.info("Prepared all files.") + + def __create_model(self): + inp_p = tf.keras.layers.Input(shape=self.batch_size) + inp_h = tf.keras.layers.Input(shape=self.batch_size) + combined = tf.keras.layers.concatenate([inp_p, inp_h]) + linear_1 = tf.keras.layers.Dense(1024, activation="relu")(combined) + dropout_1 = tf.keras.layers.Dropout(0.45)(linear_1) + linear_2 = tf.keras.layers.Dense(512, activation="relu")(dropout_1) + linear_3 = tf.keras.layers.Dense(256, activation="relu")(linear_2) + output = tf.keras.layers.Dense(3, activation="softmax")(linear_3) + + self.model = tf.keras.models.Model(inputs=[inp_p, inp_h], outputs=output) + self.model.compile( + loss=tf.keras.losses.SparseCategoricalCrossentropy(), + optimizer="adam", + metrics=["accuracy"], + ) + + def __train_model(self): + log_dir = "/cache/logs/" + if not os.path.exists(log_dir): + os.makedirs(log_dir) + csv_logger = tf.keras.callbacks.CSVLogger(log_dir + "log.csv") + + ch_path = "/cache/checkpoints" + if not os.path.exists(ch_path): + os.makedirs(ch_path) + ch_path += "/cp-{epoch:04d}.ckpt" + model_checkpoint = tf.keras.callbacks.ModelCheckpoint( + filepath=ch_path, save_weights_only=True + ) + + early_stopping = tf.keras.callbacks.EarlyStopping( + monitor="val_loss", patience=10 + ) + + _ = self.model.fit( + x=self.train_generator, + validation_data=self.val_generator, + use_multiprocessing=True, + workers=6, + epochs=100, + callbacks=[model_checkpoint, csv_logger, early_stopping], + ) + + self.model_path = "/cache/model.h5" + self.model.save(self.model_path) + os.environ["TRAINED_MODEL_PATH"] = self.model_path + logger.info("Model is trained.") + + def candidate_selection(self, candidates, bot_uttr_history, threshold=0.8): + self.model = tf.keras.models.load_model(self.model_path) + labels = {0: "entailment", 1: "neutral", 2: "contradiction"} + base_dict = { + "decision": labels[1], + labels[0]: 0.0, + labels[1]: 1.0, + labels[2]: 0.0, + } + + rez_list = list(base_dict.copy() for _ in range(len(candidates))) + unique_history = {u for b in bot_uttr_history for u in b} + + if unique_history and candidates: + vectorized_candidates = self.__response_encoding(candidates) + vectorized_history = self.__response_encoding(list(unique_history)) + + vectorized_history = dict(zip(unique_history, vectorized_history)) + history_arr = [ + vectorized_history.get(u) for b in bot_uttr_history for u in b + ] + candidates_arr = [] + for i in range(len(candidates)): + candidates_arr.extend( + [vectorized_candidates[i]] * len(bot_uttr_history[i]) + ) + + pred_rez = self.model.predict([history_arr, candidates_arr]) + pred_rez_idx = 0 + for i in range(len(candidates)): + for _ in 
range(len(bot_uttr_history[i])): + row_probab = pred_rez[pred_rez_idx] + if row_probab[2] < threshold: + row_probab[2] = -row_probab[2] + label = int(np.argmax(row_probab, axis=-1)) + if rez_list[i]["decision"] != labels[2]: + rez_list[i] = { + "decision": labels[label], + labels[0]: row_probab[0].astype(float), + labels[1]: row_probab[1].astype(float), + labels[2]: np.abs(row_probab[2]).astype(float), + } + pred_rez_idx += 1 + logger.info(rez_list) + return rez_list + + def __response_encoding(self, responses): + return self.encoder.encode_sentences(responses) diff --git a/annotators/ConveRTBasedNLI/encoder.py b/annotators/ConveRTBasedNLI/encoder.py new file mode 100644 index 0000000000..f893525dcc --- /dev/null +++ b/annotators/ConveRTBasedNLI/encoder.py @@ -0,0 +1,51 @@ +import numpy as np + +import tensorflow as tf +import tensorflow_text +import tensorflow_hub as tfhub + + +tf.compat.v1.disable_eager_execution() +tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + + +def normalize_vectors(vectors): + vectors = np.vstack(vectors) + norm = np.linalg.norm(vectors, ord=2, axis=-1, keepdims=True) + return vectors / norm + + +class Encoder: + def __init__(self): + self.sess = tf.compat.v1.Session() + self.text_placeholder = tf.compat.v1.placeholder(dtype=tf.string, shape=[None]) + + self.module = tfhub.Module("/data/convert_model") + self.context_encoding_tensor = self.module( + self.text_placeholder, signature="encode_context" + ) + self.encoding_tensor = self.module(self.text_placeholder) + self.response_encoding_tensor = self.module( + self.text_placeholder, signature="encode_response" + ) + + self.sess.run(tf.compat.v1.tables_initializer()) + self.sess.run(tf.compat.v1.global_variables_initializer()) + + def encode_sentences(self, sentences): + vectors = self.sess.run( + self.encoding_tensor, feed_dict={self.text_placeholder: sentences} + ) + return normalize_vectors(vectors) + + def encode_contexts(self, sentences): + vectors = self.sess.run( + self.context_encoding_tensor, feed_dict={self.text_placeholder: sentences} + ) + return normalize_vectors(vectors) + + def encode_responses(self, sentences): + vectors = self.sess.run( + self.response_encoding_tensor, feed_dict={self.text_placeholder: sentences} + ) + return normalize_vectors(vectors) diff --git a/annotators/ConveRTBasedNLI/requirements.txt b/annotators/ConveRTBasedNLI/requirements.txt new file mode 100644 index 0000000000..4d9dd5ac0b --- /dev/null +++ b/annotators/ConveRTBasedNLI/requirements.txt @@ -0,0 +1,13 @@ +tensorflow==2.8.0 +tensorflow_hub==0.12.0 +tensorflow_text==2.8.2 +tensorflow-datasets==4.8.1 +flask==1.1.1 +itsdangerous==2.0.1 +numpy==1.21.6 +gunicorn==19.9.0 +requests==2.22.0 +sentry-sdk==0.12.3 +jinja2<=3.0.3 +Werkzeug<=2.0.3 +protobuf==3.20.3 \ No newline at end of file diff --git a/annotators/ConveRTBasedNLI/server.py b/annotators/ConveRTBasedNLI/server.py new file mode 100644 index 0000000000..31d97913b2 --- /dev/null +++ b/annotators/ConveRTBasedNLI/server.py @@ -0,0 +1,37 @@ +import logging +import time +from os import getenv + +from convert_annotator import ConveRTAnnotator +import sentry_sdk +from flask import Flask, jsonify, request + + +sentry_sdk.init(getenv("SENTRY_DSN")) + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) +logger = logging.getLogger(__name__) +app = Flask(__name__) +app.config["JSON_SORT_KEYS"] = False + +annotator = ConveRTAnnotator() +logger.info("Annotator is loaded.") + + +@app.route("/batch_model", 
methods=["POST"]) +def respond_batch(): + start_time = time.time() + sentences = request.json.get("sentences", []) + last_bot_utterances = request.json.get("last_bot_utterances", []) + logger.debug(f"Sentences: {sentences}") + logger.debug(f"Last bot utterances: {last_bot_utterances}") + result = annotator.candidate_selection(sentences, last_bot_utterances) + total_time = time.time() - start_time + logger.info(f"convert-based-nli exec time: {round(total_time, 2)} sec") + return jsonify([{"batch": result}]) + + +if __name__ == "__main__": + app.run(debug=False, host="0.0.0.0", port=8150) diff --git a/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/environment.yml b/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/environment.yml new file mode 100644 index 0000000000..e200334473 --- /dev/null +++ b/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/environment.yml @@ -0,0 +1,4 @@ +SERVICE_PORT: 8150 +TRAINED_MODEL_PATH: /data/nli_model +SERVICE_NAME: convert_based_nli +FLASK_APP: server diff --git a/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/service.yml b/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/service.yml new file mode 100644 index 0000000000..bdf034f666 --- /dev/null +++ b/annotators/ConveRTBasedNLI/service_configs/convert-based-nli/service.yml @@ -0,0 +1,28 @@ +name: convert-based-nli +endpoints: +- batch_model +compose: + env_file: + - .env + build: + args: + SERVICE_PORT: 8150 + SERVICE_NAME: convert_based_nli + TRAINED_MODEL_PATH: /data/nli_model + FLASK_APP: server + context: annotators/ConveRTBasedNLI/ + command: flask run -h 0.0.0.0 -p 8150 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 1.5G + reservations: + memory: 1.5G + volumes: + - ./annotators/ConveRTBasedNLI:/src + - ./common:/src/common + ports: + - 8150:8150 +proxy: null \ No newline at end of file diff --git a/annotators/ConveRTBasedNLI/test.py b/annotators/ConveRTBasedNLI/test.py new file mode 100644 index 0000000000..c14e3a59b8 --- /dev/null +++ b/annotators/ConveRTBasedNLI/test.py @@ -0,0 +1,76 @@ +import requests + + +def main(): + url = "http://0.0.0.0:8150/batch_model" + + input_data = { + "sentences": [ + "Do you like ice cream?", + "It's going to be sunny today", + "I love dogs", + "Do you want to know some interesting fact?", + "Wolves have small teeth", + ], + "last_bot_utterances": [ + ["I hate dogs", "The moon is a satellite of the earth"], + [], + [ + "I hate dogs", + "Wolves have big teeth", + "The moon is a satellite of the earth", + ], + ["The moon is a satellite of the earth"], + ["Wolves have big teeth", "The moon is a satellite of the earth"], + ], + } + desired_output = [ + { + "decision": "neutral", + "entailment": 0.0019908840768039227, + "neutral": 0.7070657014846802, + "contradiction": 0.2909433841705322, + }, + { + "decision": "neutral", + "entailment": 0.0, + "neutral": 1.0, + "contradiction": 0.0, + }, + { + "decision": "contradiction", + "entailment": 2.6359959974797675e-06, + "neutral": 0.0002536950050853193, + "contradiction": 0.999743640422821, + }, + { + "decision": "neutral", + "entailment": 0.014720427803695202, + "neutral": 0.9783505797386169, + "contradiction": 0.0069289617240428925, + }, + { + "decision": "contradiction", + "entailment": 0.0019739873241633177, + "neutral": 0.0290225762873888, + "contradiction": 0.9690034985542297, + }, + ] + + result = requests.post(url, json=input_data).json() + + for rez in desired_output: + for k, v in rez.items(): + if type(v) == float: + rez[k] 
= round(v, 2) + + for rez in result[0]["batch"]: + for k, v in rez.items(): + if type(v) == float: + rez[k] = round(v, 2) + assert result[0]["batch"] == desired_output + print("Successfully predicted contradiction!") + + +if __name__ == "__main__": + main() diff --git a/annotators/ConveRTBasedNLI/test.sh b/annotators/ConveRTBasedNLI/test.sh new file mode 100644 index 0000000000..61672db785 --- /dev/null +++ b/annotators/ConveRTBasedNLI/test.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python test.py diff --git a/assistant_dists/dream/dev.yml b/assistant_dists/dream/dev.yml index 5364ec9961..7c7696a4c3 100644 --- a/assistant_dists/dream/dev.yml +++ b/assistant_dists/dream/dev.yml @@ -454,6 +454,11 @@ services: - "~/.deeppavlov/cache:/root/.cache" ports: - 8102:8102 + convert-based-nli: + volumes: + - "./annotators/ConveRTBasedNLI:/src" + ports: + - 8150:8150 dff-template-skill: volumes: - "./skills/dff_template_skill:/src" diff --git a/assistant_dists/dream/docker-compose.override.yml b/assistant_dists/dream/docker-compose.override.yml index 12664f15a6..86fb84ec36 100644 --- a/assistant_dists/dream/docker-compose.override.yml +++ b/assistant_dists/dream/docker-compose.override.yml @@ -1386,6 +1386,22 @@ services: memory: 4G reservations: memory: 4G + + convert-based-nli: + env_file: [.env] + build: + args: + TRAINED_MODEL_PATH: /data/nli_model + context: ./annotators/ConveRTBasedNLI/ + command: flask run -h 0.0.0.0 -p 8150 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 1.5G + reservations: + memory: 1.5G dff-template-skill: env_file: [ .env ] @@ -1402,4 +1418,5 @@ services: memory: 128M reservations: memory: 128M + version: '3.7' diff --git a/assistant_dists/dream/pipeline_conf.json b/assistant_dists/dream/pipeline_conf.json index 8b076b77f8..7047c10587 100644 --- a/assistant_dists/dream/pipeline_conf.json +++ b/assistant_dists/dream/pipeline_conf.json @@ -705,6 +705,25 @@ "component": "components/PbLNvh4hrvs47rPaf2bfYQ.yml", "service": "annotators/combined_classification/service_configs/combined-classification" } + }, + "convert_based_nli": { + "connector": { + "protocol": "http", + "timeout": 10, + "url": "http://convert-based-nli:8150/batch_model" + }, + "dialog_formatter": "state_formatters.dp_formatters:convert_nli_hypotheses_annotator_formatter", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "skills" + ], + "state_manager_method": "add_hypothesis_annotation_batch", + "is_enabled": true, + "source": { + "directory": "annotators/ConveRTBasedNLI", + "container": "convert-based-nli", + "endpoint": "batch_model" + } } }, "skill_selectors": { diff --git a/common/utils.py b/common/utils.py index 0177170916..61e5528eb8 100644 --- a/common/utils.py +++ b/common/utils.py @@ -1308,6 +1308,12 @@ def is_toxic_or_badlisted_utterance(annotated_utterance): return is_toxic_utterance(annotated_utterance) or is_badlisted_utterance(annotated_utterance) +def is_contradiction_utterance(annotated_utterance): + contradiction_result = annotated_utterance.get("annotations", {}).get("convert_based_nli", {}).get("decision", "") + + return "contradiction" in contradiction_result + + FACTOID_PATTERNS = re.compile( r"^(do you know |((can |could )you )tell me )?(please )?" 
r"((what|who|which|where) (is|are|was|were)\b|how to\b|when)", diff --git a/components.tsv b/components.tsv index 189889c450..d021638fd6 100644 --- a/components.tsv +++ b/components.tsv @@ -151,7 +151,7 @@ 8147 dff-universal-prompted-skill 8148 8149 transformers-lm-llama7bru -8150 +8150 convert-based-nli 8151 dff-dream-persona-llama7bru-prompted-skill 8152 dff-deepy-prompted-skill 8153 diff --git a/components/I90h9nwf9IWI9WEneLdT.yml b/components/I90h9nwf9IWI9WEneLdT.yml new file mode 100644 index 0000000000..055696e2f1 --- /dev/null +++ b/components/I90h9nwf9IWI9WEneLdT.yml @@ -0,0 +1,24 @@ +name: convert-based-nli +display_name: ConveRT based NLI +container_name: convert-based-nli +component_type: null +model_type: NN-based +is_customizable: false +author: DeepPavlov +description: Defines wheather 2 sentences are correlated as entailment, neutral or contradiction +ram_usage: 1.5G +gpu_usage: null +connector: + protocol: http + timeout: 2.0 + url: http://convert-based-nli:8150/batch_model +dialog_formatter: state_formatters.dp_formatters:convert_nli_hypotheses_annotator_formatter +response_formatter: state_formatters.dp_formatters:simple_formatter_service +previous_services: +- skills +required_previous_services: null +state_manager_method: add_hypothesis_annotation_batch +tags: null +endpoint: batch_model +service: annotators/ConveRTBasedNLI/service_configs/convert-based-nli +date_created: '2023-06-05T09:45:32' \ No newline at end of file diff --git a/response_selectors/convers_evaluation_based_selector/server.py b/response_selectors/convers_evaluation_based_selector/server.py index a450b5a82d..e67a9cb49e 100644 --- a/response_selectors/convers_evaluation_based_selector/server.py +++ b/response_selectors/convers_evaluation_based_selector/server.py @@ -15,12 +15,17 @@ from nltk.tokenize import sent_tokenize from common.greeting import greeting_spec, HI_THIS_IS_DREAM -from common.universal_templates import if_chat_about_particular_topic, if_choose_topic, DUMMY_DONTKNOW_RESPONSES +from common.universal_templates import ( + if_chat_about_particular_topic, + if_choose_topic, + DUMMY_DONTKNOW_RESPONSES, +) from common.utils import ( get_intent_name, low_priority_intents, substitute_nonwords, is_toxic_or_badlisted_utterance, + is_contradiction_utterance, ) from common.response_selection import ACTIVE_SKILLS from tag_based_selection import tag_based_response_selection @@ -39,12 +44,16 @@ sentry_sdk.init(getenv("SENTRY_DSN")) -logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG) +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG +) logger = logging.getLogger(__name__) app = Flask(__name__) -CALL_BY_NAME_PROBABILITY = float(getenv("CALL_BY_NAME_PROBABILITY", 0.5)) # if name is already known +CALL_BY_NAME_PROBABILITY = float( + getenv("CALL_BY_NAME_PROBABILITY", 0.5) +) # if name is already known TAG_BASED_SELECTION = getenv("TAG_BASED_SELECTION", False) MOST_DUMMY_RESPONSES = [ "I really do not know what to answer.", @@ -53,6 +62,8 @@ ] LANGUAGE = getenv("LANGUAGE", "EN") GREETING_FIRST = int(getenv("GREETING_FIRST", 1)) +TOXIC_FILTERING = getenv("TOXIC_FILTERING", True) +CONTRADICTION_FILTERING = getenv("CONTRADICTION_FILTERING", True) @app.route("/respond", methods=["POST"]) @@ -61,7 +72,9 @@ def respond(): st_time = time.time() dialogs_batch = request.json["dialogs"] - all_prev_active_skills_batch = request.json.get("all_prev_active_skills", [[]] * len(dialogs_batch)) + 
all_prev_active_skills_batch = request.json.get( + "all_prev_active_skills", [[]] * len(dialogs_batch) + ) selected_skill_names = [] selected_texts = [] @@ -70,10 +83,12 @@ def respond(): selected_bot_attributes = [] selected_attributes = [] - for i, (dialog, all_prev_active_skills) in enumerate(zip(dialogs_batch, all_prev_active_skills_batch)): + for i, (dialog, all_prev_active_skills) in enumerate( + zip(dialogs_batch, all_prev_active_skills_batch) + ): curr_confidences = [] curr_scores = [] - curr_is_toxics = [] + curr_is_toxics_or_contr = [] try: curr_candidates = dialog["human_utterances"][-1]["hypotheses"] @@ -88,16 +103,28 @@ def respond(): curr_confidences += [skill_data["confidence"]] if skill_data["text"] and skill_data["confidence"]: if not skill_data.get("annotations"): - logger.warning(f"Valid skill data without annotations: {skill_data}") + logger.warning( + f"Valid skill data without annotations: {skill_data}" + ) is_toxic_utterance = is_toxic_or_badlisted_utterance(skill_data) - curr_is_toxics.append(is_toxic_utterance) + is_contr_utterance = is_contradiction_utterance(skill_data) + + is_toxic_or_contr_utterance = False + if is_toxic_utterance and TOXIC_FILTERING: + is_toxic_or_contr_utterance = is_toxic_utterance + if is_contr_utterance and CONTRADICTION_FILTERING: + is_toxic_or_contr_utterance = is_contr_utterance + + curr_is_toxics_or_contr.append(is_toxic_or_contr_utterance) if is_toxic_utterance: with sentry_sdk.push_scope() as scope: scope.set_extra("utterance", skill_data["text"]) scope.set_extra("selected_skills", skill_data) - sentry_sdk.capture_message("response selector got candidate with badlisted phrases") + sentry_sdk.capture_message( + "response selector got candidate with badlisted phrases" + ) msg = ( "response selector got candidate with badlisted phrases:\n" f"utterance: {skill_data['text']}\n" @@ -105,19 +132,42 @@ def respond(): ) logger.info(msg) + if is_contr_utterance: + with sentry_sdk.push_scope() as scope: + scope.set_extra("utterance", skill_data["text"]) + scope.set_extra("selected_skills", skill_data) + sentry_sdk.capture_message( + "response selector got contradicting candidate" + ) + msg = ( + "response selector got contradicting candidate:\n" + f"utterance: {skill_data['text']}\n" + f"skill name: {skill_data['skill_name']}" + ) + logger.info(msg) + curr_scores += [ - calculate_single_evaluator_score(skill_data.get("annotations"), skill_data["confidence"]) + calculate_single_evaluator_score( + skill_data.get("annotations"), skill_data["confidence"] + ) ] - curr_is_toxics = np.array(curr_is_toxics) + curr_is_toxics_or_contr = np.array(curr_is_toxics_or_contr) curr_scores = np.array(curr_scores) curr_confidences = np.array(curr_confidences) # now we collected all current candidates and their annotations. select response among them - best_skill_name, best_text, best_confidence, best_human_attrs, best_bot_attrs, best_attrs = select_response( + ( + best_skill_name, + best_text, + best_confidence, + best_human_attrs, + best_bot_attrs, + best_attrs, + ) = select_response( curr_candidates, curr_scores, curr_confidences, - curr_is_toxics, + curr_is_toxics_or_contr, dialog, all_prev_active_skills, ) @@ -125,10 +175,14 @@ def respond(): logger.exception(e) sentry_sdk.capture_exception(e) if dialog["human_utterances"][-1].get("hypotheses", []): - logger.info("Response Selector Error: randomly choosing final response among hypotheses.") + logger.info( + "Response Selector Error: randomly choosing final response among hypotheses." 
+ ) best_cand = random.choice(dialog["human_utterances"][-1]["hypotheses"]) else: - logger.info("Response Selector Error: randomly choosing response among dummy responses.") + logger.info( + "Response Selector Error: randomly choosing response among dummy responses." + ) best_cand = { "text": random.choice(DUMMY_DONTKNOW_RESPONSES[LANGUAGE]), "confidence": 0.1, @@ -177,7 +231,13 @@ def respond(): def rule_score_based_selection( - dialog, candidates, scores, confidences, is_toxics, bot_utterances, all_prev_active_skills + dialog, + candidates, + scores, + confidences, + is_toxics, + bot_utterances, + all_prev_active_skills, ): curr_single_scores = [] @@ -199,17 +259,30 @@ def rule_score_based_selection( factoid_index = skill_names.index("factoid_qa") logging.debug("factoid") logging.debug(str(candidates[factoid_index])) - if "not sure" in candidates[factoid_index] and candidates[factoid_index]["not sure"]: + if ( + "not sure" in candidates[factoid_index] + and candidates[factoid_index]["not sure"] + ): not_sure_factoid = True for i in range(len(scores)): curr_score = None - is_misheard = misheard_with_spec1 in candidates[i]["text"] or misheard_with_spec2 in candidates[i]["text"] + is_misheard = ( + misheard_with_spec1 in candidates[i]["text"] + or misheard_with_spec2 in candidates[i]["text"] + ) intent_name = get_intent_name(candidates[i]["text"]) - is_intent_candidate = (skill_names[i] in ["dff_intent_responder_skill", "dff_program_y_skill"]) and intent_name - is_intent_candidate = is_intent_candidate and intent_name not in low_priority_intents + is_intent_candidate = ( + skill_names[i] in ["dff_intent_responder_skill", "dff_program_y_skill"] + ) and intent_name + is_intent_candidate = ( + is_intent_candidate and intent_name not in low_priority_intents + ) # print("is intent candidate? " + str(is_intent_candidate), flush=True) - if len(dialog["human_utterances"]) == 1 and greeting_spec[LANGUAGE] not in candidates[i]["text"]: + if ( + len(dialog["human_utterances"]) == 1 + and greeting_spec[LANGUAGE] not in candidates[i]["text"] + ): logger.info("Dialog Beginning detected.") if ( if_chat_about_particular_topic(dialog["utterances"][0]) @@ -220,50 +293,86 @@ def rule_score_based_selection( if skill_names[i] == "factoid_qa": logger.info("Particular topic. Facts + Greeting to very big score.") # I don't have an opinion on that but I know some facts. - resp = candidates[i]["text"].replace("I don't have an opinion on that but I know some facts.", "") + resp = candidates[i]["text"].replace( + "I don't have an opinion on that but I know some facts.", "" + ) candidates[i]["text"] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {resp}" curr_score = very_big_score - elif skill_names[i] == "meta_script_skill" and len(candidates[i]["text"]) > 0 and confidences[i] > 0.98: - logger.info("Particular topic. meta_script_skill + Greeting to very big score.") + elif ( + skill_names[i] == "meta_script_skill" + and len(candidates[i]["text"]) > 0 + and confidences[i] > 0.98 + ): + logger.info( + "Particular topic. meta_script_skill + Greeting to very big score." + ) # I don't have an opinion on that but I know some facts. resp = candidates[i]["text"] candidates[i]["text"] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {resp}" curr_score = very_big_score elif skill_names[i] == "small_talk_skill": - logger.info("Particular topic. Small-talk + Greeting NOT to very big score.") + logger.info( + "Particular topic. Small-talk + Greeting NOT to very big score." 
+ ) # for now do not give small talk a very big score here - candidates[i]["text"] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" + candidates[i][ + "text" + ] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" # curr_score = very_big_score - elif if_choose_topic(dialog["utterances"][0]) and "about it" not in dialog["utterances"][0]["text"].lower(): + elif ( + if_choose_topic(dialog["utterances"][0]) + and "about it" not in dialog["utterances"][0]["text"].lower() + ): logger.info("User wants bot to choose the topic") # if user says `let's chat about something` if skill_names[i] == "small_talk_skill": logger.info("No topic. Small-talk + Greeting to very big score.") - candidates[i]["text"] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" + candidates[i][ + "text" + ] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" curr_score = very_big_score - elif skill_names[i] == "meta_script_skill" and len(candidates[i]["text"]) > 0: + elif ( + skill_names[i] == "meta_script_skill" + and len(candidates[i]["text"]) > 0 + ): logger.info("No topic. Meta-script + Greeting to very big score.") - candidates[i]["text"] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" + candidates[i][ + "text" + ] = f"{HI_THIS_IS_DREAM[LANGUAGE]} {candidates[i]['text']}" curr_score = very_big_score else: logger.info("User just wants to talk.") # if user says something else - if skill_names[i] == "program_y" and greeting_spec[LANGUAGE] in candidates[i]["text"]: + if ( + skill_names[i] == "program_y" + and greeting_spec[LANGUAGE] in candidates[i]["text"] + ): logger.info("Just chat. Program-y to very big score.") curr_score = very_big_score elif ( skill_names[i] == "dff_friendship_skill" - and (how_are_you_spec in candidates[i]["text"] or what_i_can_do_spec in candidates[i]["text"]) + and ( + how_are_you_spec in candidates[i]["text"] + or what_i_can_do_spec in candidates[i]["text"] + ) and len(dialog["utterances"]) < 16 ): curr_score = very_big_score - elif skill_names[i] == "dff_friendship_skill" and greeting_spec[LANGUAGE] in candidates[i]["text"]: + elif ( + skill_names[i] == "dff_friendship_skill" + and greeting_spec[LANGUAGE] in candidates[i]["text"] + ): if len(dialog["utterances"]) < 2: curr_score = very_big_score else: - confidences[i] = 0.2 # Low confidence for greeting in the middle of dialogue + confidences[ + i + ] = 0.2 # Low confidence for greeting in the middle of dialogue # we don't have 'cobotqa' anymore; instead we have factoid_qa - elif skill_names[i] in ["factoid_qa"] and "Here's something I found on the web." in candidates[i]["text"]: + elif ( + skill_names[i] in ["factoid_qa"] + and "Here's something I found on the web." in candidates[i]["text"] + ): confidences[i] = 0.6 elif ( skill_names[i] == "factoid_qa" @@ -278,17 +387,33 @@ def rule_score_based_selection( curr_score = very_big_score elif is_intent_candidate: curr_score = very_big_score - elif skill_names[i] in ["dummy_skill", "convert_reddit", "alice", "eliza", "tdidf_retrieval", "program_y"]: - if "question" in candidates[i].get("type", "") or "?" in candidates[i]["text"]: + elif skill_names[i] in [ + "dummy_skill", + "convert_reddit", + "alice", + "eliza", + "tdidf_retrieval", + "program_y", + ]: + if ( + "question" in candidates[i].get("type", "") + or "?" in candidates[i]["text"] + ): penalty_start_utt = 1 if skill_names[i] == "program_y": penalty_start_utt = 4 n_questions = 0 - if len(bot_utterances) >= penalty_start_utt and "?" 
in bot_utterances[-1]: + if ( + len(bot_utterances) >= penalty_start_utt + and "?" in bot_utterances[-1] + ): confidences[i] /= 1.5 n_questions += 1 - if len(bot_utterances) >= penalty_start_utt + 1 and "?" in bot_utterances[-2]: + if ( + len(bot_utterances) >= penalty_start_utt + 1 + and "?" in bot_utterances[-2] + ): confidences[i] /= 1.1 n_questions += 1 if n_questions == 2: @@ -298,7 +423,9 @@ def rule_score_based_selection( if "link_to_for_response_selector" in candidates[i].get("type", ""): link_to_question = candidates[i]["text"] link_to_human_attrs = candidates[i].get("human_attributes", {}) - if skill_names[i] == "dummy_skill" and "question" in candidates[i].get("type", ""): + if skill_names[i] == "dummy_skill" and "question" in candidates[i].get( + "type", "" + ): dummy_question = candidates[i]["text"] dummy_question_human_attr = candidates[i].get("human_attributes", {}) @@ -314,22 +441,32 @@ def rule_score_based_selection( confidence = confidences[i] skill_name = skill_names[i] logger.info( - f"Skill {skill_name} has final score: {score}. Confidence: {confidence}. " f"Toxicity: {is_toxics[i]}" + f"Skill {skill_name} has final score: {score}. Confidence: {confidence}. " + f"Toxicity: {is_toxics[i]}" ) curr_single_scores.append(score) else: score = scores[i] skill_name = skill_names[i] - logger.info(f"Skill {skill_name} has final score: {score}. " f"Toxicity: {is_toxics[i]}") + logger.info( + f"Skill {skill_name} has final score: {score}. " + f"Toxicity: {is_toxics[i]}" + ) curr_single_scores.append(score) highest_conf_exist = True if any(confidences >= 1.0) else False if highest_conf_exist: logger.info("Found skill with the highest confidence.") for j in range(len(candidates)): - if highest_conf_exist and confidences[j] < 1.0 and curr_single_scores[j] < very_big_score: + if ( + highest_conf_exist + and confidences[j] < 1.0 + and curr_single_scores[j] < very_big_score + ): # need to drop this candidates - logger.info(f"Dropping {skill_names[j]} which does not have a highest confidence or `very big score`.") + logger.info( + f"Dropping {skill_names[j]} which does not have a highest confidence or `very big score`." 
+ ) curr_single_scores[j] = very_low_score best_id = np.argmax(curr_single_scores) @@ -351,16 +488,24 @@ def rule_score_based_selection( return best_candidate, best_id, curr_single_scores -def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev_active_skills=None): +def select_response( + candidates, scores, confidences, is_toxics, dialog, all_prev_active_skills=None +): # TOXICITY & BADLISTS checks - n_toxic_candidates, scores, confidences = downscore_toxic_badlisted_responses(scores, confidences, is_toxics) + n_toxic_candidates, scores, confidences = downscore_toxic_badlisted_responses( + scores, confidences, is_toxics + ) if n_toxic_candidates == len(candidates): # the most dummy заглушка на случай, когда все абсолютно скиллы вернули токсичные ответы return None, np.random.choice(DUMMY_DONTKNOW_RESPONSES[LANGUAGE]), 1.0, {}, {} # REPEAT checks - bot_utterances = [sent_tokenize(uttr["text"].lower()) for uttr in dialog["bot_utterances"]] - prev_large_utterances = [[sent] for utt in bot_utterances[:-15] for sent in utt if len(sent) >= 40] + bot_utterances = [ + sent_tokenize(uttr["text"].lower()) for uttr in dialog["bot_utterances"] + ] + prev_large_utterances = [ + [sent] for utt in bot_utterances[:-15] for sent in utt if len(sent) >= 40 + ] bot_utterances = prev_large_utterances + bot_utterances[-15:] # flatten 2d list to 1d list of all appeared sentences of bot replies bot_utterances = sum(bot_utterances, []) @@ -369,12 +514,23 @@ def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev if TAG_BASED_SELECTION: logger.info("Tag based selection") best_candidate, best_id, curr_single_scores = tag_based_response_selection( - dialog, candidates, scores, confidences, bot_utterances, all_prev_active_skills + dialog, + candidates, + scores, + confidences, + bot_utterances, + all_prev_active_skills, ) else: logger.info("Confidence & ConvEvaluationAnnotator Scores based selection") best_candidate, best_id, curr_single_scores = rule_score_based_selection( - dialog, candidates, scores, confidences, is_toxics, bot_utterances, all_prev_active_skills + dialog, + candidates, + scores, + confidences, + is_toxics, + bot_utterances, + all_prev_active_skills, ) logger.info(f"Best candidate: {best_candidate}") @@ -384,7 +540,11 @@ def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev best_human_attributes = best_candidate.get("human_attributes", {}) best_bot_attributes = best_candidate.get("bot_attributes", {}) - if len(dialog["bot_utterances"]) == 0 and greeting_spec[LANGUAGE] not in best_text and GREETING_FIRST: + if ( + len(dialog["bot_utterances"]) == 0 + and greeting_spec[LANGUAGE] not in best_text + and GREETING_FIRST + ): # add greeting to the first bot uttr, if it's not already included best_text = f"{HI_THIS_IS_DREAM[LANGUAGE]} {best_text}" @@ -399,7 +559,10 @@ def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev if sum(curr_single_scores) == 0.0: break - if dialog["human"]["profile"].get("name", False) and best_skill_name != "personal_info_skill": + if ( + dialog["human"]["profile"].get("name", False) + and best_skill_name != "personal_info_skill" + ): name = dialog["human"]["profile"].get("name", False) if len(dialog["bot_utterances"]) >= 1: if re.search(r"\b" + name + r"\b", dialog["bot_utterances"][-1]["text"]): @@ -423,7 +586,14 @@ def select_response(candidates, scores, confidences, is_toxics, dialog, all_prev candidates[best_id].pop("annotations", {}) best_attrs = candidates[best_id] - 
return best_skill_name, best_text, best_confidence, best_human_attributes, best_bot_attributes, best_attrs + return ( + best_skill_name, + best_text, + best_confidence, + best_human_attributes, + best_bot_attributes, + best_attrs, + ) if __name__ == "__main__": diff --git a/state_formatters/dp_formatters.py b/state_formatters/dp_formatters.py index 0734ab3e49..af942487b3 100755 --- a/state_formatters/dp_formatters.py +++ b/state_formatters/dp_formatters.py @@ -407,6 +407,14 @@ def last_utt_and_history_dialog(dialog: Dict) -> List: ] +def convert_nli_hypotheses_annotator_formatter(dialog: Dict) -> List[Dict]: + # Used by: convert_based_nli candidate annotators + hypotheses = dialog["human_utterances"][-1]["hypotheses"] + hypots = [h["text"] for h in hypotheses] + last_bot_utterances = [u["text"] for u in dialog["bot_utterances"][-20:]] + return [{"sentences": hypots, "last_bot_utterances": [last_bot_utterances] * len(hypots)}] + + def convers_evaluator_annotator_formatter(dialog: Dict) -> List[Dict]: dialog = utils.get_last_n_turns(dialog) dialog = utils.remove_clarification_turns_from_dialog(dialog)
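To make the new wiring concrete, here is a small, self-contained sketch (illustrative only, using a made-up dialog state) of what `convert_nli_hypotheses_annotator_formatter` produces for the annotator's `batch_model` endpoint: every hypothesis of the last human utterance is paired with the same list of up to 20 recent bot utterances.

```python
# Illustrative sketch: the formatter logic from the diff applied to a hypothetical dialog.
from typing import Dict, List


def convert_nli_hypotheses_annotator_formatter(dialog: Dict) -> List[Dict]:
    hypotheses = dialog["human_utterances"][-1]["hypotheses"]
    hypots = [h["text"] for h in hypotheses]
    last_bot_utterances = [u["text"] for u in dialog["bot_utterances"][-20:]]
    return [{"sentences": hypots, "last_bot_utterances": [last_bot_utterances] * len(hypots)}]


dialog = {
    "human_utterances": [
        {"hypotheses": [{"text": "I love dogs"}, {"text": "Wolves have small teeth"}]}
    ],
    "bot_utterances": [{"text": "I hate dogs"}, {"text": "Wolves have big teeth"}],
}

print(convert_nli_hypotheses_annotator_formatter(dialog))
# [{'sentences': ['I love dogs', 'Wolves have small teeth'],
#   'last_bot_utterances': [['I hate dogs', 'Wolves have big teeth'],
#                           ['I hate dogs', 'Wolves have big teeth']]}]
```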