From a051d4e127fea71897faad3fc7a7d18399b85498 Mon Sep 17 00:00:00 2001
From: Francesc Lapedriza
Date: Sun, 13 Oct 2019 03:01:45 +0200
Subject: [PATCH 1/4] Add docker support

---
 .dockerignore     |  8 ++++++++
 .gitignore        |  1 +
 docker/Dockerfile | 19 ++++++++++++++++++
 docker/README.md  | 31 ++++++++++++++++++++++++++++
 docker/build.sh   | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 docker/run.sh     | 30 ++++++++++++++++++++++++++++
 6 files changed, 140 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 docker/Dockerfile
 create mode 100644 docker/README.md
 create mode 100755 docker/build.sh
 create mode 100755 docker/run.sh

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..e6b9720
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+LICENCE
+.gitignore
+README.md
+docker
+inferSent/dataset/GloVe
+inferSent/encoder/infersent1.pickle
+datasets
+model_checkpoints
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index b6309df..e337baf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ model/results
 /etc/
 datasets/
 /cornell_movie_dialogue/
+model_checkpoints/
 *.orig
 *.lprof
 .idea
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..1c00767
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,19 @@
+FROM nvidia/cuda:10.0-runtime
+
+WORKDIR /code
+
+RUN apt-get -y update && apt-get install -y build-essential python3.6 python3.6-dev python3-pip wget
+
+RUN ln -s /usr/bin/python3.6 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
+
+RUN pip install --no-cache-dir --upgrade ipython
+
+RUN pip install --no-cache-dir torch===0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html
+
+COPY requirements.txt /build/requirements.txt
+
+RUN pip install --no-cache-dir -r /build/requirements.txt
+
+COPY . /code/
+
+RUN pip install -e .
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..b00873f
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,31 @@
+# Running the project using Docker
+
+To be able to run this project using docker, you will need [Docker](https://docs.docker.com/install/) and [Nvidia Container Toolkit](https://github.com/NVIDIA/nvidia-docker) installed on your machine.
+
+## Building the docker image
+
+To create the docker image, just run the `build.sh` script located in this folder. You will be prompted to download GloVe during the process; it won't be included in the docker image to avoid yielding a huge image. Instead, it will be mounted as a volume in the container when running it.
+
+## Before running the docker image
+
+Before running the project inside docker, all extra files mentioned in the base `README` file should be located in the same location mentioned in the `README`; they will be mounted as volumes.
+
+You can either download them from your host system or execute the `run.sh` script with the same commands to download them using the docker image, without having to install any of the dependencies.
+
+For example, you can run:
+
+```bash
+docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
+```
+
+to download the Cornell dataset and place it in the `./datasets/cornell/` path.
+
+## Running the project in the docker image
+To run the image in interactive mode, just run
+```bash
+docker/run.sh bash
+```
+
+After that you will have a bash shell running inside the docker image with all the necessary files mounted as volumes; there you can interact with the project the same way you would on your machine.
+To run actions in the background, add `-d` as the first argument of the `run.sh` script.
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 0000000..99356c3
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+set -ue
+
+DIR=$(dirname "$(realpath $0)")
+PROJECT_ROOT="$(realpath $DIR/..)"
+DOCKER_IMAGE="neuralchat"
+
+function get_glove {
+    GLOVE_DIR=$PROJECT_ROOT/inferSent/dataset/GloVe
+    mkdir -p $GLOVE_DIR
+    if ! [ -f $GLOVE_DIR/glove.840B.300d.txt ]; then
+        curl -Lo $GLOVE_DIR/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
+        unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR
+        rm -f $GLOVE_DIR/glove.840B.300d.zip
+    else
+        echo "GloVe file is already present in $GLOVE_DIR"
+    fi
+
+    if ! [ -f $PROJECT_ROOT/inferSent/encoder/infersent1.pickle ]; then
+        curl -Lo $PROJECT_ROOT/inferSent/encoder/infersent1.pickle https://affect.media.mit.edu/neural_chat/inferSent/encoder/infersent1.pickle
+    else
+        echo "$PROJECT_ROOT/inferSent/encoder/infersent1.pickle already exists"
+    fi
+}
+
+function get_torchmoji {
+    docker run -it -v $PROJECT_ROOT/torchMoji/model:/code/torchMoji/model $DOCKER_IMAGE:latest python torchMoji/scripts/download_weights.py
+}
+
+echo -n "Do you want to download GloVe [y/N]? "
+read answer
+
+if [ "$answer" != "${answer#[Yy]}" ]; then
+    get_glove
+else
+    echo "GloVe will not be downloaded, be sure to put it in the correct location before running the docker image"
+fi
+
+cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -
+
+echo -n "Do you want to download torchMoji weights and include them in the image [y/N]? "
+read answer
+
+if [ "$answer" != "${answer#[Yy]}" ]; then
+    get_torchmoji
+    cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -
+else
+    echo "torchMoji will not be downloaded, be sure to put it in the correct location before running the docker image and add the corresponding volume to the run file"
+fi
+
+
diff --git a/docker/run.sh b/docker/run.sh
new file mode 100755
index 0000000..9299ce6
--- /dev/null
+++ b/docker/run.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+DIR=$(dirname "$(realpath $0)")
+PROJECT_ROOT="$(realpath $DIR/..)"
+DOCKER_IMAGE="neuralchat"
+
+if [[ "$(docker images -q $DOCKER_IMAGE:latest 2> /dev/null)" == "" ]]; then
+    echo "Docker image does not exist, please build it using the build.sh script"
+    exit 1
+fi
+
+# Docker volumes for extra data that is too big to be included in the docker image
+MODELS_VOLUME="$PROJECT_ROOT/datasets/:/code/datasets"
+GLOVE_VOLUME="$PROJECT_ROOT/inferSent/dataset/GloVe/:/code/inferSent/dataset/GloVe/"
+CHECKPOINTS_VOLUME="$PROJECT_ROOT/model_checkpoints/:/code/model_checkpoints"
+INFERSENT_PICKLE_VOLUME="$PROJECT_ROOT/inferSent/encoder/infersent1.pickle:/code/inferSent/encoder/infersent1.pickle"
+
+if [ $# = 0 ]; then
+    echo "Usage: $0 [-d] <command>"
+    exit 1
+fi
+
+if [ "$1" = "-d" ]; then
+    DETACH="True"
+    shift
+else
+    IT="True"
+fi
+
+# ${DETACH:+-d} and ${IT:+-it} expand to the flag only when the variable is set
+docker run --gpus all ${DETACH:+-d} ${IT:+-it} -v $MODELS_VOLUME -v $GLOVE_VOLUME -v $CHECKPOINTS_VOLUME -v $INFERSENT_PICKLE_VOLUME $DOCKER_IMAGE:latest "$@"

From 1d492568bb39421476dd41593f6108e46459a4ab Mon Sep 17 00:00:00 2001
From: Francesc Lapedriza
Date: Sun, 13 Oct 2019 03:01:57 +0200
Subject: [PATCH 2/4] Fix encoding problem

---
 encoder/models.py           | 4 ++--
 inferSent/encoder/models.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/encoder/models.py b/encoder/models.py
index 6bf95c7..5595efc 100755
--- a/encoder/models.py
+++ b/encoder/models.py
@@ -107,7 +107,7 @@ def get_w2v(self, word_dict):
         assert hasattr(self, 'w2v_path'), 'w2v path not set'
         # create word_vec with w2v vectors
         word_vec = {}
-        with open(self.w2v_path) as f:
+        with open(self.w2v_path, encoding='utf-8') as f:
             for line in f:
                 word, vec = line.split(' ', 1)
                 if word in word_dict:
@@ -120,7 +120,7 @@ def get_w2v_k(self, K):
         # create word_vec with k first w2v vectors
         k = 0
         word_vec = {}
-        with open(self.w2v_path) as f:
+        with open(self.w2v_path, encoding='utf-8') as f:
             for line in f:
                 word, vec = line.split(' ', 1)
                 if k <= K:
diff --git a/inferSent/encoder/models.py b/inferSent/encoder/models.py
index 01e33f6..c025b0a 100755
--- a/inferSent/encoder/models.py
+++ b/inferSent/encoder/models.py
@@ -140,7 +140,7 @@ def get_w2v(self, word_dict):
         assert hasattr(self, 'w2v_path'), 'w2v path not set'
         # create word_vec with w2v vectors
         word_vec = {}
-        with open(self.w2v_path) as f:
+        with open(self.w2v_path, encoding='utf-8') as f:
             for line in f:
                 word, vec = line.split(' ', 1)
                 if word in word_dict:
@@ -153,7 +153,7 @@ def get_w2v_k(self, K):
         # create word_vec with k first w2v vectors
         k = 0
         word_vec = {}
-        with open(self.w2v_path) as f:
+        with open(self.w2v_path, encoding='utf-8') as f:
             for line in f:
                 word, vec = line.split(' ', 1)
                 if k <= K:
of "A Hierarchical Latent Structure for Variational Conversation Modeling" ([code](https://github.com/ctr4si/A-Hierarchical-Latent-Structure-for-Variational-Conversation-Modeling), [paper](https://arxiv.org/abs/1804.03424), [presentation](https://vimeo.com/277671819)). -## Prerequisites +## Prerequisites for running in local machine This section includes installation of required libraries, and downloading pre-trained models. ### Installation @@ -57,7 +57,12 @@ Run the download script to downloads the pre-trained torchMoji weights [~85MB] f python torchMoji/scripts/download_weights.py ``` +## Running using Docker + +To run the project using docker, check the [corresponding](docker/README.md) file + ## Download & Preprocess Data + The following scripts will: 1. Create directories `./datasets/reddit_casual/` and `./datasets/cornell/` respectively. diff --git a/docker/README.md b/docker/README.md index b00873f..0bd1919 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,13 +4,13 @@ To be able to run this project using docker, you will need [Docker](https://docs ## Building the docker image -To create the docker image just run the `build.sh` script located in this folder, you will be prompted to download GloVe during the process, this won't be included in the docker image to avoid yielding a huge image, instead, it will be mounted as a volume in the container when running it. +To create the docker image just run the `build.sh` script located in this folder, you will be prompted to download GloVe during the process, this won't be included in the docker image to avoid yielding a huge image, instead, it will be stored in the host machine and mounted as a volume in the container when running it. ## Before running the docker image -Before running the project inside docker, all extra files mentioned in the base `README` file should be located in the same location mentioned in the `README`, they will be mounted as volumes. +Before running the project inside docker, all extra files mentioned in the [base `README` file](../README.md) should be located in the same location mentioned there, they will be mounted as volumes. -You can either download them from your host system or execute the `run.sh` script with the same commands to download them using the docker image without having to install the possible dependencies. +You can either download them from your host system or execute the `run.sh` script with the same commands stated in the tutorial to download them using the docker image without having to potentially install any dependency in your host machine. For example, you can run: @@ -18,7 +18,7 @@ For example, you can run: docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut ``` -To download the Cornell dataset and place it in the `./datasets/cornell/` path +To download the Cornell dataset and place it in the `./datasets/cornell/` path in the host machine. The `datasets` folder will be mounted as a volume in the docker container when using the `run.sh` script. ## Running the project in the docker image To run the image in interactive mode, just run @@ -29,3 +29,18 @@ docker/run.sh bash After that you will have a bash shell running inside the docker image with all the necessary files mounted as volumes, there you can interact with the project the same way you would do it in your machine. To run actions in background, add -d as the first argument of the `run.sh` script. 
+
+## TL;DR
+
+In order to have a docker image ready to run with the default configuration and the Cornell dataset, just follow these steps:
+
+```bash
+git clone git@github.com:natashamjaques/neural_chat.git
+cd neural_chat
+docker/build.sh
+# Answer y to all questions
+docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
+docker/run.sh bash
+```
+
+Now you are running a terminal inside the docker image with all the dependencies (including CUDA and the Python requirements) installed, ready to start training the model without having to install anything besides Docker and the Nvidia runtime on your host machine.
\ No newline at end of file

From f120a8649bffdb5e1a5554370fdc52a5d754624d Mon Sep 17 00:00:00 2001
From: Francesc Lapedriza
Date: Fri, 23 Oct 2020 22:55:30 +0200
Subject: [PATCH 4/4] Add custom changes

---
 dataset_preprocess.py            |  73 +++++++++++++------
 docker/Dockerfile                |  26 +++++--
 docker/run.sh                    |   2 +-
 model/selfplay.py                | 116 +++++++++++++++++++++++++++++++
 model/solver.py                  |   8 ++-
 torchMoji/api/dataset_emojize.py |   2 +-
 6 files changed, 193 insertions(+), 34 deletions(-)
 create mode 100644 model/selfplay.py

diff --git a/dataset_preprocess.py b/dataset_preprocess.py
index 784c7ac..e0362f3 100644
--- a/dataset_preprocess.py
+++ b/dataset_preprocess.py
@@ -1,20 +1,22 @@
 # Download/Preprocess data-sets
-from multiprocessing import Pool
 import argparse
+import json
+import os
 import pickle
 import random
-import os
-import json
+import tarfile
+from multiprocessing import Pool
+from pathlib import Path
+from subprocess import call
 from urllib.request import urlretrieve
 from zipfile import ZipFile
-from pathlib import Path
+
 from tqdm import tqdm
 
+print('import things')
 from model.utils import Tokenizer, Vocab, PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, pad_sentences
-from subprocess import call
-
-import tarfile
+print('start')
 project_dir = Path(__file__).resolve().parent
 datasets_dir = project_dir.joinpath('datasets')
 cornell_dir = datasets_dir.joinpath('cornell')
@@ -32,13 +34,17 @@ def shortcut_download(dataset, compression_type='tar.gz'):
         compression_type = 'tar.gz'
 
     if dataset == 'reddit_casual' and compression_type == 'zip':
-        print('Warning! Zip format is not supported for reddit casual dataset due to file size. Changing to tar.gz')
+        print(
+            'Warning! Zip format is not supported for reddit casual dataset due to file size. '
+            'Changing to tar.gz'
+        )
         compression_type = 'tar.gz'
 
     if not os.path.exists(datasets_dir):
         os.makedirs(datasets_dir)
 
-    compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/{dataset}_preprocessed.{compression_type}'
+    compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/' \
+                     f'{dataset}_preprocessed.{compression_type}'
 
     compressed_file_dir = datasets_dir.joinpath(dataset)
     compressed_file_path = datasets_dir.joinpath(f'{dataset}_preprocessed.{compression_type}')
@@ -72,7 +78,6 @@ def shortcut_download(dataset, compression_type='tar.gz'):
 
     print('Directory already exists. Aborting download.')
 
-
 def prepare_reddit_casual_data():
     """Download and unpack dialogs"""
@@ -252,14 +257,15 @@ def pad_conversation(conversation):
 
 def load_conversations_cornell(cornell_dir):
     # Download and extract dialogs if necessary.
-    prepare_cornell_data()
-
+    if download:
+        prepare_cornell_data()
+
+    cornell_path = cornell_dir.joinpath(f'lines_{recall}/{toxic}')
     print("Loading lines")
-    lines = load_lines(cornell_dir.joinpath("movie_lines.txt"))
+    lines = load_lines(cornell_path.joinpath("movie_lines.txt"))
     print('Number of lines:', len(lines))
     print("Loading conversations...")
-    conversations = load_conversations(cornell_dir.joinpath("movie_conversations.txt"), lines)
+    conversations = load_conversations(cornell_path.joinpath("movie_conversations.txt"), lines)
     print('Number of conversations:', len(conversations))
 
     return conversations
@@ -287,6 +293,15 @@ def load_conversations_cornell(cornell_dir):
     # Input dataset
     parser.add_argument('--dataset', type=str, default='cornell')
 
+    # Does the dataset need to be downloaded
+    parser.add_argument('--download', default=False, action='store_true')
+
+    # Use toxic or non toxic dataset
+    parser.add_argument('--classified-as', type=str, default='notoxic')
+
+    # Which Recall dataset to use
+    parser.add_argument('--recall', type=str, default='0.8')
+
     # Bypassing pre-processing by directly downloading all the files
     parser.add_argument('--shortcut', action="store_true", default=False,
                         help="Whether to download the preprocessed dataset instead.")
@@ -300,6 +315,9 @@ def load_conversations_cornell(cornell_dir):
     max_vocab_size = args.max_vocab_size
     min_freq = args.min_vocab_frequency
     n_workers = args.n_workers
+    download = args.download
+    toxic = args.classified_as
+    recall = args.recall
 
     if args.shortcut:
         shortcut_download(args.dataset, args.shortcut_compression_type)
@@ -319,10 +337,12 @@ def load_conversations_cornell(cornell_dir):
     print('Train/Valid/Test Split')
     train, valid, test = train_valid_test_split_by_conversation(conversations, split_ratio)
 
+
     def to_pickle(obj, path):
         with open(path, 'wb') as f:
             pickle.dump(obj, f)
 
+
     for split_type, conv_objects in [('train', train), ('valid', valid), ('test', test)]:
         print(f'Processing {split_type} dataset...')
         split_data_dir = dataset_dir.joinpath(split_type)
 
         print(f'Tokenize.. (n_workers={n_workers})')
+
         def _tokenize_conversation(conv):
             return tokenize_conversation(conv['lines'])
+
         with Pool(n_workers) as pool:
             conversations = list(tqdm(pool.imap(_tokenize_conversation, conv_objects),
-                                            total=len(conv_objects)))
+                                      total=len(conv_objects)))
 
         conversation_length = [min(len(conv['lines']), max_conv_len) for conv in conv_objects]
 
-        raw_sentences = [[line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
-                         for conv in conv_objects]
+        raw_sentences = [
+            [line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
+            for conv in conv_objects]
 
         sentences, sentence_length = pad_sentences(
             conversations,
@@ -355,31 +378,35 @@ def _tokenize_conversation(conv):
         to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))
 
         if split_type == 'train':
-
             print('Save Vocabulary...')
             vocab = Vocab(tokenizer)
             vocab.add_dataframe(conversations)
             vocab.update(max_size=max_vocab_size, min_freq=min_freq)
 
             print('Vocabulary size: ', len(vocab))
-            vocab.pickle(dataset_dir.joinpath('word2id.pkl'), dataset_dir.joinpath('id2word.pkl'))
+            vocab.pickle(dataset_dir.joinpath('word2id.pkl'),
+                         dataset_dir.joinpath('id2word.pkl'))
 
     print('Done downloading and pre-processing dataset.')
 
     print('Inferring InferSent encoding for dataset...')
-    infersent_export_script = os.path.join(os.path.join('inferSent', 'api'), 'export_dataset_embeddings.py')
+    infersent_export_script = os.path.join(os.path.join('inferSent', 'api'),
+                                           'export_dataset_embeddings.py')
     for split_type in ['train', 'valid', 'test']:
         filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
         if split_type == 'train':
            call(["python", infersent_export_script, f'--filepath={filepath}', '--streaming'])
         else:
            call(["python", infersent_export_script, f'--filepath={filepath}'])
-    infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'), 'reduce_embeddings_dimension.py')
-    call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca', '--exportembeddings'])
+    infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'),
+                                              'reduce_embeddings_dimension.py')
+    call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca',
+          '--exportembeddings'])
     print('Done exporting InferSent embedding.')
 
     print('Inferring TorchMoji encoding for dataset...')
-    torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'), 'dataset_emojize.py')
+    torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'),
+                                           'dataset_emojize.py')
     for split_type in ['train', 'valid', 'test']:
         filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
         call(["python", torchmoji_export_script, f'--filepath={filepath}'])
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1c00767..c9793c9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,19 +1,33 @@
-FROM nvidia/cuda:10.0-runtime
-
+FROM nvidia/cuda:10.0-cudnn7-runtime
+ENV PYTHONUNBUFFERED=1
 WORKDIR /code
 
-RUN apt-get -y update && apt-get install -y build-essential python3.6 python3.6-dev python3-pip wget
+#RUN apt-get -y update && \
+#    apt-get install -y --no-install-recommends software-properties-common && \
+#    add-apt-repository -y ppa:deadsnakes/ppa && \
+#    apt-get --purge remove -y software-properties-common && \
+#    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get -y update && \
+    apt-get install -y --no-install-recommends wget python3.6 python3-pip && \
+    rm -rf /var/lib/apt/lists/*
 
 RUN ln -s /usr/bin/python3.6 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip
 
-RUN pip install --no-cache-dir --upgrade ipython
+RUN python -m pip install --no-cache-dir --upgrade ipython
 
 RUN pip install --no-cache-dir torch===0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html
 
 COPY requirements.txt /build/requirements.txt
 
-RUN pip install --no-cache-dir -r /build/requirements.txt
+RUN apt-get -y update && \
+    apt-get -y install --no-install-recommends build-essential python3.6-dev && \
+    python -m pip install --no-cache-dir -r /build/requirements.txt && \
+    apt-get -y --purge remove build-essential && \
+    rm -rf /var/lib/apt/lists/*
 
 COPY . /code/
 
-RUN pip install -e .
+RUN python -m pip install -e .
+
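+# Port published by run.sh (-p 0.0.0.0:6009:6009)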
+EXPOSE 6009
diff --git a/docker/run.sh b/docker/run.sh
index 9299ce6..d1c2373 100755
--- a/docker/run.sh
+++ b/docker/run.sh
@@ -27,4 +27,4 @@ else
     IT="True"
 fi
 
 # ${DETACH:+-d} and ${IT:+-it} expand to the flag only when the variable is set
-docker run --gpus all ${DETACH:+-d} ${IT:+-it} -v $MODELS_VOLUME -v $GLOVE_VOLUME -v $CHECKPOINTS_VOLUME -v $INFERSENT_PICKLE_VOLUME $DOCKER_IMAGE:latest "$@"
+docker run --gpus '"device=1"' ${DETACH:+-d} ${IT:+-it} -p 0.0.0.0:6009:6009 -v $MODELS_VOLUME -v $GLOVE_VOLUME -v $CHECKPOINTS_VOLUME -v $INFERSENT_PICKLE_VOLUME $DOCKER_IMAGE:latest "$@"
diff --git a/model/selfplay.py b/model/selfplay.py
new file mode 100644
index 0000000..3a77bab
--- /dev/null
+++ b/model/selfplay.py
@@ -0,0 +1,116 @@
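+"""Self-play script: loads a trained checkpoint, seeds a conversation with each
+greeting in the `greetings` list below, lets the model respond to itself via
+`self_play`, and writes the generated conversations to a CSV file."""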
Can" + \ + "be None, 'priority', or 'length'") + + # Turn on debugging outputs + parser.add_argument('--debug', action='store_true') + parser.set_defaults(debug=False) + + # For interacting with rl models + parser.add_argument('--load_rl_ckpt', action='store_true') + parser.set_defaults(load_rl_ckpt=False) + parser.add_argument('--rl_ckpt_epoch', type=int, default=None) + + kwargs = parser.parse_args() + kw_dict = vars(kwargs) + kw_dict['mode'] = 'test' + + config = get_config_from_dir(kwargs.checkpoint, **kw_dict) + config.beam_size = 5 + + print('Loading Vocabulary...') + vocab = Vocab() + vocab.load(config.word2id_path, config.id2word_path) + print(f'Vocabulary size: {vocab.vocab_size}') + + config.vocab_size = vocab.vocab_size + + # If checkpoint is for an emotion model, load that pickle file + emotion_sentences = None + if config.emotion: + emotion_sentences = load_pickle(config.emojis_path) + + # Load infersent embeddings if necessary + infersent_sentences = None + if config.infersent: + print('Loading infersent sentence embeddings...') + infersent_sentences = load_pickle(config.infersent_path) + embedding_size = infersent_sentences[0][0].shape[0] + config.infersent_output_size = embedding_size + + data_loader = get_loader( + sentences=load_pickle(config.sentences_path), + conversation_length=load_pickle(config.conversation_length_path), + sentence_length=load_pickle(config.sentence_length_path), + vocab=vocab, + batch_size=config.batch_size, + emojis=emotion_sentences, + infersent=infersent_sentences) + + if config.model in VariationalModels: + solver = VariationalSolver(config, None, data_loader, vocab=vocab, is_train=False) + else: + solver = Solver(config, None, data_loader, vocab=vocab, is_train=False) + + solver.build() + with open(f'/code/datasets/conversations_{kwargs.kind}.csv', 'w') as f: + writer = csv.writer(f) + for i in greetings: + conv = solver.self_play(conversation_length=kwargs.conversation_length, + max_sentence_length=kwargs.max_sentence_length, + max_conversation_length=kwargs.max_conversation_length, + sample_by=kwargs.sample_by, + print_history=kwargs.print_history, + first_interaction=i) + writer.writerows([[c] for c in conv]) diff --git a/model/solver.py b/model/solver.py index 3352b77..95fd9ea 100755 --- a/model/solver.py +++ b/model/solver.py @@ -449,8 +449,8 @@ def process_user_input(self, sentences, max_sentence_length=30): return coded, lens def self_play(self, conversation_length=10, max_sentence_length=30, max_conversation_length=5, - sample_by='priority', print_history=False): - context_sentences = [''] + sample_by='priority', print_history=False, first_interaction=''): + context_sentences = [first_interaction] for i in range(conversation_length): gen_response = self.generate_response_to_input( @@ -463,7 +463,9 @@ def self_play(self, conversation_length=10, max_sentence_length=30, max_conversa if print_history: for s in context_sentences: print(s) - # removing the first empty prompt + print('--------------------') + + # removing the first empty prompt return context_sentences[1:] def interact(self, max_sentence_length=30, max_conversation_length=5, diff --git a/torchMoji/api/dataset_emojize.py b/torchMoji/api/dataset_emojize.py index b549ded..7494d57 100644 --- a/torchMoji/api/dataset_emojize.py +++ b/torchMoji/api/dataset_emojize.py @@ -90,7 +90,7 @@ def top_elements(array, k): # Map to emojis emojis = map(lambda x: EMOJIS[x], emoji_ids) print('Retokenized: ', st.to_sentence(list(tokenized[0]))) - print(emoji.emojize("{} {}".format(sentence, ' 
diff --git a/torchMoji/api/dataset_emojize.py b/torchMoji/api/dataset_emojize.py
index b549ded..7494d57 100644
--- a/torchMoji/api/dataset_emojize.py
+++ b/torchMoji/api/dataset_emojize.py
@@ -90,7 +90,7 @@ def top_elements(array, k):
         # Map to emojis
         emojis = map(lambda x: EMOJIS[x], emoji_ids)
         print('Retokenized: ', st.to_sentence(list(tokenized[0])))
-        print(emoji.emojize("{} {}".format(sentence, ' '.join(emojis)), use_aliases=True))
+        # print(emoji.emojize("{} {}".format(sentence, ' '.join(emojis)), use_aliases=True))
 
         if idx % args.step == 0:
             print('Step: ', idx)