Add docker support #3

Open · wants to merge 4 commits into base: master
8 changes: 8 additions & 0 deletions .dockerignore
@@ -0,0 +1,8 @@
LICENCE
.gitignore
README.md
docker
inferSent/dataset/GloVe
inferSent/encoder/infersent1.pickle
datasets
model_checkpoints
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@ model/results
/etc/
datasets/
/cornell_movie_dialogue/
model_checkpoints/
*.orig
*.lprof
.idea
7 changes: 6 additions & 1 deletion README.md
@@ -5,7 +5,7 @@ This repository is accompanied by [Neural Chat Web](https://github.com/asmadotgh

This code is inspired by and built off of "A Hierarchical Latent Structure for Variational Conversation Modeling" ([code](https://github.com/ctr4si/A-Hierarchical-Latent-Structure-for-Variational-Conversation-Modeling), [paper](https://arxiv.org/abs/1804.03424), [presentation](https://vimeo.com/277671819)).

## Prerequisites
## Prerequisites for running on a local machine
This section covers installing the required libraries and downloading the pre-trained models.

### Installation
@@ -57,7 +57,12 @@ Run the download script to download the pre-trained torchMoji weights [~85MB] f
python torchMoji/scripts/download_weights.py
```

## Running with Docker

To run the project with Docker, see the [corresponding README](docker/README.md).
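
A minimal quick start with the helper scripts described there might look like this (the prompts and defaults are explained in `docker/README.md`):

```bash
docker/build.sh     # build the image; you will be prompted about downloading GloVe and torchMoji weights
docker/run.sh bash  # open an interactive shell inside the container
```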

## Download & Preprocess Data

The following scripts will:

1. Create directories `./datasets/reddit_casual/` and `./datasets/cornell/` respectively.
Expand Down
73 changes: 50 additions & 23 deletions dataset_preprocess.py
@@ -1,20 +1,22 @@
# Download/Preprocess data-sets

from multiprocessing import Pool
import argparse
import json
import os
import pickle
import random
import os
import json
import tarfile
from multiprocessing import Pool
from pathlib import Path
from subprocess import call
from urllib.request import urlretrieve
from zipfile import ZipFile
from pathlib import Path

from tqdm import tqdm
print('import things')
from model.utils import Tokenizer, Vocab, PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, pad_sentences
from subprocess import call

import tarfile

print('start')
project_dir = Path(__file__).resolve().parent
datasets_dir = project_dir.joinpath('datasets')
cornell_dir = datasets_dir.joinpath('cornell')
@@ -32,13 +34,17 @@ def shortcut_download(dataset, compression_type='tar.gz'):
compression_type = 'tar.gz'

if dataset == 'reddit_casual' and compression_type == 'zip':
print('Warning! Zip format is not supported for reddit casual dataset due to file size. Changing to tar.gz')
print(
'Warning! Zip format is not supported for reddit casual dataset due to file size. '
'Changing to tar.gz'
)
compression_type = 'tar.gz'

if not os.path.exists(datasets_dir):
os.makedirs(datasets_dir)

compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/{dataset}_preprocessed.{compression_type}'
compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/' \
f'{dataset}_preprocessed.{compression_type}'
compressed_file_dir = datasets_dir.joinpath(dataset)
compressed_file_path = datasets_dir.joinpath(f'{dataset}_preprocessed.{compression_type}')

@@ -72,7 +78,6 @@ def shortcut_download(dataset, compression_type='tar.gz'):
print('Directory already exists. Aborting download.')



def prepare_reddit_casual_data():
"""Download and unpack dialogs"""

@@ -252,14 +257,15 @@ def pad_conversation(conversation):

def load_conversations_cornell(cornell_dir):
# Download and extract dialogs if necessary.
prepare_cornell_data()

if download:
prepare_cornell_data()
cornell_path = cornell_dir.joinpath(f'lines_{recall}/{toxic}')
print("Loading lines")
lines = load_lines(cornell_dir.joinpath("movie_lines.txt"))
lines = load_lines(cornell_path.joinpath("movie_lines.txt"))
print('Number of lines:', len(lines))

print("Loading conversations...")
conversations = load_conversations(cornell_dir.joinpath("movie_conversations.txt"), lines)
conversations = load_conversations(cornell_path.joinpath("movie_conversations.txt"), lines)
print('Number of conversations:', len(conversations))
return conversations

@@ -287,6 +293,15 @@ def load_conversations_cornell(cornell_dir):
# Input dataset
parser.add_argument('--dataset', type=str, default='cornell')

# Does the dataset need to be downloaded
parser.add_argument('--download', default=False, action='store_true')

# Use toxic or non toxic dataset
parser.add_argument('--classified-as', type=str, default='notoxic')

# Which Recall dataset to use
parser.add_argument('--recall', type=str, default='0.8')

# Bypassing pre-processing by directly downloading all the files
parser.add_argument('--shortcut', action="store_true", default=False,
help="Whether to download the preprocessed dataset instead.")
@@ -300,6 +315,9 @@ def load_conversations_cornell(cornell_dir):
max_vocab_size = args.max_vocab_size
min_freq = args.min_vocab_frequency
n_workers = args.n_workers
download = args.download
toxic = args.classified_as
recall = args.recall

if args.shortcut:
shortcut_download(args.dataset, args.shortcut_compression_type)
@@ -319,29 +337,34 @@ def load_conversations_cornell(cornell_dir):
print('Train/Valid/Test Split')
train, valid, test = train_valid_test_split_by_conversation(conversations, split_ratio)


def to_pickle(obj, path):
with open(path, 'wb') as f:
pickle.dump(obj, f)


for split_type, conv_objects in [('train', train), ('valid', valid), ('test', test)]:
print(f'Processing {split_type} dataset...')
split_data_dir = dataset_dir.joinpath(split_type)
split_data_dir.mkdir(exist_ok=True)

print(f'Tokenize.. (n_workers={n_workers})')


def _tokenize_conversation(conv):
return tokenize_conversation(conv['lines'])


with Pool(n_workers) as pool:
conversations = list(tqdm(pool.imap(_tokenize_conversation, conv_objects),
total=len(conv_objects)))
total=len(conv_objects)))

conversation_length = [min(len(conv['lines']), max_conv_len)
for conv in conv_objects]

raw_sentences = [[line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
for conv in conv_objects]
raw_sentences = [
[line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
for conv in conv_objects]

sentences, sentence_length = pad_sentences(
conversations,
@@ -355,31 +378,35 @@ def _tokenize_conversation(conv):
to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

if split_type == 'train':

print('Save Vocabulary...')
vocab = Vocab(tokenizer)
vocab.add_dataframe(conversations)
vocab.update(max_size=max_vocab_size, min_freq=min_freq)

print('Vocabulary size: ', len(vocab))
vocab.pickle(dataset_dir.joinpath('word2id.pkl'), dataset_dir.joinpath('id2word.pkl'))
vocab.pickle(dataset_dir.joinpath('word2id.pkl'),
dataset_dir.joinpath('id2word.pkl'))

print('Done downloading and pre-processing dataset.')

print('Inferring InferSent encoding for dataset...')
infersent_export_script = os.path.join(os.path.join('inferSent', 'api'), 'export_dataset_embeddings.py')
infersent_export_script = os.path.join(os.path.join('inferSent', 'api'),
'export_dataset_embeddings.py')
for split_type in ['train', 'valid', 'test']:
filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
if split_type == 'train':
call(["python", infersent_export_script, f'--filepath={filepath}', '--streaming'])
else:
call(["python", infersent_export_script, f'--filepath={filepath}'])
infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'), 'reduce_embeddings_dimension.py')
call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca', '--exportembeddings'])
infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'),
'reduce_embeddings_dimension.py')
call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca',
'--exportembeddings'])
print('Done exporting InferSent embedding.')

print('Inferring TorchMoji encoding for dataset...')
torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'), 'dataset_emojize.py')
torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'),
'dataset_emojize.py')
for split_type in ['train', 'valid', 'test']:
filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
call(["python", torchmoji_export_script, f'--filepath={filepath}'])
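
As a usage sketch for the new `--download`, `--classified-as`, and `--recall` flags added above (assuming the classified Cornell line files are laid out under `lines_<recall>/<classified-as>`, as `load_conversations_cornell` expects):

```bash
# Download the raw Cornell data and preprocess the non-toxic, recall-0.8 subset
# (flag names and defaults are taken from the argparse additions above)
python dataset_preprocess.py --dataset=cornell --download --classified-as=notoxic --recall=0.8
```
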
33 changes: 33 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,33 @@
FROM nvidia/cuda:10.0-cudnn7-runtime
ENV PYTHONUNBUFFERED=1
WORKDIR /code

#RUN apt-get -y update && \
# apt-get install -y --no-install-recommends software-properties-common && \
# add-apt-repository -y ppa:deadsnakes/ppa && \
# apt-get --purge remove -y software-properties-common && \
# rm -rf /var/lib/apt/lists/*

RUN apt-get -y update && \
apt-get install -y --no-install-recommends wget python3.6 python3-pip && \
rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python3.6 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip

RUN python -m pip install --no-cache-dir --upgrade ipython

RUN pip install --no-cache-dir torch===0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html

COPY requirements.txt /build/requirements.txt

RUN apt-get -y update && \
apt-get -y install --no-install-recommends build-essential python3.6-dev && \
python -m pip install --no-cache-dir -r /build/requirements.txt && \
apt-get -y --purge remove build-essential && \
rm -rf /var/lib/apt/lists/*

COPY . /code/

RUN python -m pip install -e .

EXPOSE 6009
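
For reference, a rough sketch of building and running this image by hand, mirroring what the `build.sh` and `run.sh` helpers below automate; the `--gpus all` selection is an assumption (the `run.sh` script pins `device=1`), and the volume mounts from `run.sh` are omitted here:

```bash
# Build the image from the repository root (the same command build.sh issues)
docker build -f docker/Dockerfile -t neuralchat:latest .

# Run it interactively, publishing the port exposed above
docker run --gpus all -it -p 6009:6009 neuralchat:latest bash
```
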
46 changes: 46 additions & 0 deletions docker/README.md
@@ -0,0 +1,46 @@
# Running the project using Docker

To run this project using Docker, you will need [Docker](https://docs.docker.com/install/) and the [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker) installed on your machine.

## Building the Docker image

To create the Docker image, just run the `build.sh` script located in this folder. You will be prompted to download GloVe during the process; it will not be included in the Docker image, to avoid producing a huge image. Instead, it will be stored on the host machine and mounted as a volume into the container when you run it.

## Before running the Docker image

Before running the project inside Docker, all the extra files mentioned in the [base `README` file](../README.md) should be placed in the locations described there; they will be mounted as volumes.

You can either download them from your host system, or execute the `run.sh` script with the same commands stated in the tutorial to download them using the Docker image, without having to install any additional dependencies on your host machine.

For example, you can run:

```bash
docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
```

This downloads the Cornell dataset and places it in `./datasets/cornell/` on the host machine. The `datasets` folder is mounted as a volume in the Docker container when using the `run.sh` script.

## Running the project in the Docker image

To run the image in interactive mode, just run:
```bash
docker/run.sh bash
```

After that, you will have a bash shell running inside the Docker container with all the necessary files mounted as volumes; there you can interact with the project the same way you would on your own machine.

To run commands in the background, add `-d` as the first argument of the `run.sh` script, as in the sketch below.
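
For example, a long-running preprocessing job could be started in the background like this (any command accepted by `run.sh` works the same way):

```bash
# Detached run: the container keeps working after the shell returns
docker/run.sh -d python dataset_preprocess.py --dataset=reddit_casual --shortcut
```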

## TL;DR

To get a Docker image ready to run with the default configuration and the Cornell dataset, just follow these steps:

```bash
git clone git@github.com:natashamjaques/neural_chat.git
cd neural_chat
docker/build.sh
# Answer y to all questions
docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
docker/run.sh bash
```

Now you are running a terminal inside the Docker container with all the dependencies (including CUDA and the Python requirements) installed, ready to start training the model without having to install anything besides Docker and the NVIDIA runtime on your host machine.
51 changes: 51 additions & 0 deletions docker/build.sh
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
set -ue

DIR=$(dirname "$(realpath $0)")
PROJECT_ROOT="$(realpath $DIR/..)"
DOCKER_IMAGE="neuralchat"

function get_glove {
GLOVE_DIR=$PROJECT_ROOT/inferSent/dataset/GloVe
mkdir -p $GLOVE_DIR
if ! [ -f $GLOVE_DIR/glove.840B.300d.txt ]; then
curl -Lo $GLOVE_DIR/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR
rm -f $GLOVE_DIR/glove.840B.300d.zip
else
echo "GloVe file is already present in $GLOVE_DIR"
fi

if ! [ -f $PROJECT_ROOT/inferSent/encoder/infersent1.pickle ]; then
curl -Lo $PROJECT_ROOT/inferSent/encoder/infersent1.pickle https://affect.media.mit.edu/neural_chat/inferSent/encoder/infersent1.pickle
else
echo "$PROJECT_ROOT/inferSent/encoder/infersent1.pickle already exists"
fi
}

function get_torchmoji {
docker run -it -v $PROJECT_ROOT/torchMoji/model:/code/torchMoji/model $DOCKER_IMAGE:latest python torchMoji/scripts/download_weights.py
}

echo -n "Do you want to download GloVe[y/N]? "
read answer

if [ "$answer" != "${answer#[Yy]}" ] ;then
get_glove
else
echo "GLoVe will not be downloaded, be sure to put it in the correct location before running the docker image"
fi

cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -

echo -n "Do you want to download torchMoji weights and include them in the image [y/N]? "
read answer

if [ "$answer" != "${answer#[Yy]}" ] ;then
get_torchmoji
cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -
else
echo "torchMoji will not be downloaded, be sure to put it in the correct location before running the docker image and add the corresponding volume to the run file"
fi


30 changes: 30 additions & 0 deletions docker/run.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

DIR=$(dirname "$(realpath $0)")
PROJECT_ROOT="$(realpath $DIR/..)"
DOCKER_IMAGE="neuralchat"

if [[ "$(docker images -q $DOCKER_IMAGE:latest 2> /dev/null)" == "" ]]; then
echo "Docker image does not exist, please build it using the build.sh script"
exit 1
fi

# Docker volumes for extra data that is too big to be included in the docker image
MODELS_VOLUME="$PROJECT_ROOT/datasets/:/code/datasets"
GLOVE_VOLUME="$PROJECT_ROOT/inferSent/dataset/GloVe/:/code/inferSent/dataset/GloVe/"
CHECKPOINTS_VOLUME="$PROJECT_ROOT/model_checkpoints/:/code/model_checkpoints"
INFERSENT_PICKLE_VOLUME="$PROJECT_ROOT/inferSent/encoder/infersent1.pickle:/code/inferSent/encoder/infersent1.pickle"

if [ $# = 0 ]; then
echo "Usage: $0 [-d] <command>"
exit 1
fi

if [ $1 = "-d" ]; then
DETACH="True"
shift
else
IT="True"
fi

docker run --gpus '"device=1"' ${DETACH:+-d} ${IT:+-it} -p 0.0.0.0:6009:6009 -v "$MODELS_VOLUME" -v "$GLOVE_VOLUME" -v "$CHECKPOINTS_VOLUME" -v "$INFERSENT_PICKLE_VOLUME" "$DOCKER_IMAGE:latest" "$@"