Add docker support #3

Open · wants to merge 4 commits into base: master
8 changes: 8 additions & 0 deletions .dockerignore
@@ -0,0 +1,8 @@
LICENCE
.gitignore
README.md
docker
inferSent/dataset/GloVe
inferSent/encoder/infersent1.pickle
datasets
model_checkpoints
1 change: 1 addition & 0 deletions .gitignore
@@ -17,6 +17,7 @@ model/results
/etc/
datasets/
/cornell_movie_dialogue/
model_checkpoints/
*.orig
*.lprof
.idea
7 changes: 6 additions & 1 deletion README.md
@@ -5,7 +5,7 @@ This repository is accompanied by [Neural Chat Web](https://github.com/asmadotgh

This code is inspired by and built off of "A Hierarchical Latent Structure for Variational Conversation Modeling" ([code](https://github.com/ctr4si/A-Hierarchical-Latent-Structure-for-Variational-Conversation-Modeling), [paper](https://arxiv.org/abs/1804.03424), [presentation](https://vimeo.com/277671819)).

## Prerequisites
## Prerequisites for running on a local machine
This section covers installing the required libraries and downloading the pre-trained models.

### Installation
@@ -57,7 +57,12 @@ Run the download script to download the pre-trained torchMoji weights [~85MB] f
python torchMoji/scripts/download_weights.py
```

## Running with Docker

To run the project with Docker, see the [corresponding README](docker/README.md).
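
A minimal quick start with the helper scripts described there might look like this (the prompts and defaults are explained in `docker/README.md`):

```bash
docker/build.sh     # build the image; you will be prompted about downloading GloVe and torchMoji weights
docker/run.sh bash  # open an interactive shell inside the container
```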

## Download & Preprocess Data

The following scripts will:

1. Create directories `./datasets/reddit_casual/` and `./datasets/cornell/` respectively.
Expand Down
73 changes: 50 additions & 23 deletions dataset_preprocess.py
@@ -1,20 +1,22 @@
# Download/Preprocess data-sets

from multiprocessing import Pool
import argparse
import json
import os
import pickle
import random
import os
import json
import tarfile
from multiprocessing import Pool
from pathlib import Path
from subprocess import call
from urllib.request import urlretrieve
from zipfile import ZipFile
from pathlib import Path

from tqdm import tqdm
print('import things')
from model.utils import Tokenizer, Vocab, PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, pad_sentences
from subprocess import call

import tarfile

print('start')
project_dir = Path(__file__).resolve().parent
datasets_dir = project_dir.joinpath('datasets')
cornell_dir = datasets_dir.joinpath('cornell')
@@ -32,13 +34,17 @@ def shortcut_download(dataset, compression_type='tar.gz'):
compression_type = 'tar.gz'

if dataset == 'reddit_casual' and compression_type == 'zip':
print('Warning! Zip format is not supported for reddit casual dataset due to file size. Changing to tar.gz')
print(
'Warning! Zip format is not supported for reddit casual dataset due to file size. '
'Changing to tar.gz'
)
compression_type = 'tar.gz'

if not os.path.exists(datasets_dir):
os.makedirs(datasets_dir)

compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/{dataset}_preprocessed.{compression_type}'
compressed_url = f'https://affect.media.mit.edu/neural_chat/datasets/' \
f'{dataset}_preprocessed.{compression_type}'
compressed_file_dir = datasets_dir.joinpath(dataset)
compressed_file_path = datasets_dir.joinpath(f'{dataset}_preprocessed.{compression_type}')

@@ -72,7 +78,6 @@ def shortcut_download(dataset, compression_type='tar.gz'):
print('Directory already exists. Aborting download.')



def prepare_reddit_casual_data():
"""Download and unpack dialogs"""

@@ -252,14 +257,15 @@ def pad_conversation(conversation):

def load_conversations_cornell(cornell_dir):
# Download and extract dialogs if necessary.
prepare_cornell_data()

if download:
prepare_cornell_data()
cornell_path = cornell_dir.joinpath(f'lines_{recall}/{toxic}')
print("Loading lines")
lines = load_lines(cornell_dir.joinpath("movie_lines.txt"))
lines = load_lines(cornell_path.joinpath("movie_lines.txt"))
print('Number of lines:', len(lines))

print("Loading conversations...")
conversations = load_conversations(cornell_dir.joinpath("movie_conversations.txt"), lines)
conversations = load_conversations(cornell_path.joinpath("movie_conversations.txt"), lines)
print('Number of conversations:', len(conversations))
return conversations

@@ -287,6 +293,15 @@ def load_conversations_cornell(cornell_dir):
# Input dataset
parser.add_argument('--dataset', type=str, default='cornell')

# Does the dataset need to be downloaded
parser.add_argument('--download', default=False, action='store_true')

# Use toxic or non toxic dataset
parser.add_argument('--classified-as', type=str, default='notoxic')

# Which Recall dataset to use
parser.add_argument('--recall', type=str, default='0.8')

# Bypassing pre-processing by directly downloading all the files
parser.add_argument('--shortcut', action="store_true", default=False,
help="Whether to download the preprocessed dataset instead.")
@@ -300,6 +315,9 @@ def load_conversations_cornell(cornell_dir):
max_vocab_size = args.max_vocab_size
min_freq = args.min_vocab_frequency
n_workers = args.n_workers
download = args.download
toxic = args.classified_as
recall = args.recall

if args.shortcut:
shortcut_download(args.dataset, args.shortcut_compression_type)
@@ -319,29 +337,34 @@ def load_conversations_cornell(cornell_dir):
print('Train/Valid/Test Split')
train, valid, test = train_valid_test_split_by_conversation(conversations, split_ratio)


def to_pickle(obj, path):
with open(path, 'wb') as f:
pickle.dump(obj, f)


for split_type, conv_objects in [('train', train), ('valid', valid), ('test', test)]:
print(f'Processing {split_type} dataset...')
split_data_dir = dataset_dir.joinpath(split_type)
split_data_dir.mkdir(exist_ok=True)

print(f'Tokenize.. (n_workers={n_workers})')


def _tokenize_conversation(conv):
return tokenize_conversation(conv['lines'])


with Pool(n_workers) as pool:
conversations = list(tqdm(pool.imap(_tokenize_conversation, conv_objects),
total=len(conv_objects)))
total=len(conv_objects)))

conversation_length = [min(len(conv['lines']), max_conv_len)
for conv in conv_objects]

raw_sentences = [[line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
for conv in conv_objects]
raw_sentences = [
[line['text'] for line in conv['lines'][0:min(len(conv['lines']), max_conv_len)]]
for conv in conv_objects]

sentences, sentence_length = pad_sentences(
conversations,
@@ -355,31 +378,35 @@ def _tokenize_conversation(conv):
to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

if split_type == 'train':

print('Save Vocabulary...')
vocab = Vocab(tokenizer)
vocab.add_dataframe(conversations)
vocab.update(max_size=max_vocab_size, min_freq=min_freq)

print('Vocabulary size: ', len(vocab))
vocab.pickle(dataset_dir.joinpath('word2id.pkl'), dataset_dir.joinpath('id2word.pkl'))
vocab.pickle(dataset_dir.joinpath('word2id.pkl'),
dataset_dir.joinpath('id2word.pkl'))

print('Done downloading and pre-processing dataset.')

print('Inferring InferSent encoding for dataset...')
infersent_export_script = os.path.join(os.path.join('inferSent', 'api'), 'export_dataset_embeddings.py')
infersent_export_script = os.path.join(os.path.join('inferSent', 'api'),
'export_dataset_embeddings.py')
for split_type in ['train', 'valid', 'test']:
filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
if split_type == 'train':
call(["python", infersent_export_script, f'--filepath={filepath}', '--streaming'])
else:
call(["python", infersent_export_script, f'--filepath={filepath}'])
infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'), 'reduce_embeddings_dimension.py')
call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca', '--exportembeddings'])
infersent_reduction_script = os.path.join(os.path.join('inferSent', 'api'),
'reduce_embeddings_dimension.py')
call(["python", infersent_reduction_script, f'--dataset={dataset_dir}', '--savepca',
'--exportembeddings'])
print('Done exporting InferSent embedding.')

print('Inferring TorchMoji encoding for dataset...')
torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'), 'dataset_emojize.py')
torchmoji_export_script = os.path.join(os.path.join('torchMoji', 'api'),
'dataset_emojize.py')
for split_type in ['train', 'valid', 'test']:
filepath = os.path.join(os.path.join(dataset_dir, split_type), 'raw_sentences.pkl')
call(["python", torchmoji_export_script, f'--filepath={filepath}'])
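
As a usage sketch for the new `--download`, `--classified-as`, and `--recall` flags added above (assuming the classified Cornell line files are laid out under `lines_<recall>/<classified-as>`, as `load_conversations_cornell` expects):

```bash
# Download the raw Cornell data and preprocess the non-toxic, recall-0.8 subset
# (flag names and defaults are taken from the argparse additions above)
python dataset_preprocess.py --dataset=cornell --download --classified-as=notoxic --recall=0.8
```
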
33 changes: 33 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,33 @@
FROM nvidia/cuda:10.0-cudnn7-runtime
ENV PYTHONUNBUFFERED=1
WORKDIR /code

#RUN apt-get -y update && \
# apt-get install -y --no-install-recommends software-properties-common && \
# add-apt-repository -y ppa:deadsnakes/ppa && \
# apt-get --purge remove -y software-properties-common && \
# rm -rf /var/lib/apt/lists/*

RUN apt-get -y update && \
apt-get install -y --no-install-recommends wget python3.6 python3-pip && \
rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python3.6 /usr/bin/python && ln -s /usr/bin/pip3 /usr/bin/pip

RUN python -m pip install --no-cache-dir --upgrade ipython

RUN pip install --no-cache-dir torch===0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html

COPY requirements.txt /build/requirements.txt

RUN apt-get -y update && \
apt-get -y install --no-install-recommends build-essential python3.6-dev && \
python -m pip install --no-cache-dir -r /build/requirements.txt && \
apt-get -y --purge remove build-essential && \
rm -rf /var/lib/apt/lists/*

COPY . /code/

RUN python -m pip install -e .

EXPOSE 6009
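
For reference, a rough sketch of building and running this image by hand, mirroring what the `build.sh` and `run.sh` helpers below automate; the `--gpus all` selection is an assumption (the `run.sh` script pins `device=1`), and the volume mounts from `run.sh` are omitted here:

```bash
# Build the image from the repository root (the same command build.sh issues)
docker build -f docker/Dockerfile -t neuralchat:latest .

# Run it interactively, publishing the port exposed above
docker run --gpus all -it -p 6009:6009 neuralchat:latest bash
```
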
46 changes: 46 additions & 0 deletions docker/README.md
@@ -0,0 +1,46 @@
# Running the project using Docker

To run this project using Docker, you will need [Docker](https://docs.docker.com/install/) and the [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker) installed on your machine.

## Building the Docker image

To create the Docker image, just run the `build.sh` script located in this folder. You will be prompted to download GloVe during the process; it will not be included in the Docker image, to avoid producing a huge image. Instead, it will be stored on the host machine and mounted as a volume into the container when you run it.

## Before running the Docker image

Before running the project inside Docker, all the extra files mentioned in the [base `README` file](../README.md) should be placed in the locations described there; they will be mounted as volumes.

You can either download them from your host system, or execute the `run.sh` script with the same commands stated in the tutorial to download them using the Docker image, without having to install any additional dependencies on your host machine.

For example, you can run:

```bash
docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
```

This downloads the Cornell dataset and places it in `./datasets/cornell/` on the host machine. The `datasets` folder is mounted as a volume in the Docker container when using the `run.sh` script.

## Running the project in the Docker image

To run the image in interactive mode, just run:
```bash
docker/run.sh bash
```

After that, you will have a bash shell running inside the Docker container with all the necessary files mounted as volumes; there you can interact with the project the same way you would on your own machine.

To run commands in the background, add `-d` as the first argument of the `run.sh` script, as in the sketch below.
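
For example, a long-running preprocessing job could be started in the background like this (any command accepted by `run.sh` works the same way):

```bash
# Detached run: the container keeps working after the shell returns
docker/run.sh -d python dataset_preprocess.py --dataset=reddit_casual --shortcut
```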

## TL;DR

To get a Docker image ready to run with the default configuration and the Cornell dataset, just follow these steps:

```bash
git clone git@github.com:natashamjaques/neural_chat.git
cd neural_chat
docker/build.sh
# Answer y to all questions
docker/run.sh python dataset_preprocess.py --dataset=cornell --shortcut
docker/run.sh bash
```

Now you are running a terminal inside the Docker container with all the dependencies (including CUDA and the Python requirements) installed, ready to start training the model without having to install anything besides Docker and the NVIDIA runtime on your host machine.
51 changes: 51 additions & 0 deletions docker/build.sh
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
set -ue

DIR=$(dirname "$(realpath $0)")
PROJECT_ROOT="$(realpath $DIR/..)"
DOCKER_IMAGE="neuralchat"

function get_glove {
GLOVE_DIR=$PROJECT_ROOT/inferSent/dataset/GloVe
mkdir -p $GLOVE_DIR
if ! [ -f $GLOVE_DIR/glove.840B.300d.txt ]; then
curl -Lo $GLOVE_DIR/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip $GLOVE_DIR/glove.840B.300d.zip -d $GLOVE_DIR
rm -f $GLOVE_DIR/glove.840B.300d.zip
else
echo "GloVe file is already present in $GLOVE_DIR"
fi

if ! [ -f $PROJECT_ROOT/inferSent/encoder/infersent1.pickle ]; then
curl -Lo $PROJECT_ROOT/inferSent/encoder/infersent1.pickle https://affect.media.mit.edu/neural_chat/inferSent/encoder/infersent1.pickle
else
echo "$PROJECT_ROOT/inferSent/encoder/infersent1.pickle already exists"
fi
}

function get_torchmoji {
docker run -it -v $PROJECT_ROOT/torchMoji/model:/code/torchMoji/model $DOCKER_IMAGE:latest python torchMoji/scripts/download_weights.py
}

echo -n "Do you want to download GloVe[y/N]? "
read answer

if [ "$answer" != "${answer#[Yy]}" ] ;then
get_glove
else
echo "GLoVe will not be downloaded, be sure to put it in the correct location before running the docker image"
fi

cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -

echo -n "Do you want to download torchMoji weights and include them in the image [y/N]? "
read answer

if [ "$answer" != "${answer#[Yy]}" ] ;then
get_torchmoji
cd $PROJECT_ROOT && docker build -f $PROJECT_ROOT/docker/Dockerfile -t $DOCKER_IMAGE:latest . && cd -
else
echo "torchMoji will not be downloaded, be sure to put it in the correct location before running the docker image and add the corresponding volume to the run file"
fi


30 changes: 30 additions & 0 deletions docker/run.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

DIR=$(dirname "$(realpath $0)")
PROJECT_ROOT="$(realpath $DIR/..)"
DOCKER_IMAGE="neuralchat"

if [[ "$(docker images -q $DOCKER_IMAGE:latest 2> /dev/null)" == "" ]]; then
echo "Docker image does not exist, please build it using the build.sh script"
exit 1
fi

# Docker volumes for extra data that is too big to be included in the docker image
MODELS_VOLUME="$PROJECT_ROOT/datasets/:/code/datasets"
GLOVE_VOLUME="$PROJECT_ROOT/inferSent/dataset/GloVe/:/code/inferSent/dataset/GloVe/"
CHECKPOINTS_VOLUME="$PROJECT_ROOT/model_checkpoints/:/code/model_checkpoints"
INFERSENT_PICKLE_VOLUME="$PROJECT_ROOT/inferSent/encoder/infersent1.pickle:/code/inferSent/encoder/infersent1.pickle"

if [ $# = 0 ]; then
echo "Usage: $0 [-d] <command>"
exit 1
fi

if [ $1 = "-d" ]; then
DETACH="True"
shift
else
IT="True"
fi

docker run --gpus '"device=1"' ${DETACH:+-d} ${IT:+-it} -p 0.0.0.0:6009:6009 -v "$MODELS_VOLUME" -v "$GLOVE_VOLUME" -v "$CHECKPOINTS_VOLUME" -v "$INFERSENT_PICKLE_VOLUME" "$DOCKER_IMAGE:latest" "$@"