Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add wordrank in dockerfile #1460

Merged
merged 8 commits into from
Jul 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM ubuntu:16.04
MAINTAINER Parul Sethi <[email protected]>

ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git
ENV GENSIM_VERSION bd6db9a41baf219ecc4a1770cc21b01c8ff122e5
ENV GENSIM_VERSION b818c91c698b4a149c55455b88953714d1701031

# Installs python, pip and setup tools (with fixed versions)
RUN apt-get update \
Expand Down Expand Up @@ -47,6 +47,7 @@ RUN pip2 install \
matplotlib==2.0.0 \
nltk==3.2.2 \
pandas==0.19.2 \
spacy==1.8.1 \
git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
-r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt

Expand All @@ -56,13 +57,18 @@ RUN pip3 install \
matplotlib==2.0.0 \
nltk==3.2.2 \
pandas==0.19.2 \
spacy==1.8.1 \
git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
-r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt

# avoid using old numpy version installed by blocks requirements
RUN pip2 install -U numpy
RUN pip3 install -U numpy

# Download the English model for spaCy
RUN python2 -m spacy download en
RUN python3 -m spacy download en

# Download gensim from Github
RUN git clone $GENSIM_REPOSITORY \
&& cd /gensim \
Expand All @@ -76,12 +82,14 @@ RUN git clone $GENSIM_REPOSITORY \
RUN mkdir /gensim/gensim_dependencies

# Set ENV variables for wrappers
ENV WR_HOME /gensim/gensim_dependencies/wordrank
ENV FT_HOME /gensim/gensim_dependencies/fastText
ENV MALLET_HOME /gensim/gensim_dependencies/mallet
ENV DTM_PATH /gensim/gensim_dependencies/dtm/dtm/main
ENV VOWPAL_WABBIT_PATH /gensim/gensim_dependencies/vowpal_wabbit/vowpalwabbit/vw

# For fixed version downloads of gensim wrappers dependencies
# For fixed version downloads of gensim wrappers dependencies
ENV WORDRANK_VERSION 44f3f7786f76c79c083dfad9d64e20bacfb4a0b0
ENV FASTTEXT_VERSION f24a781021862f0e475a5fb9c55b7c1cec3b6e2e
ENV MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION ec2e37a3bcb8bd7b56b75b043c47076bc5decf22
ENV DTM_VERSION 67139e6f526b2bc33aef56dc36176a1b8b210056
Expand All @@ -90,7 +98,17 @@ ENV VOWPAL_WABBIT_VERSION 69ecc2847fa0c876c6e0557af409f386f0ced59a

# Install custom dependencies

# TODO: Install wordrank (need to install mpich/openmpi with multithreading enabled)
# Install mpich (a wordrank dependency) and remove openmpi to avoid mpirun conflict
RUN apt-get purge -y openmpi-common openmpi-bin libopenmpi1.10
RUN apt-get install -y mpich

# Install wordrank
RUN cd /gensim/gensim_dependencies \
&& git clone https://bitbucket.org/shihaoji/wordrank \
&& cd /gensim/gensim_dependencies/wordrank \
&& git checkout $WORDRANK_VERSION \
&& sed -i -e 's/#export CC=gcc CXX=g++/export CC=gcc CXX=g++/g' install.sh \
&& sh ./install.sh

# Install fastText
RUN cd /gensim/gensim_dependencies \
Expand Down
14 changes: 9 additions & 5 deletions gensim/models/wrappers/wordrank.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

.. [1] https://bitbucket.org/shihaoji/wordrank/
.. [2] https://arxiv.org/pdf/1506.02761v3.pdf

Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI).
"""

from __future__ import division
Expand Down Expand Up @@ -47,13 +49,13 @@ class Wordrank(KeyedVectors):
@classmethod
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0):
beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
"""
The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory
which is created inside wordrank directory. The vocab and cooccurrence files are generated using glove code
available inside the wordrank directory. These files are used by the wordrank binary for training.

`wr_path` is the path to the Wordrank directory.
`wr_path` is the absolute path to the Wordrank directory.
`corpus_file` is the filename of the text file to be used for training the Wordrank model.
Expects file to contain space-separated tokens in a single line
`out_name` is the name of the directory which will be created (in wordrank folder) to save embeddings and training data.
Expand All @@ -79,6 +81,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
`beta` is the beta parameter of gamma distribution.
`loss` = name of the loss (logistic, hinge).
`memory` = soft limit for memory consumption, in GB.
`np` = number of copies of the wordrank program to launch (passed to mpirun as its `-np` option).
`cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging
`sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
`ensemble` = 0 (default), use ensemble of word and context vectors
Expand Down Expand Up @@ -137,7 +140,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
)

wr_args = {
'path': 'meta',
'path': meta_dir,
'nthread': multiprocessing.cpu_count(),
'sgd_num': sgd_num,
'lrate': lrate,
Expand All @@ -153,9 +156,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
'loss': loss
}

os.chdir('..')
# run wordrank executable with wr_args
cmd = ['mpirun', '-np', '1', '../wordrank']
cmd = ['mpirun', '-np']
cmd.append(np)
cmd.append(os.path.join(wr_path, 'wordrank'))
for option, value in wr_args.items():
cmd.append('--%s' % option)
cmd.append(str(value))
Expand Down
2 changes: 1 addition & 1 deletion gensim/test/test_wordrank_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def setUp(self):
self.wr_file = datapath('test_glove.txt')
if not self.wr_path:
return
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, cleanup_files=True)
self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True)

def testLoadWordrankFormat(self):
"""Test model successfully loaded from Wordrank format file"""
Expand Down