piskvorky · menshikh-iv · Jul 19, 2017 · Jul 3, 2017 · Jul 3, 2017 · Jul 4, 2017
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -3,7 +3,7 @@ FROM ubuntu:16.04
 MAINTAINER Parul Sethi <[email protected]>
 
 ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git
-ENV GENSIM_VERSION bd6db9a41baf219ecc4a1770cc21b01c8ff122e5
+ENV GENSIM_VERSION b818c91c698b4a149c55455b88953714d1701031
 
 # Installs python, pip and setup tools (with fixed versions)
 RUN apt-get update \
@@ -47,6 +47,7 @@ RUN pip2 install \
     matplotlib==2.0.0 \
     nltk==3.2.2 \
     pandas==0.19.2 \
+    spacy==1.8.1 \
     git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
     -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt
 
@@ -56,13 +57,18 @@ RUN pip3 install \
     matplotlib==2.0.0 \
     nltk==3.2.2 \
     pandas==0.19.2 \
+    spacy==1.8.1 \
     git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
     -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt
 
 # avoid using old numpy version installed by blocks requirements
 RUN pip2 install -U numpy
 RUN pip3 install -U numpy
 
+# Download the English model for spaCy
+RUN python2 -m spacy download en
+RUN python3 -m spacy download en
+
 # Download gensim from Github
 RUN git clone $GENSIM_REPOSITORY \
     && cd /gensim \
@@ -76,12 +82,14 @@ RUN git clone $GENSIM_REPOSITORY \
 RUN mkdir /gensim/gensim_dependencies
 
 # Set ENV variables for wrappers
+ENV WR_HOME /gensim/gensim_dependencies/wordrank
 ENV FT_HOME /gensim/gensim_dependencies/fastText
 ENV MALLET_HOME /gensim/gensim_dependencies/mallet
 ENV DTM_PATH /gensim/gensim_dependencies/dtm/dtm/main
 ENV VOWPAL_WABBIT_PATH /gensim/gensim_dependencies/vowpal_wabbit/vowpalwabbit/vw
 
-# For fixed version downloads of gensim wrappers dependencies 
+# For fixed version downloads of gensim wrappers dependencies
+ENV WORDRANK_VERSION 44f3f7786f76c79c083dfad9d64e20bacfb4a0b0
 ENV FASTTEXT_VERSION f24a781021862f0e475a5fb9c55b7c1cec3b6e2e
 ENV MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION ec2e37a3bcb8bd7b56b75b043c47076bc5decf22
 ENV DTM_VERSION 67139e6f526b2bc33aef56dc36176a1b8b210056
@@ -90,7 +98,17 @@ ENV VOWPAL_WABBIT_VERSION 69ecc2847fa0c876c6e0557af409f386f0ced59a
 
 # Install custom dependencies
 
-# TODO: Install wordrank (need to install mpich/openmpi with multithreading enabled)
+# Swap openmpi for mpich (a wordrank dependency) to avoid an mpirun conflict; the apt index is refreshed in the same layer as the install
+RUN apt-get purge -y openmpi-common openmpi-bin libopenmpi1.10 \
+    && apt-get update && apt-get install -y mpich
+
+# Install wordrank
+RUN cd /gensim/gensim_dependencies \
+    && git clone https://bitbucket.org/shihaoji/wordrank \
+    && cd /gensim/gensim_dependencies/wordrank \
+    && git checkout $WORDRANK_VERSION \
+    && sed -i -e 's/#export CC=gcc CXX=g++/export CC=gcc CXX=g++/g' install.sh \
+    && sh ./install.sh
 
 # Install fastText
 RUN cd /gensim/gensim_dependencies \

diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
@@ -13,6 +13,8 @@
 
 .. [1] https://bitbucket.org/shihaoji/wordrank/
 .. [2] https://arxiv.org/pdf/1506.02761v3.pdf
+
+Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI).
 """
 
 from __future__ import division
@@ -47,13 +49,13 @@ class Wordrank(KeyedVectors):
     @classmethod
     def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
               sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
-              beta=99, loss='hinge', memory=4.0, cleanup_files=False, sorted_vocab=1, ensemble=0):
+              beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
         """
         The word and context embedding files are generated by wordrank binary and are saved in "out_name" directory
         which is created inside wordrank directory. The vocab and cooccurence files are generated using glove code
         available inside the wordrank directory. These files are used by the wordrank binary for training.
 
-        `wr_path` is the path to the Wordrank directory.
+        `wr_path` is the absolute path to the Wordrank directory.
         `corpus_file` is the filename of the text file to be used for training the Wordrank model.
         Expects file to contain space-separated tokens in a single line
         `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data.
@@ -79,6 +81,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `beta` is the beta parameter of gamma distribution.
         `loss` = name of the loss (logistic, hinge).
         `memory` = soft limit for memory consumption, in GB.
+        `np` = number of copies to execute (passed to mpirun as its -np option).
         `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging
         `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
         `ensemble` = 0 (default), use ensemble of word and context vectors
@@ -137,7 +140,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
                 )
 
         wr_args = {
-            'path': 'meta',
+            'path': meta_dir,
             'nthread': multiprocessing.cpu_count(),
             'sgd_num': sgd_num,
             'lrate': lrate,
@@ -153,9 +156,10 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
             'loss': loss
         }
 
-        os.chdir('..')
         # run wordrank executable with wr_args
-        cmd = ['mpirun', '-np', '1', '../wordrank']
+        cmd = ['mpirun', '-np']
+        cmd.append(str(np))  # np is an int by default; keep cmd all-strings like the option values appended below
+        cmd.append(os.path.join(wr_path, 'wordrank'))
         for option, value in wr_args.items():
             cmd.append('--%s' % option)
             cmd.append(str(value))

diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py
@@ -34,7 +34,7 @@ def setUp(self):
         self.wr_file = datapath('test_glove.txt')
         if not self.wr_path:
             return
-        self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, cleanup_files=True)
+        self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5, np=2, cleanup_files=True)
 
     def testLoadWordrankFormat(self):
         """Test model successfully loaded from Wordrank format file"""