Merge pull request #101 from stephenhky/wmd

Removing all `PuLP` dependencies
stephenhky · Jul 23, 2020 · cc5167a · cc5167a
2 parents daa9aad + b1b8b6a
commit cc5167a
Show file tree

Hide file tree

Showing 12 changed files with 928 additions and 115 deletions.
diff --git a/.gitignore b/.gitignore
diff --git a/README.md b/README.md
@@ -86,6 +86,7 @@ If you would like to contribute, feel free to submit the pull requests. You can
 
 ## News
 
+* 07/23/2020: `shorttext` 1.3.0 released.
 * 06/05/2020: `shorttext` 1.2.6 released.
 * 05/20/2020: `shorttext` 1.2.5 released.
 * 05/13/2020: `shorttext` 1.2.4 released.
@@ -141,7 +142,7 @@ If you would like to contribute, feel free to submit the pull requests. You can
 
 ## Possible Future Updates
 
-- [ ] Removing `pulp` dependency;
+- [x] Removing `pulp` dependency;
 - [ ] Including transformer-based models;
 - [ ] Use of DASK;
 - [ ] Dividing components to other packages;

diff --git a/apidocs/source/conf.py b/apidocs/source/conf.py
@@ -58,9 +58,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '1.2'
+version = '1.3'
 # The full version, including alpha/beta/rc tags.
-release = '1.2.6'
+release = '1.3.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/conf.py b/docs/conf.py
@@ -56,9 +56,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'1.2'
+version = u'1.3'
 # The full version, including alpha/beta/rc tags.
-release = u'1.2.6'
+release = u'1.3.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/install.rst b/docs/install.rst
@@ -66,15 +66,14 @@ Required Packages
 
 - Numpy_ (Numerical Python, version >= 1.16.0)
 - SciPy_ (Scientific Python, version >= 1.2.0)
-- Scikit-Learn_ (Machine Learning in Python)
+- Scikit-Learn_ (Machine Learning in Python, version >= 0.23.0)
 - keras_ (Deep Learning Library for Theano and Tensorflow, version >= 2.3.0)
-- gensim_ (Topic Modeling for Humans, version >= 3.2.0)
-- Pandas_ (Python Data Analysis Library)
-- PuLP_ (Optimization with PuLP)
-- snowballstemmer_ (Snowball Stemmer)
-- TensorFlow_ (TensorFlow, >= 2.0.0)
-- Flask_ (Flask)
-- Joblib_ (Joblib: lightweight Python pipelining)
+- gensim_ (Topic Modeling for Humans, version >= 3.8.0)
+- Pandas_ (Python Data Analysis Library, version >= 1.0.0)
+- snowballstemmer_ (Snowball Stemmer, version >= 2.0.0)
+- TensorFlow_ (TensorFlow, version >= 2.0.0)
+- Flask_ (Flask, version >= 1.1.0)
+- Joblib_ (Joblib: lightweight Python pipelining, version >= 0.14)
 
 Home: :doc:`index`
 
@@ -88,7 +87,6 @@ Home: :doc:`index`
 .. _keras: https://keras.io/
 .. _gensim: https://radimrehurek.com/gensim/
 .. _Pandas: http://pandas.pydata.org/
-.. _PuLP: https://pythonhosted.org/PuLP/
 .. _snowballstemmer: https://github.com/snowballstem/snowball
 .. _TensorFlow: https://www.tensorflow.org/
 .. _Flask: https://flask.palletsprojects.com/

diff --git a/docs/news.rst b/docs/news.rst
@@ -1,6 +1,7 @@
 News
 ====
 
+* 07/23/2020: `shorttext` 1.3.0 released.
 * 06/05/2020: `shorttext` 1.2.6 released.
 * 05/20/2020: `shorttext` 1.2.5 released.
 * 05/13/2020: `shorttext` 1.2.4 released.
@@ -58,6 +59,11 @@ News
 What's New
 ----------
 
+Release 1.3.0 (July 23, 2020)
+-----------------------------
+
+* Removed all dependencies on `PuLP`; all computations of word mover's distance (WMD) are performed using `SciPy`.
+
 Release 1.2.6 (June 20, 2020)
 -----------------------------
 

diff --git a/requirements.txt b/requirements.txt
@@ -3,9 +3,9 @@ numpy>=1.16.0
 scipy>=1.2.0
 joblib>=0.14
 scikit-learn>=0.23.0
+tensorflow>=2.0.0
 keras>=2.3.0
 gensim>=3.8.0
 pandas>=1.0.0
-pulp>=2.0
 flask>=1.1.0
 snowballstemmer>=2.0.0
diff --git a/setup.py b/setup.py
@@ -28,7 +28,7 @@ def setup_requirements():
 
 
 setup(name='shorttext',
-      version='1.2.6',
+      version='1.3.0a01',
       description="Short Text Mining",
       long_description=package_description(),
       long_description_content_type='text/markdown',

diff --git a/shorttext/metrics/wasserstein/__init__.py b/shorttext/metrics/wasserstein/__init__.py
@@ -1,2 +1,2 @@
 
-from .wordmoverdist import word_mover_distance_probspec, word_mover_distance
+from .wordmoverdist import word_mover_distance_linprog, word_mover_distance
diff --git a/shorttext/metrics/wasserstein/wordmoverdist.py b/shorttext/metrics/wasserstein/wordmoverdist.py
@@ -1,62 +1,73 @@
 
 from itertools import product
+import warnings
 
-import pulp
+import numpy as np
 from scipy.spatial.distance import euclidean
+from scipy.sparse import csr_matrix
+from scipy.optimize import linprog
 
 from shorttext.utils.gensim_corpora import tokens_to_fracdict
 
 
-# use PuLP
-def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None):
+def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean):
     """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP problem class.
 
     Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding
-    model has to be provided. The problem class is returned, containing all the information about the LP.
+    model has to be provided. The whole `scipy.optimize.OptimizeResult` object is returned.
 
     Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
 
     :param first_sent_tokens: first list of tokens.
     :param second_sent_tokens: second list of tokens.
     :param wvmodel: word-embedding models.
     :param distancefunc: distance function that takes two numpy ndarray.
-    :param lpFile: log file to write out.
-    :return: a linear programming problem contains the solution
+    :return: the whole result of the linear programming problem
     :type first_sent_tokens: list
     :type second_sent_tokens: list
     :type wvmodel: gensim.models.keyedvectors.KeyedVectors
     :type distancefunc: function
-    :type lpFile: str
-    :rtype: pulp.LpProblem
+    :rtype: scipy.optimize.OptimizeResult
     """
+    nb_tokens_first_sent = len(first_sent_tokens)
+    nb_tokens_second_sent = len(second_sent_tokens)
+
     all_tokens = list(set(first_sent_tokens+second_sent_tokens))
     wordvecs = {token: wvmodel[token] for token in all_tokens}
 
     first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
     second_sent_buckets = tokens_to_fracdict(second_sent_tokens)
 
-    T = pulp.LpVariable.dicts('T_matrix', list(product(all_tokens, all_tokens)), lowBound=0)
+    collapsed_idx_func = lambda i, j: i*nb_tokens_second_sent + j
 
-    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
-    prob += pulp.lpSum([T[token1, token2]*distancefunc(wordvecs[token1], wordvecs[token2])
-                        for token1, token2 in product(all_tokens, all_tokens)])
-    for token2 in second_sent_buckets:
-        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]
-    for token1 in first_sent_buckets:
-        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]
+    # assigning T
+    T = np.zeros(nb_tokens_first_sent*nb_tokens_second_sent)
+    for i, j in product(range(nb_tokens_first_sent), range(nb_tokens_second_sent)):
+        T[collapsed_idx_func(i, j)] = distancefunc(wordvecs[first_sent_tokens[i]],
+                                                   wordvecs[second_sent_tokens[j]])
 
-    if lpFile!=None:
-        prob.writeLP(lpFile)
+    # assigning Aeq and beq
+    Aeq = csr_matrix(
+        (nb_tokens_first_sent+nb_tokens_second_sent,
+         nb_tokens_first_sent*nb_tokens_second_sent)
+    )
+    beq = np.zeros(nb_tokens_first_sent+nb_tokens_second_sent)
+    for i in range(nb_tokens_first_sent):
+        for j in range(nb_tokens_second_sent):
+            Aeq[i, collapsed_idx_func(i, j)] = 1.
+        beq[i] = first_sent_buckets[first_sent_tokens[i]]
+    for j in range(nb_tokens_second_sent):
+        for i in range(nb_tokens_first_sent):
+            Aeq[j+nb_tokens_first_sent, collapsed_idx_func(i, j)] = 1.
+        beq[j+nb_tokens_first_sent] = second_sent_buckets[second_sent_tokens[j]]
 
-    prob.solve()
-
-    return prob
+    return linprog(T, A_eq=Aeq, b_eq=beq)
 
 
 def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None):
     """ Compute the Word Mover's distance (WMD) between the two given lists of tokens.
 
-    Using methods of linear programming, supported by PuLP, calculate the WMD between two lists of words. A word-embedding
+    Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding
     model has to be provided. WMD is returned.
 
     Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).
@@ -65,7 +76,7 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance
     :param second_sent_tokens: second list of tokens.
     :param wvmodel: word-embedding models.
     :param distancefunc: distance function that takes two numpy ndarray.
-    :param lpFile: log file to write out.
+    :param lpFile: deprecated, kept for backward compatibility. (default: None)
     :return: Word Mover's distance (WMD)
     :type first_sent_tokens: list
     :type second_sent_tokens: list
@@ -74,6 +85,10 @@ def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distance
     :type lpFile: str
     :rtype: float
     """
-    prob = word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel,
-                                        distancefunc=distancefunc, lpFile=lpFile)
-    return pulp.value(prob.objective)
+    linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel,
+                                                 distancefunc=distancefunc)
+    if lpFile is not None:
+        warnings.warn('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as ' + \
+                      'the package `pulp` is no longer used. Check your code if there is a dependency on ' + \
+                      'this parameter.')
+    return linprog_result['fun']
diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py
@@ -1,23 +1,17 @@
 import os
 import unittest
 import urllib
-import sys
 
 import shorttext
 
 
-# The "test_w2v_model.bin" in this directory is adapted from: https://raw.githubusercontent.com/chinmayapancholi13/shorttext_test_data/master/test_w2v_model
-
 class TestVarNNEmbeddedVecClassifier(unittest.TestCase):
     def setUp(self):
         print("Downloading word-embedding model....")
         link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
         filename = "test_w2v_model.bin"
         if not os.path.isfile("test_w2v_model.bin"):
-            if sys.version_info[0]==2:
-                urllib.urlretrieve(link, filename)
-            else:
-                urllib.request.urlretrieve(link, filename)
+            urllib.request.urlretrieve(link, filename)
         self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True)  # load word2vec model
         self.trainclass_dict = shorttext.data.subjectkeywords()  # load training data
 

diff --git a/test/test_wmd.py b/test/test_wmd.py
@@ -0,0 +1,36 @@
+import os
+import unittest
+import urllib
+
+from shorttext.metrics.wasserstein import word_mover_distance
+from shorttext.utils import load_word2vec_model
+
+
+class TestWMD(unittest.TestCase):
+    def setUp(self):
+        print("Downloading word-embedding model....")
+        link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
+        filename = "test_w2v_model.bin"
+        if not os.path.isfile("test_w2v_model.bin"):
+            urllib.request.urlretrieve(link, filename)
+        self.w2v_model = load_word2vec_model(filename, binary=True)  # load word2vec model
+
+    def tearDown(self):
+        print("Removing word-embedding model")
+        if os.path.isfile("test_w2v_model.bin"):
+            os.remove('test_w2v_model.bin')
+
+    def calculate_wmd(self, tokens1, tokens2, answer):
+        wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model)
+        self.assertAlmostEqual(wdistance, answer, delta=1e-3)
+
+    def test_metrics(self):
+        tokens1 = ['president', 'speaks']
+        tokens2 = ['president', 'talks']
+        known_answer = 0.19936788082122803
+        self.calculate_wmd(tokens1, tokens2, known_answer)
+
+        tokens1 = ['fan', 'book']
+        tokens2 = ['apple', 'orange']
+        known_answer = 1.8019972145557404
+        self.calculate_wmd(tokens1, tokens2, known_answer)