Skip to content

Commit

Permalink
Merge pull request #101 from stephenhky/wmd
Browse files Browse the repository at this point in the history
Removing all `PuLP` dependencies
  • Loading branch information
stephenhky committed Jul 23, 2020
2 parents daa9aad + b1b8b6a commit cc5167a
Show file tree
Hide file tree
Showing 12 changed files with 928 additions and 115 deletions.
893 changes: 828 additions & 65 deletions .gitignore

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ If you would like to contribute, feel free to submit the pull requests. You can

## News

* 07/23/2020: `shorttext` 1.3.0 released.
* 06/05/2020: `shorttext` 1.2.6 released.
* 05/20/2020: `shorttext` 1.2.5 released.
* 05/13/2020: `shorttext` 1.2.4 released.
Expand Down Expand Up @@ -141,7 +142,7 @@ If you would like to contribute, feel free to submit the pull requests. You can

## Possible Future Updates

- [ ] Removing `pulp` dependency;
- [x] Removing `pulp` dependency;
- [ ] Including transformer-based models;
- [ ] Use of DASK;
- [ ] Dividing components to other packages;
Expand Down
4 changes: 2 additions & 2 deletions apidocs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
version = '1.2'
version = '1.3'
# The full version, including alpha/beta/rc tags.
release = '1.2.6'
release = '1.3.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@
# built documents.
#
# The short X.Y version.
version = u'1.2'
version = u'1.3'
# The full version, including alpha/beta/rc tags.
release = u'1.2.6'
release = u'1.3.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
16 changes: 7 additions & 9 deletions docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,14 @@ Required Packages

- Numpy_ (Numerical Python, version >= 1.16.0)
- SciPy_ (Scientific Python, version >= 1.2.0)
- Scikit-Learn_ (Machine Learning in Python)
- Scikit-Learn_ (Machine Learning in Python, version >= 0.23.0)
- keras_ (Deep Learning Library for Theano and Tensorflow, version >= 2.3.0)
- gensim_ (Topic Modeling for Humans, version >= 3.2.0)
- Pandas_ (Python Data Analysis Library)
- PuLP_ (Optimization with PuLP)
- snowballstemmer_ (Snowball Stemmer)
- TensorFlow_ (TensorFlow, >= 2.0.0)
- Flask_ (Flask)
- Joblib_ (Joblib: lightweight Python pipelining)
- gensim_ (Topic Modeling for Humans, version >= 3.8.0)
- Pandas_ (Python Data Analysis Library, version >= 1.0.0)
- snowballstemmer_ (Snowball Stemmer, version >= 2.0.0)
- TensorFlow_ (TensorFlow, version >= 2.0.0)
- Flask_ (Flask, version >= 1.1.0)
- Joblib_ (Joblib: lightweight Python pipelining, version >= 0.14)

Home: :doc:`index`

Expand All @@ -88,7 +87,6 @@ Home: :doc:`index`
.. _keras: https://keras.io/
.. _gensim: https://radimrehurek.com/gensim/
.. _Pandas: http://pandas.pydata.org/
.. _PuLP: https://pythonhosted.org/PuLP/
.. _snowballstemmer: https://github.com/snowballstem/snowball
.. _TensorFlow: https://www.tensorflow.org/
.. _Flask: https://flask.palletsprojects.com/
Expand Down
6 changes: 6 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
News
====

* 07/23/2020: `shorttext` 1.3.0 released.
* 06/05/2020: `shorttext` 1.2.6 released.
* 05/20/2020: `shorttext` 1.2.5 released.
* 05/13/2020: `shorttext` 1.2.4 released.
Expand Down Expand Up @@ -58,6 +59,11 @@ News
What's New
----------

Release 1.3.0 (July 23, 2020)
-----------------------------

* Removed all dependencies on `PuLP`; all computations of word mover's distance (WMD) are now performed using `SciPy`.

Release 1.2.6 (June 20, 2020)
-----------------------------

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ numpy>=1.16.0
scipy>=1.2.0
joblib>=0.14
scikit-learn>=0.23.0
tensorflow>=2.0.0
keras>=2.3.0
gensim>=3.8.0
pandas>=1.0.0
pulp>=2.0
flask>=1.1.0
snowballstemmer>=2.0.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def setup_requirements():


setup(name='shorttext',
version='1.2.6',
version='1.3.0a01',
description="Short Text Mining",
long_description=package_description(),
long_description_content_type='text/markdown',
Expand Down
2 changes: 1 addition & 1 deletion shorttext/metrics/wasserstein/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

from .wordmoverdist import word_mover_distance_probspec, word_mover_distance
from .wordmoverdist import word_mover_distance_linprog, word_mover_distance
67 changes: 41 additions & 26 deletions shorttext/metrics/wasserstein/wordmoverdist.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,73 @@

from itertools import product
import warnings

import pulp
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.sparse import csr_matrix
from scipy.optimize import linprog

from shorttext.utils.gensim_corpora import tokens_to_fracdict


# use PuLP
def word_mover_distance_probspec(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None):
def word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean):
    """ Compute the Word Mover's distance (WMD) between the two given lists of tokens, and return the LP result.

    Using methods of linear programming, supported by SciPy (`scipy.optimize.linprog`), calculate the WMD
    between two lists of words. A word-embedding model has to be provided. The whole
    `scipy.optimize.OptimizeResult` object is returned.

    The transport problem is built over the *distinct* tokens of each sentence, weighted by their
    relative frequencies. The flow variable for the pair (i-th distinct token of the first sentence,
    j-th distinct token of the second sentence) is stored at position `i * n_second + j` of the
    solution vector.

    Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).

    :param first_sent_tokens: first list of tokens.
    :param second_sent_tokens: second list of tokens.
    :param wvmodel: word-embedding models; every token is looked up as `wvmodel[token]`.
    :param distancefunc: distance function that takes two numpy ndarray.
    :return: the whole result of the linear programming problem
    :type first_sent_tokens: list
    :type second_sent_tokens: list
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type distancefunc: function
    :rtype: scipy.optimize.OptimizeResult
    """
    # Normalized frequency of each distinct token in each sentence.
    first_sent_buckets = tokens_to_fracdict(first_sent_tokens)
    second_sent_buckets = tokens_to_fracdict(second_sent_tokens)

    # Work on distinct tokens: indexing by raw position would count the marginal
    # of a repeated token once per occurrence, so the two marginals would no
    # longer sum to the same total and the LP would become infeasible.
    first_tokens = list(first_sent_buckets)
    second_tokens = list(second_sent_buckets)
    nb_first = len(first_tokens)
    nb_second = len(second_tokens)

    wordvecs = {token: wvmodel[token] for token in set(first_tokens) | set(second_tokens)}

    # Cost vector; `product` iterates the second sentence fastest, which matches
    # the flattened index i * nb_second + j.
    costs = np.array(
        [distancefunc(wordvecs[token1], wordvecs[token2])
         for token1, token2 in product(first_tokens, second_tokens)],
        dtype=float
    )

    # Equality constraints, built in one shot instead of element-wise writes
    # into a CSR matrix:
    #   rows [0, nb_first)                  -> flow leaving each first-sentence token
    #   rows [nb_first, nb_first+nb_second) -> flow entering each second-sentence token
    Aeq = csr_matrix(
        np.vstack([
            np.kron(np.eye(nb_first), np.ones((1, nb_second))),
            np.kron(np.ones((1, nb_first)), np.eye(nb_second)),
        ])
    )
    beq = np.array(
        [first_sent_buckets[token] for token in first_tokens] +
        [second_sent_buckets[token] for token in second_tokens],
        dtype=float
    )

    return linprog(costs, A_eq=Aeq, b_eq=beq)


def word_mover_distance(first_sent_tokens, second_sent_tokens, wvmodel, distancefunc=euclidean, lpFile=None):
    """ Compute the Word Mover's distance (WMD) between the two given lists of tokens.

    Using methods of linear programming, calculate the WMD between two lists of words. A word-embedding
    model has to be provided. WMD is returned.

    Reference: Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, "From Word Embeddings to Document Distances," *ICML* (2015).

    :param first_sent_tokens: first list of tokens.
    :param second_sent_tokens: second list of tokens.
    :param wvmodel: word-embedding models.
    :param distancefunc: distance function that takes two numpy ndarray.
    :param lpFile: deprecated, kept for backward compatibility; a warning is issued if it is not None. (default: None)
    :return: Word Mover's distance (WMD)
    :type first_sent_tokens: list
    :type second_sent_tokens: list
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type distancefunc: function
    :type lpFile: str
    :rtype: float
    """
    if lpFile is not None:
        # The placeholder must actually be filled in, otherwise the user sees a literal "{}".
        message = ('The parameter `lpFile` (value: {}) is not used; parameter is deprecated as '
                   'the package `pulp` is no longer used. Check your code if there is a dependency on '
                   'this parameter.')
        warnings.warn(message.format(lpFile))

    linprog_result = word_mover_distance_linprog(first_sent_tokens, second_sent_tokens, wvmodel,
                                                 distancefunc=distancefunc)
    # `fun` is the optimal objective value of the LP, i.e. the WMD.
    return linprog_result['fun']
8 changes: 1 addition & 7 deletions test/test_var_nn_embedded_vec_classifier.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
import os
import unittest
import urllib
import sys

import shorttext


# The "test_w2v_model.bin" in this directory is adapted from: https://raw.githubusercontent.com/chinmayapancholi13/shorttext_test_data/master/test_w2v_model

class TestVarNNEmbeddedVecClassifier(unittest.TestCase):
def setUp(self):
print("Downloading word-embedding model....")
link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
filename = "test_w2v_model.bin"
if not os.path.isfile("test_w2v_model.bin"):
if sys.version_info[0]==2:
urllib.urlretrieve(link, filename)
else:
urllib.request.urlretrieve(link, filename)
urllib.request.urlretrieve(link, filename)
self.w2v_model = shorttext.utils.load_word2vec_model(filename, binary=True) # load word2vec model
self.trainclass_dict = shorttext.data.subjectkeywords() # load training data

Expand Down
36 changes: 36 additions & 0 deletions test/test_wmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import unittest
import urllib

from shorttext.metrics.wasserstein import word_mover_distance
from shorttext.utils import load_word2vec_model


class TestWMD(unittest.TestCase):
    """Regression tests for `word_mover_distance` using a small downloadable word2vec model."""

    def setUp(self):
        # `import urllib` alone does not load the `urllib.request` submodule, so
        # import it explicitly here rather than relying on another package
        # having imported it as a side effect.
        import urllib.request

        print("Downloading word-embedding model....")
        link = "https://shorttext-data-northernvirginia.s3.amazonaws.com/trainingdata/test_w2v_model.bin"
        filename = "test_w2v_model.bin"
        if not os.path.isfile(filename):
            urllib.request.urlretrieve(link, filename)
        self.w2v_model = load_word2vec_model(filename, binary=True)  # load word2vec model

    def tearDown(self):
        print("Removing word-embedding model")
        if os.path.isfile("test_w2v_model.bin"):
            os.remove('test_w2v_model.bin')

    def calculate_wmd(self, tokens1, tokens2, answer):
        """Assert that the WMD between the two token lists is within 1e-3 of `answer`."""
        wdistance = word_mover_distance(tokens1, tokens2, self.w2v_model)
        self.assertAlmostEqual(wdistance, answer, delta=1e-3)

    def test_metrics(self):
        # (tokens1, tokens2, expected WMD); expected values are the recorded
        # reference answers for the test model downloaded in setUp.
        cases = [
            (['president', 'speaks'], ['president', 'talks'], 0.19936788082122803),
            (['fan', 'book'], ['apple', 'orange'], 1.8019972145557404),
        ]
        for tokens1, tokens2, known_answer in cases:
            with self.subTest(tokens1=tokens1, tokens2=tokens2):
                self.calculate_wmd(tokens1, tokens2, known_answer)

0 comments on commit cc5167a

Please sign in to comment.