Fix WordEmbeddingsKeyedVectors.most_similar #2461

Merged
merged 11 commits on May 4, 2019
69 changes: 43 additions & 26 deletions gensim/models/keyedvectors.py
@@ -282,7 +282,7 @@ def add(self, entities, weights, replace=False):
----------
entities : list of str
Entities specified by string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or a 2D np.array of vectors.
replace: bool, optional
Flag indicating whether to replace vectors for entities which already exist in the vocabulary,
@@ -323,7 +323,7 @@ def __setitem__(self, entities, weights):
----------
entities : {str, list of str}
Entities specified by their string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or 2D np.array of vectors.

"""
@@ -502,8 +502,9 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -512,11 +513,13 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non

Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.

"""
if topn is not None and topn < 1:
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
@@ -553,12 +556,12 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab]
dists = dot(limited, mean)
if topn is None:
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
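A minimal usage sketch of the behavior documented above, assuming a `KeyedVectors` instance loaded from a word2vec-format file (the file name is a placeholder; the query word mirrors the one used in the tests below):

```python
from gensim.models import KeyedVectors

# Placeholder vectors file, for illustration only.
wv = KeyedVectors.load_word2vec_format("vectors.txt")

top10 = wv.most_similar(positive=["war"], topn=10)       # list of (word, similarity) pairs
all_sims = wv.most_similar(positive=["war"], topn=None)  # 1-D numpy array, one entry per vocabulary word
nothing = wv.most_similar(positive=["war"], topn=0)      # [] after this change

# all_sims[i] is the similarity between the query and wv.index2word[i].
```

The full-vocabulary array pairs index-for-index with `wv.index2word`, which is how `accuracy()` consumes it further down in this diff.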
@@ -572,8 +575,8 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
----------
word : str
Word
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_word returns
topn : int or None, optional
Number of top-N similar words to return. If topn is None, similar_by_word returns
the vector of similarity scores.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
@@ -583,8 +586,10 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):

Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.

"""
return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
@@ -596,9 +601,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
----------
vector : numpy.array
Vector from which similarities are to be computed.
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_vector returns
the vector of similarity scores.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -607,8 +612,10 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):

Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.

"""
return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
@@ -788,15 +795,21 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.

Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.

"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
@@ -1189,7 +1202,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
ignore = {a, b, c} # input words to be ignored
predicted = None
# find the most likely prediction, ignoring OOV words and input words
sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab)
self.vocab = original_vocab
for index in matutils.argsort(sims, reverse=True):
predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
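The `accuracy()` change above (`topn=False` becomes `topn=None`) shows the intended consumption pattern for the full similarity array: rank it with `matutils.argsort` and map indices back through `index2word`. A standalone sketch of the same pattern, with a placeholder vectors file and query words:

```python
from gensim import matutils
from gensim.models import KeyedVectors

# Placeholder vectors file and query words, for illustration only.
wv = KeyedVectors.load_word2vec_format("vectors.txt")

sims = wv.most_similar(positive=["woman", "king"], negative=["man"], topn=None)
# With topn=None the input words are NOT filtered out of the result,
# which is why accuracy() keeps its own `ignore` set.
for idx in matutils.argsort(sims, topn=3, reverse=True):
    print(wv.index2word[idx], float(sims[idx]))
```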
@@ -1651,8 +1664,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
List of doctags/indexes that contribute positively.
negative : list of {str, int}, optional
List of doctags/indexes that contribute negatively.
topn : int, optional
Number of top-N similar docvecs to return.
topn : int or None, optional
Number of top-N similar docvecs to return, when `topn` is int. When `topn` is None,
then similarities for all docvecs are returned.
clip_start : int
Start clipping index.
clip_end : int
@@ -1664,6 +1678,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
Sequence of (doctag/index, similarity).

"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
@@ -1700,7 +1717,7 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

dists = dot(self.vectors_docs_norm[clip_start:clip_end], mean)
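Note the effect of the new `isinstance(topn, int)` guard on the indexer branch (in both the word-vector and docvec versions above): an approximate indexer is only consulted when an integer `topn` is requested, while `topn=None` falls through to the exact full-vocabulary computation. That is what `assertAllSimilaritiesDisableIndexer` below verifies. A rough sketch of the interplay, using the gensim 3.x-era Annoy integration with placeholder training data (requires the optional `annoy` package):

```python
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.similarities.index import AnnoyIndexer

# Placeholder corpus and parameters, for illustration only.
model = Word2Vec(LineSentence("corpus.txt"), size=50, min_count=1)
index = AnnoyIndexer(model, num_trees=10)

model.wv.init_sims()
vector = model.wv.vectors_norm[0]

approx_top10 = model.wv.most_similar([vector], topn=10, indexer=index)  # answered by the Annoy index
exact_all = model.wv.most_similar([vector], topn=None, indexer=index)   # indexer bypassed, exact 1-D array
```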
3 changes: 3 additions & 0 deletions gensim/test/test_keyedvectors.py
@@ -106,6 +106,9 @@ def test_most_similar_topn(self):
predicted = self.vectors.most_similar('war', topn=None)
self.assertEqual(len(predicted), len(self.vectors.vocab))

predicted = self.vectors.most_similar('war', topn=0)
self.assertEqual(len(predicted), 0)

def test_relative_cosine_similarity(self):
"""Test relative_cosine_similarity returns expected results with an input of a word pair and topn"""
wordnet_syn = [
8 changes: 8 additions & 0 deletions gensim/test/test_similarities.py
@@ -608,6 +608,14 @@ def assertApproxNeighborsMatchExact(self, model, wv, index):

self.assertEqual(approx_words, exact_words)

def assertAllSimilaritiesDisableIndexer(self, model, wv, index):
vector = wv.vectors_norm[0]
approx_similarities = model.wv.most_similar([vector], topn=None, indexer=index)
exact_similarities = model.wv.most_similar(positive=[vector], topn=None)

self.assertEqual(approx_similarities, exact_similarities)
self.assertEqual(len(approx_similarities), len(wv.vocab))

def assertIndexSaved(self, index):
fname = get_tmpfile('gensim_similarities.tst.pkl')
index.save(fname)