diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 43f7210792..94626f92b6 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -282,7 +282,7 @@ def add(self, entities, weights, replace=False): ---------- entities : list of str Entities specified by string ids. - weights: {list of numpy.ndarray, numpy.ndarray} + weights: list of numpy.ndarray or numpy.ndarray List of 1D np.array vectors or a 2D np.array of vectors. replace: bool, optional Flag indicating whether to replace vectors for entities which already exist in the vocabulary, @@ -323,7 +323,7 @@ def __setitem__(self, entities, weights): ---------- entities : {str, list of str} Entities specified by their string ids. - weights: {list of numpy.ndarray, numpy.ndarray} + weights: list of numpy.ndarray or numpy.ndarray List of 1D np.array vectors or 2D np.array of vectors. """ @@ -502,8 +502,9 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non List of words that contribute positively. negative : list of str, optional List of words that contribute negatively. - topn : int, optional - Number of top-N similar words to return. + topn : int or None, optional + Number of top-N similar words to return, when `topn` is int. When `topn` is None, + then similarities for all words are returned. restrict_vocab : int, optional Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would @@ -512,11 +513,13 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non Returns ------- - list of (str, float) - Sequence of (word, similarity). + list of (str, float) or numpy.array + When `topn` is int, a sequence of (word, similarity) is returned. + When `topn` is None, then similarities for all words are returned as a + one-dimensional numpy array with the size of the vocabulary. 
""" - if topn is not None and topn < 1: + if isinstance(topn, int) and topn < 1: return [] if positive is None: @@ -553,12 +556,12 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non raise ValueError("cannot compute similarity with no input") mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - if indexer is not None: + if indexer is not None and isinstance(topn, int): return indexer.most_similar(mean, topn) limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab] dists = dot(limited, mean) - if topn is None: + if not topn: return dists best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) # ignore (don't return) words from the input @@ -572,8 +575,8 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None): ---------- word : str Word - topn : {int, False}, optional - Number of top-N similar words to return. If topn is False, similar_by_word returns + topn : int or None, optional + Number of top-N similar words to return. If topn is None, similar_by_word returns the vector of similarity scores. restrict_vocab : int, optional Optional integer which limits the range of vectors which @@ -583,8 +586,10 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None): Returns ------- - list of (str, float) - Sequence of (word, similarity). + list of (str, float) or numpy.array + When `topn` is int, a sequence of (word, similarity) is returned. + When `topn` is None, then similarities for all words are returned as a + one-dimensional numpy array with the size of the vocabulary. """ return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) @@ -596,9 +601,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): ---------- vector : numpy.array Vector from which similarities are to be computed. - topn : {int, False}, optional - Number of top-N similar words to return. 
If topn is False, similar_by_vector returns - the vector of similarity scores. + topn : int or None, optional + Number of top-N similar words to return, when `topn` is int. When `topn` is None, + then similarities for all words are returned. restrict_vocab : int, optional Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would @@ -607,8 +612,10 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None): Returns ------- - list of (str, float) - Sequence of (word, similarity). + list of (str, float) or numpy.array + When `topn` is int, a sequence of (word, similarity) is returned. + When `topn` is None, then similarities for all words are returned as a + one-dimensional numpy array with the size of the vocabulary. """ return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) @@ -788,15 +795,21 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): List of words that contribute positively. negative : list of str, optional List of words that contribute negatively. - topn : int, optional - Number of top-N similar words to return. + topn : int or None, optional + Number of top-N similar words to return, when `topn` is int. When `topn` is None, + then similarities for all words are returned. Returns ------- - list of (str, float) - Sequence of (word, similarity). + list of (str, float) or numpy.array + When `topn` is int, a sequence of (word, similarity) is returned. + When `topn` is None, then similarities for all words are returned as a + one-dimensional numpy array with the size of the vocabulary. 
""" + if isinstance(topn, int) and topn < 1: + return [] + if positive is None: positive = [] if negative is None: @@ -1189,7 +1202,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c ignore = {a, b, c} # input words to be ignored predicted = None # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) + sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab) self.vocab = original_vocab for index in matutils.argsort(sims, reverse=True): predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] @@ -1651,8 +1664,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip List of doctags/indexes that contribute positively. negative : list of {str, int}, optional List of doctags/indexes that contribute negatively. - topn : int, optional - Number of top-N similar docvecs to return. + topn : int or None, optional + Number of top-N similar docvecs to return, when `topn` is int. When `topn` is None, + then similarities for all docvecs are returned. clip_start : int Start clipping index. clip_end : int @@ -1664,6 +1678,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip Sequence of (doctag/index, similarity). 
""" + if isinstance(topn, int) and topn < 1: + return [] + if positive is None: positive = [] if negative is None: @@ -1700,7 +1717,7 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip raise ValueError("cannot compute similarity with no input") mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - if indexer is not None: + if indexer is not None and isinstance(topn, int): return indexer.most_similar(mean, topn) dists = dot(self.vectors_docs_norm[clip_start:clip_end], mean) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 143ae03bc2..97d10a4f7d 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -106,6 +106,9 @@ def test_most_similar_topn(self): predicted = self.vectors.most_similar('war', topn=None) self.assertEqual(len(predicted), len(self.vectors.vocab)) + predicted = self.vectors.most_similar('war', topn=0) + self.assertEqual(len(predicted), 0) + def test_relative_cosine_similarity(self): """Test relative_cosine_similarity returns expected results with an input of a word pair and topn""" wordnet_syn = [ diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 7aafbd34d7..00f916f869 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -608,6 +608,15 @@ def assertApproxNeighborsMatchExact(self, model, wv, index): self.assertEqual(approx_words, exact_words) + def assertAllSimilaritiesDisableIndexer(self, model, wv, index): + """With `topn=None` the indexer is bypassed, so both calls must return the same full similarity array.""" + vector = wv.vectors_norm[0] + approx_similarities = model.wv.most_similar([vector], topn=None, indexer=index) + exact_similarities = model.wv.most_similar(positive=[vector], topn=None) + + self.assertEqual(list(approx_similarities), list(exact_similarities)) + self.assertEqual(len(approx_similarities), len(wv.vocab)) + def assertIndexSaved(self, index): fname = get_tmpfile('gensim_similarities.tst.pkl') index.save(fname)