Add ns_exponent parameter to control the negative sampling distribution for *2vec models. Fix #2090 (#2093)

Merged
Changes from 2 commits
7 changes: 6 additions & 1 deletion gensim/models/base_any2vec.py
@@ -298,7 +298,7 @@ def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
-              trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
+              trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
self.sg = int(sg)
if vector_size % 4 != 0:
@@ -309,6 +309,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac
self.min_alpha = float(min_alpha)
self.hs = int(hs)
self.negative = int(negative)
+        self.ns_exponent = ns_exponent
self.cbow_mean = int(cbow_mean)
self.compute_loss = bool(compute_loss)
self.running_training_loss = 0
@@ -627,6 +628,10 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
@classmethod
def load(cls, *args, **kwargs):
model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
+        if not hasattr(model, 'ns_exponent'):
+            model.ns_exponent = 0.75
+        if not hasattr(model.vocabulary, 'ns_exponent'):
+            model.vocabulary.ns_exponent = 0.75
if model.negative and hasattr(model.wv, 'index2word'):
model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary
if not hasattr(model, 'corpus_count'):
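The load() hunk above backfills the new attribute on models saved before this change, so older pickles keep working. A minimal usage sketch of both ends (not part of the diff; the corpus and file path below are placeholders):

from gensim.models import Word2Vec

# Train with the new argument: ns_exponent=1.0 draws noise words in exact
# proportion to their corpus frequency (the old behavior fixed power=0.75).
sentences = [["human", "interface", "computer"], ["survey", "user", "computer"]]
model = Word2Vec(sentences, size=10, min_count=1, negative=5, ns_exponent=1.0)

# Models pickled before this change have no ns_exponent attribute; load()
# backfills the previous hard-coded default so make_cum_table() still works.
old_model = Word2Vec.load("pre_ns_exponent_model.w2v")  # placeholder path
assert old_model.ns_exponent == 0.75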
11 changes: 8 additions & 3 deletions gensim/models/doc2vec.py
@@ -325,6 +325,11 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
If > 0, negative sampling will be used; the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+        ns_exponent : float
+            The exponent used to smooth the cumulative distribution used for negative sampling.
+            1.0 leads to a sampling based on the frequency distribution, 0.0 makes items being sampled equally,
+            while a negative value makes unpopular items being sampled more often than popular ones. The default value
+            is empirically set to 0.75, following the original Word2Vec paper.

Collaborator comment on this docstring:

For clarity, grammar, and to give a hint of when this could be beneficially tuned, I'd reword as:

"The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that other values may perform better for recommendation applications."
dm_mean : int {1,0}
If 0, use the sum of the context word vectors. If 1, use the mean.
Only applies when `dm` is used in non-concatenative mode.
@@ -383,7 +388,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
self.dm_tag_count = int(dm_tag_count)

kwargs['null_word'] = dm_concat
-        vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word']
+        vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent']
vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs)
self.vocabulary = Doc2VecVocab(**vocabulary_kwargs)

@@ -790,10 +795,10 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No


class Doc2VecVocab(Word2VecVocab):
-    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
super(Doc2VecVocab, self).__init__(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=sorted_vocab, null_word=null_word)
+            sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)

def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
logger.info("collecting all words and their counts")
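To make the reviewer's suggested wording concrete, here is a small standalone sketch (not part of the PR; the toy counts are invented) of how the exponent reshapes the noise distribution:

# Normalized negative-sampling probabilities for toy word counts: 1.0 is
# proportional to frequency, 0.0 is uniform, and a negative exponent
# favors the rarest words.
counts = {"the": 1000, "cat": 50, "sat": 20, "zyzzyva": 1}

def noise_distribution(counts, ns_exponent):
    weights = {word: count ** ns_exponent for word, count in counts.items()}
    total = sum(weights.values())
    return {word: weight / total for word, weight in weights.items()}

for exponent in (1.0, 0.75, 0.0, -0.5):
    dist = noise_distribution(counts, exponent)
    print(exponent, {word: round(p, 3) for word, p in dist.items()})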
15 changes: 10 additions & 5 deletions gensim/models/fasttext.py
@@ -162,8 +162,8 @@ class FastText(BaseWordEmbeddingsModel):
"""
def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
-                 negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
-                 bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
+                 negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
+                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
"""Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.

@@ -210,6 +210,11 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
If > 0, negative sampling will be used; the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+        ns_exponent : float
+            The exponent used to smooth the cumulative distribution used for negative sampling.
+            1.0 leads to a sampling based on the frequency distribution, 0.0 makes items being sampled equally,
+            while a negative value makes unpopular items being sampled more often than popular ones. The default value
+            is empirically set to 0.75, following the original Word2Vec paper.

Collaborator comment on this docstring:

Same as above.
cbow_mean : int {1,0}
If 0, use the sum of the context word vectors. If 1, use the mean; only applies when cbow is used.
hashfxn : function
@@ -267,7 +272,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
self.wv = FastTextKeyedVectors(size, min_n, max_n)
self.vocabulary = FastTextVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word)
+            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
self.trainables = FastTextTrainables(
vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
self.wv.bucket = self.bucket
@@ -731,10 +736,10 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse


class FastTextVocab(Word2VecVocab):
-    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
super(FastTextVocab, self).__init__(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=sorted_vocab, null_word=null_word)
+            sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)

def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None,
min_count=None, sample=None, dry_run=False):
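The parameter is threaded through FastText identically; a hypothetical usage sketch (the corpus below is illustrative, not from the diff):

from gensim.models import FastText

# ns_exponent=0.0 draws noise words uniformly, ignoring corpus frequency.
sentences = [["machine", "learning"], ["deep", "learning"]]
model = FastText(sentences, size=10, min_count=1, negative=5, ns_exponent=0.0)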
20 changes: 13 additions & 7 deletions gensim/models/word2vec.py
@@ -425,7 +425,7 @@ class Word2Vec(BaseWordEmbeddingsModel):

def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
-                 sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
+                 sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
max_final_vocab=None):
"""
@@ -480,6 +480,11 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
If > 0, negative sampling will be used; the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+        ns_exponent : float
+            The exponent used to smooth the cumulative distribution used for negative sampling.
+            1.0 leads to a sampling based on the frequency distribution, 0.0 makes items being sampled equally,
+            while a negative value makes unpopular items being sampled more often than popular ones. The default value
+            is empirically set to 0.75, following the original Word2Vec paper.
cbow_mean : int {1,0}
If 0, use the sum of the context word vectors. If 1, use the mean; only applies when cbow is used.
hashfxn : function
@@ -523,8 +528,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,

self.wv = Word2VecKeyedVectors(size)
self.vocabulary = Word2VecVocab(
-            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab)
+            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab),
+            null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent)
self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

super(Word2Vec, self).__init__(
@@ -1146,7 +1151,7 @@ def __iter__(self):

class Word2VecVocab(utils.SaveLoad):
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
-                 max_final_vocab=None):
+                 max_final_vocab=None, ns_exponent=0.75):
self.max_vocab_size = max_vocab_size
self.min_count = min_count
self.sample = sample
@@ -1155,6 +1160,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
self.cum_table = None # for negative sampling
self.raw_vocab = None
self.max_final_vocab = max_final_vocab
+        self.ns_exponent = ns_exponent

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
"""Do an initial scan of all words appearing in sentences."""
@@ -1397,7 +1403,7 @@ def create_binary_tree(self, wv):

logger.info("built huffman tree with maximum node depth %i", max_depth)

-    def make_cum_table(self, wv, power=0.75, domain=2**31 - 1):
+    def make_cum_table(self, wv, domain=2**31 - 1):
"""Create a cumulative-distribution table using stored vocabulary word counts for
drawing random words in the negative-sampling training routines.

@@ -1413,10 +1419,10 @@ def make_cum_table(self, wv, domain=2**31 - 1):
# compute sum of all power (Z in paper)
train_words_pow = 0.0
for word_index in xrange(vocab_size):
-            train_words_pow += wv.vocab[wv.index2word[word_index]].count**power
+            train_words_pow += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent
cumulative = 0.0
for word_index in xrange(vocab_size):
-            cumulative += wv.vocab[wv.index2word[word_index]].count**power
+            cumulative += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent
self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
if len(self.cum_table) > 0:
assert self.cum_table[-1] == domain
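make_cum_table() now reads the exponent from the vocabulary object instead of the former hard-coded power=0.75 argument. A standalone sketch of the same table construction, plus how a noise word is drawn from it (assumptions: plain lists and bisect stand in for gensim's numpy array and Cython search):

import bisect
import random

counts = [1000, 50, 20, 1]   # word counts, ordered like wv.index2word
ns_exponent = 0.75
domain = 2**31 - 1

# Z in the paper: sum of all counts raised to the exponent.
train_words_pow = sum(count ** ns_exponent for count in counts)

# Entry i of the cumulative table holds the scaled probability mass of
# words 0..i; the last entry must land exactly on `domain`.
cum_table, cumulative = [], 0.0
for count in counts:
    cumulative += count ** ns_exponent
    cum_table.append(round(cumulative / train_words_pow * domain))
assert cum_table[-1] == domain

# To draw one noise word, pick a random integer below `domain` and find the
# first table entry that reaches it (a binary search, equivalent to the one
# the negative-sampling training routines perform).
word_index = bisect.bisect_left(cum_table, random.randint(0, domain - 1))
print("sampled word index:", word_index)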