From 2bc36139a60ee990de9d3aa43f378a00fee15a90 Mon Sep 17 00:00:00 2001
From: Cristi Burca
Date: Fri, 31 May 2019 16:08:33 +0100
Subject: [PATCH 1/4] Expose max_final_vocab parameter in FastText constructor

---
 gensim/models/fasttext.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 4739534612..29714cacb0 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -461,7 +461,7 @@ class FastText(BaseWordEmbeddingsModel):
     """
     def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
-                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
+                 max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                  sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
                  compatible_hash=True):
         """
@@ -507,6 +507,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
             Limits the RAM during vocabulary building; if there are more unique
             words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
             Set to `None` for no limit.
+        max_final_vocab : int, optional
+            Prunes the final vocabulary to this number of word types.
+
+            Set to `None` to disable.
         sample : float, optional
             The threshold for configuring which higher-frequency words are randomly downsampled,
             useful range is (0, 1e-5).
@@ -589,8 +593,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
         self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash)
         self.vocabulary = FastTextVocab(
-            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
+            max_vocab_size=max_vocab_size,
+            max_final_vocab=max_final_vocab,
+            min_count=min_count,
+            sample=sample,
+            sorted_vocab=bool(sorted_vocab),
+            null_word=null_word,
+            ns_exponent=ns_exponent
+        )
         self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
         self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary)
         self.wv.bucket = self.trainables.bucket
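A minimal usage sketch of the parameter this patch exposes (not part of the patch; it assumes gensim 3.x, where the constructor takes `size` and the vocabulary is reachable as `model.wv.vocab`, and uses the `common_texts` toy corpus from `gensim.test.utils`):

    from gensim.models import FastText
    from gensim.test.utils import common_texts

    # Keep at most 10 word types: gensim raises the effective min_count
    # until the surviving vocabulary fits under the cap.
    model = FastText(common_texts, size=4, window=3, min_count=1, max_final_vocab=10)
    print(len(model.wv.vocab))  # at most 10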
""" - def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, - sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), - compatible_hash=True): + def __init__( + self, + sentences=None, + corpus_file=None, + sg=0, + hs=0, + size=100, + alpha=0.025, + window=5, + min_count=5, + max_vocab_size=None, + max_final_vocab=None, + word_ngrams=1, + sample=1e-3, + seed=1, + workers=3, + min_alpha=0.0001, + negative=5, + ns_exponent=0.75, + cbow_mean=1, + hashfxn=hash, + iter=5, + null_word=0, + min_n=3, + max_n=6, + sorted_vocab=1, + bucket=2000000, + trim_rule=None, + batch_words=MAX_WORDS_IN_BATCH, + callbacks=(), + compatible_hash=True + ): """ Parameters From 97e7a293202ebe1f849ca55847594e06fcc1f8c4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 26 Jun 2020 11:17:51 +0900 Subject: [PATCH 3/4] respond to reviewer comments --- gensim/models/fasttext.py | 55 +++++++++------------------------------ 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 9d44965a9c..2307b04468 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -342,38 +342,11 @@ class FastText(BaseWordEmbeddingsModel): for the internal structure of words, besides their concurrence counts. """ - def __init__( - self, - sentences=None, - corpus_file=None, - sg=0, - hs=0, - size=100, - alpha=0.025, - window=5, - min_count=5, - max_vocab_size=None, - max_final_vocab=None, - word_ngrams=1, - sample=1e-3, - seed=1, - workers=3, - min_alpha=0.0001, - negative=5, - ns_exponent=0.75, - cbow_mean=1, - hashfxn=hash, - iter=5, - null_word=0, - min_n=3, - max_n=6, - sorted_vocab=1, - bucket=2000000, - trim_rule=None, - batch_words=MAX_WORDS_IN_BATCH, - callbacks=(), - compatible_hash=True - ): + def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), + compatible_hash=True, max_final_vocab=None): """ Parameters @@ -416,10 +389,6 @@ def __init__( Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to `None` for no limit. - max_final_vocab : int, optional - Prunes the final vocabulary to this number of word types. - - Set to `None` to disable. sample : float, optional The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). @@ -479,6 +448,12 @@ def __init__( Older versions were not 100% compatible due to a bug. To use the older, incompatible hash function, set this to False. + max_final_vocab : int, optional + Limits the vocab to a target vocab size by automatically selecting + ``min_count```. If the specified ``min_count`` is more than the + automatically calculated ``min_count``, the former will be used. + Set to ``None`` if not required. 
From c5ea4df19b535012bba1251e1aedea19e2da31d7 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Fri, 26 Jun 2020 11:24:36 +0900
Subject: [PATCH 4/4] add unit test

---
 gensim/test/test_fasttext.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 3517a355a9..8f691d4608 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -866,6 +866,27 @@ def test_sg_hs_against_wrapper(self):
         self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training
         self.compare_with_wrapper(model_gensim, model_wrapper)

+    def test_vocab_pruning(self):
+        """Does the model correctly interpret the max_final_vocab parameter?"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+        ]
+        model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=2)
+        self.assertEqual(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=1)
+        self.assertEqual(len(model.wv.vocab), 1)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        model = FT_gensim(sentences, size=10, min_count=4)
+        self.assertEqual(len(model.wv.vocab), 1)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+

 with open(datapath('toy-data.txt')) as fin:
     TOY_SENTENCES = [fin.read().strip().split(' ')]
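To exercise just the new test locally, something like the following should work (assuming pytest is available and the working directory is the repository root; gensim's own CI may invoke the suite differently):

    import pytest

    # Select only the new test case by name.
    pytest.main(['gensim/test/test_fasttext.py', '-k', 'test_vocab_pruning', '-v'])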