From 2bc36139a60ee990de9d3aa43f378a00fee15a90 Mon Sep 17 00:00:00 2001
From: Cristi Burca
Date: Fri, 31 May 2019 16:08:33 +0100
Subject: [PATCH 1/4] Expose max_final_vocab parameter in FastText constructor

---
 gensim/models/fasttext.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 4739534612..29714cacb0 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -461,7 +461,7 @@ class FastText(BaseWordEmbeddingsModel):
     """
     def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
-                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
+                 max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                  sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
                  compatible_hash=True):
         """
@@ -507,6 +507,10 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
             Limits the RAM during vocabulary building; if there are more unique
             words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
             Set to `None` for no limit.
+        max_final_vocab : int, optional
+            Prunes the final vocabulary to this number of word types.
+
+            Set to `None` to disable.
         sample : float, optional
             The threshold for configuring which higher-frequency words are randomly downsampled,
             useful range is (0, 1e-5).
@@ -589,8 +593,14 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
         self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash)
         self.vocabulary = FastTextVocab(
-            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
+            max_vocab_size=max_vocab_size,
+            max_final_vocab=max_final_vocab,
+            min_count=min_count,
+            sample=sample,
+            sorted_vocab=bool(sorted_vocab),
+            null_word=null_word,
+            ns_exponent=ns_exponent
+        )
         self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
         self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary)
         self.wv.bucket = self.trainables.bucket
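A minimal usage sketch of the parameter this patch exposes (not part of the patch; it assumes gensim 3.x, where the constructor takes `size` and the vocabulary is reachable as `model.wv.vocab`, and uses the `common_texts` toy corpus from `gensim.test.utils`):

    from gensim.models import FastText
    from gensim.test.utils import common_texts

    # Keep at most 10 word types: gensim raises the effective min_count
    # until the surviving vocabulary fits under the cap.
    model = FastText(common_texts, size=4, window=3, min_count=1, max_final_vocab=10)
    print(len(model.wv.vocab))  # at most 10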
""" - def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, max_final_vocab=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, - sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), - compatible_hash=True): + def __init__( + self, + sentences=None, + corpus_file=None, + sg=0, + hs=0, + size=100, + alpha=0.025, + window=5, + min_count=5, + max_vocab_size=None, + max_final_vocab=None, + word_ngrams=1, + sample=1e-3, + seed=1, + workers=3, + min_alpha=0.0001, + negative=5, + ns_exponent=0.75, + cbow_mean=1, + hashfxn=hash, + iter=5, + null_word=0, + min_n=3, + max_n=6, + sorted_vocab=1, + bucket=2000000, + trim_rule=None, + batch_words=MAX_WORDS_IN_BATCH, + callbacks=(), + compatible_hash=True + ): """ Parameters From 97e7a293202ebe1f849ca55847594e06fcc1f8c4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 26 Jun 2020 11:17:51 +0900 Subject: [PATCH 3/4] respond to reviewer comments --- gensim/models/fasttext.py | 55 +++++++++------------------------------ 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 9d44965a9c..2307b04468 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -342,38 +342,11 @@ class FastText(BaseWordEmbeddingsModel): for the internal structure of words, besides their concurrence counts. """ - def __init__( - self, - sentences=None, - corpus_file=None, - sg=0, - hs=0, - size=100, - alpha=0.025, - window=5, - min_count=5, - max_vocab_size=None, - max_final_vocab=None, - word_ngrams=1, - sample=1e-3, - seed=1, - workers=3, - min_alpha=0.0001, - negative=5, - ns_exponent=0.75, - cbow_mean=1, - hashfxn=hash, - iter=5, - null_word=0, - min_n=3, - max_n=6, - sorted_vocab=1, - bucket=2000000, - trim_rule=None, - batch_words=MAX_WORDS_IN_BATCH, - callbacks=(), - compatible_hash=True - ): + def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), + compatible_hash=True, max_final_vocab=None): """ Parameters @@ -416,10 +389,6 @@ def __init__( Limits the RAM during vocabulary building; if there are more unique words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. Set to `None` for no limit. - max_final_vocab : int, optional - Prunes the final vocabulary to this number of word types. - - Set to `None` to disable. sample : float, optional The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5). @@ -479,6 +448,12 @@ def __init__( Older versions were not 100% compatible due to a bug. To use the older, incompatible hash function, set this to False. + max_final_vocab : int, optional + Limits the vocab to a target vocab size by automatically selecting + ``min_count```. If the specified ``min_count`` is more than the + automatically calculated ``min_count``, the former will be used. + Set to ``None`` if not required. 
From c5ea4df19b535012bba1251e1aedea19e2da31d7 Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Fri, 26 Jun 2020 11:24:36 +0900
Subject: [PATCH 4/4] add unit test

---
 gensim/test/test_fasttext.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 3517a355a9..8f691d4608 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -866,6 +866,27 @@ def test_sg_hs_against_wrapper(self):
         self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training
         self.compare_with_wrapper(model_gensim, model_wrapper)

+    def test_vocab_pruning(self):
+        """Does the model correctly interpret the max_final_vocab parameter?"""
+        sentences = [
+            ["graph", "system"],
+            ["graph", "system"],
+            ["system", "eps"],
+            ["graph", "system"],
+        ]
+        model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=2)
+        self.assertEqual(len(model.wv.vocab), 2)
+        self.assertEqual(model.wv.vocab['graph'].count, 3)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        model = FT_gensim(sentences, size=10, min_count=2, max_final_vocab=1)
+        self.assertEqual(len(model.wv.vocab), 1)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+
+        model = FT_gensim(sentences, size=10, min_count=4)
+        self.assertEqual(len(model.wv.vocab), 1)
+        self.assertEqual(model.wv.vocab['system'].count, 4)
+

 with open(datapath('toy-data.txt')) as fin:
     TOY_SENTENCES = [fin.read().strip().split(' ')]
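To exercise just the new test locally, something like the following should work (assuming pytest is available and the working directory is the repository root; gensim's own CI may invoke the suite differently):

    import pytest

    # Select only the new test case by name.
    pytest.main(['gensim/test/test_fasttext.py', '-k', 'test_vocab_pruning', '-v'])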