From fbf776b503f4df92683c2ee8276183df52e9762a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 26 Jul 2021 14:56:05 +0200 Subject: [PATCH 1/7] improve docs --- gensim/corpora/dictionary.py | 4 +--- gensim/models/word2vec.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 9ebfceb09a..d954061caf 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -26,9 +26,7 @@ class Dictionary(utils.SaveLoad, Mapping): Attributes ---------- token2id : dict of (str, int) - token -> tokenId. - id2token : dict of (int, str) - Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed). + token -> token_id. I.e. the reverse mapping to `self[token_id]`. cfs : dict of (int, int) Collection frequencies: token_id -> how many instances of this token are contained in the documents. dfs : dict of (int, int) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 1a34c367e6..356f711408 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -529,7 +529,7 @@ def build_vocab_from_freq( # to be directly the raw vocab raw_vocab = word_freq logger.info( - "collected %i different raw word, with total frequency of %i", + "collected %i unique word types, with total frequency of %i", len(raw_vocab), sum(raw_vocab.values()), ) @@ -611,8 +611,8 @@ def prepare_vocab( # set effective_min_count to min_count in case max_final_vocab isn't set self.effective_min_count = min_count - # if max_final_vocab is specified instead of min_count - # pick a min_count which satisfies max_final_vocab as well as possible + # If max_final_vocab is specified instead of min_count, + # pick a min_count which satisfies max_final_vocab as well as possible. 
if self.max_final_vocab is not None: sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) calc_min_count = 1 From 6cdcb018e83ac670460d556f8b4a23de204e3b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 26 Jul 2021 14:56:16 +0200 Subject: [PATCH 2/7] reduce MAX_WORD_LENGTH in FastSS Levenshtein --- gensim/similarities/fastss.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index 07203073d7..a4e8cba54b 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -15,13 +15,13 @@ import itertools from cpython.ref cimport PyObject -DEF MAX_WORD_LENGTH = 10000 # Maximum allowed word length, in characters. Must fit in the C `int` range. +DEF MAX_WORD_LENGTH = 1000 # Maximum allowed word length, in characters. Must fit in the C `int` range. cdef extern from *: """ #define WIDTH int - #define MAX_WORD_LENGTH 10000 + #define MAX_WORD_LENGTH 1000 int ceditdist(PyObject * s1, PyObject * s2, WIDTH maximum) { WIDTH row1[MAX_WORD_LENGTH + 1]; From 982aa1fe179b4a59dae00e3fa3e5b44016bec449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 26 Jul 2021 15:35:11 +0200 Subject: [PATCH 3/7] update CHANGELOG --- CHANGELOG.md | 82 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a05a5567a..7bb150c347 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,37 +3,75 @@ Changes ## Unreleased +Gensim 4.1 bring two major new functionalities: + +* [Ensamble LDA](https://radimrehurek.com/gensim/auto_examples/tutorials/run_ensemblelda.html) for robust training, selection and comparison of LDA models. +* [FastSS module](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/similarities/fastss.pyx) for super fast Levenshtein "fuzzy search" queries. Used e.g. 
for ["soft term similarity"](https://github.com/RaRe-Technologies/gensim/pull/3146) calculations. + +Plus a large number of smaller improvements and fixes, as usual. + +**⚠️ If migrating from old Gensim 3.x, read the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) first.** + + +### :+1: New features + +* [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec, by [@M-Demay](https://github.com/M-Demay) +* [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0) +* [#3157](https://github.com/RaRe-Technologies/gensim/pull/3157): New KeyedVectors.vectors_for_all method for vectorizing all words in a dictionary, by [@Witiko](https://github.com/Witiko) +* [#3153](https://github.com/RaRe-Technologies/gensim/pull/3153): Vectorize word2vec.predict_output_word for speed, by [@M-Demay](https://github.com/M-Demay) +* [#3146](https://github.com/RaRe-Technologies/gensim/pull/3146): Use FastSS for fast kNN over Levenshtein distance, by [@Witiko](https://github.com/Witiko) +* [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko) +* [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness) +* [#3091](https://github.com/RaRe-Technologies/gensim/pull/3091): LsiModel: Only log top words that actually exist in the dictionary, by [@kmurphy4](https://github.com/kmurphy4) +* [#2980](https://github.com/RaRe-Technologies/gensim/pull/2980): Added EnsembleLda for stable LDA topics, by [@sezanzeb](https://github.com/sezanzeb) +* [#2978](https://github.com/RaRe-Technologies/gensim/pull/2978): Optimize performance of Author-Topic model, by 
[@horpto](https://github.com/horpto) + + +### :books: Tutorials and docs + +* [#3155](https://github.com/RaRe-Technologies/gensim/pull/3155): Correct parameter name in documentation of fasttext.py, by [@bizzyvinci](https://github.com/bizzyvinci) +* [#3148](https://github.com/RaRe-Technologies/gensim/pull/3148): Fix broken link to mycorpus.txt in documentation, by [@rohit901](https://github.com/rohit901) +* [#3142](https://github.com/RaRe-Technologies/gensim/pull/3142): Use more permanent pdf link and update code link, by [@dymil](https://github.com/dymil) +* [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil) +* [#3133](https://github.com/RaRe-Technologies/gensim/pull/3133): Update link to Hoffman paper (online VB LDA), by [@jonaschn](https://github.com/jonaschn) +* [#3129](https://github.com/RaRe-Technologies/gensim/pull/3129): [MRG] Add bronze sponsor: TechTarget, by [@piskvorky](https://github.com/piskvorky) +* [#3126](https://github.com/RaRe-Technologies/gensim/pull/3126): Fix typos in make_wiki_online.py and make_wikicorpus.py, by [@nicolasassi](https://github.com/nicolasassi) +* [#3125](https://github.com/RaRe-Technologies/gensim/pull/3125): Improve & unify docs for dirichlet priors, by [@jonaschn](https://github.com/jonaschn) +* [#3123](https://github.com/RaRe-Technologies/gensim/pull/3123): Fix hyperlink for doc2vec tutorial, by [@AdityaSoni19031997](https://github.com/AdityaSoni19031997) +* [#3121](https://github.com/RaRe-Technologies/gensim/pull/3121): [MRG] Add bronze sponsor: eaccidents.com, by [@piskvorky](https://github.com/piskvorky) +* [#3120](https://github.com/RaRe-Technologies/gensim/pull/3120): Fix URL for ldamodel.py, by [@jonaschn](https://github.com/jonaschn) +* [#3118](https://github.com/RaRe-Technologies/gensim/pull/3118): Fix URL in doc string, by [@jonaschn](https://github.com/jonaschn) +* [#3107](https://github.com/RaRe-Technologies/gensim/pull/3107): 
Draw attention to sponsoring in README, by [@piskvorky](https://github.com/piskvorky) +* [#3105](https://github.com/RaRe-Technologies/gensim/pull/3105): Fix documentation links: Travis to Github Actions, by [@piskvorky](https://github.com/piskvorky) +* [#3057](https://github.com/RaRe-Technologies/gensim/pull/3057): Clarify doc comment in LdaModel.inference(), by [@yocen](https://github.com/yocen) +* [#2964](https://github.com/RaRe-Technologies/gensim/pull/2964): Document that preprocessing.strip_punctuation is limited to ASCII, by [@sciatro](https://github.com/sciatro) + + ### :red_circle: Bug fixes -* [#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29) -* [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): Fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura) -* [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix a bug when upgrading phraser from gensim 3.x to 4.0, by [@emgucv](https://github.com/emgucv) * [#3178](https://github.com/RaRe-Technologies/gensim/pull/3178): Fix Unicode string incompatibility in gensim.similarities.fastss.editdist, by [@Witiko](https://github.com/Witiko) -* [#3176](https://github.com/RaRe-Technologies/gensim/pull/3176): Eliminate obsolete step parameter from doc2vec infer_vector and similarity_unseen_docs, by [@rock420](https://github.com/rock420) +* [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix loading Phraser models stored in Gensim 3.x into Gensim 4.0, by [@emgucv](https://github.com/emgucv) +* [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): Fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura) +* [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Add missing import to NMF docs and models/__init__.py, by [@properGrammar](https://github.com/properGrammar) +* 
[#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29) * [#2830](https://github.com/RaRe-Technologies/gensim/pull/2830): Fixed KeyError in coherence model, by [@pietrotrope](https://github.com/pietrotrope) -### :+1: Improvements -* [#2978](https://github.com/RaRe-Technologies/gensim/pull/2978): Optimize performance of Author-Topic model, by [@horpto](https://github.com/horpto) -* [#3091](https://github.com/RaRe-Technologies/gensim/pull/3091): LsiModel: Only log top words that actually exist in the dictionary, by [@kmurphy4](https://github.com/kmurphy4) -* [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness) -* [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko) -* [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar) -* [#3153](https://github.com/RaRe-Technologies/gensim/pull/3153): Vectorize word2vec.predict_output_word for speed, by [@M-Demay](https://github.com/M-Demay) -* [#3157](https://github.com/RaRe-Technologies/gensim/pull/3157): New KeyedVectors.vectors_for_all method for vectorizing all words in a dictionary, by [@Witiko](https://github.com/Witiko) -* [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0) +### :warning: Removed functionality & deprecations + +* [#3176](https://github.com/RaRe-Technologies/gensim/pull/3176): Eliminate obsolete step parameter from doc2vec infer_vector and similarity_unseen_docs, by [@rock420](https://github.com/rock420) * 
[#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro) -* [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec., by [@M-Demay](https://github.com/M-Demay) -### :books: Documentation -* [#3123](https://github.com/RaRe-Technologies/gensim/pull/3123): Fix hyperlink for doc2vec tutorial, by [@AdityaSoni19031997](https://github.com/AdityaSoni19031997) -* [#3125](https://github.com/RaRe-Technologies/gensim/pull/3125): Improve & unify docs for dirichlet priors, by [@jonaschn](https://github.com/jonaschn) -* [#3133](https://github.com/RaRe-Technologies/gensim/pull/3133): Update link to Hoffman paper (online VB LDA), by [@jonaschn](https://github.com/jonaschn) -* [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil) -* [#3148](https://github.com/RaRe-Technologies/gensim/pull/3148): Fix broken link in documentation, by [@rohit901](https://github.com/rohit901) -* [#3155](https://github.com/RaRe-Technologies/gensim/pull/3155): Correct parameter name in documentation of fasttext.py, by [@bizzyvinci](https://github.com/bizzyvinci) -* [#2964](https://github.com/RaRe-Technologies/gensim/pull/2964): Document that preprocessing.strip_punctuation is limited to ASCII, by [@sciatro](https://github.com/sciatro) +### 🔮 Testing, CI, housekeeping + +* [#3156](https://github.com/RaRe-Technologies/gensim/pull/3156): Update Numpy minimum version to 1.17.0, by [@PrimozGodec](https://github.com/PrimozGodec) +* [#3143](https://github.com/RaRe-Technologies/gensim/pull/3143): replace _mul function with explicit casts, by [@mpenkov](https://github.com/mpenkov) +* [#2952](https://github.com/RaRe-Technologies/gensim/pull/2952): Allow newer versions of the Morfessor module for the tests, by [@pabs3](https://github.com/pabs3) +* 
[#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro) + + + ## 4.0.1, 2021-04-01 Bugfix release to address issues with Wheels on Windows: From 8cd5ad34ce12cf4ca5cc095a15960de6461572e6 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 27 Jul 2021 12:31:00 +0900 Subject: [PATCH 4/7] Mention backward incompatibilities --- CHANGELOG.md | 59 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bb150c347..26362603c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,16 +3,69 @@ Changes ## Unreleased -Gensim 4.1 bring two major new functionalities: +Gensim 4.1 brings two major new functionalities: -* [Ensamble LDA](https://radimrehurek.com/gensim/auto_examples/tutorials/run_ensemblelda.html) for robust training, selection and comparison of LDA models. +* [Ensemble LDA](https://radimrehurek.com/gensim/auto_examples/tutorials/run_ensemblelda.html) for robust training, selection and comparison of LDA models. * [FastSS module](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/similarities/fastss.pyx) for super fast Levenshtein "fuzzy search" queries. Used e.g. for ["soft term similarity"](https://github.com/RaRe-Technologies/gensim/pull/3146) calculations. +There are several minor changes that are **not** backwards compatible with previous versions of Gensim. +The affected functionality is rarely used and unlikely to affect most users, so we have opted not to require a major version bump. +Nevertheless, we describe them below. + +### Improved parameter edge-case handling in KeyedVectors most_similar and most_similar_cosmul methods + +We now handle both ``positive`` and ``negative`` keyword parameters consistently. +These parameters typically specify the keys (or vectors) that contribute positively or negatively to the similarity query. +They may now be either: + +1. 
A string, in which case the value is reinterpreted as a list of one element (the string value) +2. A vector, in which case the value is reinterpreted as a list of one element (the vector) +3. A list of strings +4. A list of vectors + +So you can now simply do: + +```python + model.most_similar(positive='war', negative='peace') +``` + +instead of the slightly more involved + +```python +model.most_similar(positive=['war'], negative=['peace']) +``` + +Both invocations remain correct, so you can use whichever is most convenient. +If you were somehow expecting Gensim to interpret the strings as a list of characters, e.g. + +```python +model.most_similar(positive=['w', 'a', 'r'], negative=['p', 'e', 'a', 'c', 'e']) +``` + +then you will need to specify the lists explicitly in Gensim 4.1. +### Removed obsolete `steps` parameter from doc2vec + +With the newer version, do this: + +```python +model.infer_vector(..., epochs=123) +``` + +instead of this: + +```python +model.infer_vector(..., steps=123) +``` + +### Hidden KEY_TYPES in gensim/models/keyedvectors.py + +This "constant" is now internal to the keyedvectors submodule. +You can still access it, but do so at your own risk. + Plus a large number of smaller improvements and fixes, as usual. 
**⚠️ If migrating from old Gensim 3.x, read the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) first.** - ### :+1: New features * [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec, by [@M-Demay](https://github.com/M-Demay) From cfa4f584202b31ea921ec7f7a808d0feec830f16 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 27 Jul 2021 15:15:22 +0900 Subject: [PATCH 5/7] removed KEY_TYPES description from change log --- CHANGELOG.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 26362603c9..c23172893e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,11 +57,6 @@ instead of this: model.infer_vector(..., steps=123) ``` -### Hidden KEY_TYPES in gensim/models/keyedvectors.py - -This "constant" is now internal to the keyedvectors submodule. -You can still access it, but do so at your own risk. - Plus a large number of smaller improvements and fixes, as usual. 
**⚠️ If migrating from old Gensim 3.x, read the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) first.** From 11c3fba58872cbfbf7256fe68d40deec760c1fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 7 Aug 2021 09:42:33 +0200 Subject: [PATCH 6/7] hanging indent + improve docs --- gensim/models/fasttext.py | 16 ++++++++++------ gensim/similarities/docsim.py | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ba08a15a92..a94bc17f27 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -496,16 +496,20 @@ def estimate_memory(self, vocab_size=None, report=None): ) return report - def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs): + def _do_train_epoch( + self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=None, total_words=None, **kwargs, + ): work, neu1 = thread_private_mem if self.sg: - examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1) + examples, tally, raw_tally = train_epoch_sg( + self, corpus_file, offset, cython_vocab, cur_epoch, total_examples, total_words, work, neu1, + ) else: - examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch, - total_examples, total_words, work, neu1) + examples, tally, raw_tally = train_epoch_cbow( + self, corpus_file, offset, cython_vocab, cur_epoch, total_examples, total_words, work, neu1, + ) return examples, tally, raw_tally diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index dafc8c7ead..db66db67e0 100644 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -27,6 +27,7 @@ .. 
sourcecode:: pycon + >>> from gensim.similarities import Similarity >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile >>> >>> index_tmpfile = get_tmpfile("index") From 3d1c7b87041988b528586ad65515eca7ac992e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 7 Aug 2021 09:47:27 +0200 Subject: [PATCH 7/7] avoid github API throttling --- release/generate_changelog.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/release/generate_changelog.py b/release/generate_changelog.py index 72b03c7cda..97cc306f62 100644 --- a/release/generate_changelog.py +++ b/release/generate_changelog.py @@ -8,7 +8,7 @@ """Generate changelog entries for all PRs merged since the last release.""" import re import requests - +import time # # The releases get sorted in reverse chronological order, so the first release @@ -37,6 +37,8 @@ def iter_merged_prs(since=release_timestamp): yield pr page += 1 + # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 + time.sleep(1) def iter_closed_issues(since=release_timestamp): @@ -58,6 +60,8 @@ def iter_closed_issues(since=release_timestamp): if 'pull_request' not in issue and issue['closed_at'] > since: yield issue page += 1 + # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 + time.sleep(1) fixed_issue_numbers = set()