From f0d0621d187cb15ce597ec406920b3cf944e31c4 Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Wed, 19 Dec 2018 10:02:53 +0500 Subject: [PATCH] Improve filter_extremes methods in Dictionary and HashDictionary - use search by set instead of list - refine default value of dict.get() method - inplace sort --- gensim/corpora/dictionary.py | 14 +++++++------- gensim/corpora/hashdictionary.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 8d2ce58364..c08d4e31b8 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -357,18 +357,18 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N # determine which tokens to keep if keep_tokens: - keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id] - good_ids = ( + keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id} + good_ids = [ v for v in itervalues(self.token2id) if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids - ) - good_ids = sorted(good_ids, key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x), reverse=True) + ] + good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True) else: - good_ids = ( + good_ids = [ v for v in itervalues(self.token2id) if no_below <= self.dfs.get(v, 0) <= no_above_abs - ) - good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) + ] + good_ids.sort(key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)] diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 141f384271..433f61aa42 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -303,7 +303,7 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): tokenid: {token for token in tokens if token in self.dfs_debug} for tokenid, tokens in iteritems(self.id2token) } - self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())} + self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)} # for word->document frequency logger.info(