Skip to content

Commit

Permalink
Improve filter_extremes methods in Dictionary and HashDictionary
Browse files Browse the repository at this point in the history
- use set membership lookups instead of list lookups
- refine the default value passed to dict.get()
- sort in place (list.sort) instead of building a new list with sorted()
  • Loading branch information
horpto committed Dec 19, 2018
1 parent 3d5a21c commit f0d0621
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
14 changes: 7 additions & 7 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,18 +357,18 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N

# determine which tokens to keep
if keep_tokens:
keep_ids = [self.token2id[v] for v in keep_tokens if v in self.token2id]
good_ids = (
keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
good_ids = [
v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
)
good_ids = sorted(good_ids, key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x), reverse=True)
]
good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
else:
good_ids = (
good_ids = [
v for v in itervalues(self.token2id)
if no_below <= self.dfs.get(v, 0) <= no_above_abs
)
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
]
good_ids.sort(key=self.dfs.get, reverse=True)
if keep_n is not None:
good_ids = good_ids[:keep_n]
bad_words = [(self[idx], self.dfs.get(idx, 0)) for idx in set(self).difference(good_ids)]
Expand Down
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
tokenid: {token for token in tokens if token in self.dfs_debug}
for tokenid, tokens in iteritems(self.id2token)
}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, set())}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)}

# for word->document frequency
logger.info(
Expand Down

0 comments on commit f0d0621

Please sign in to comment.