From 5348a6945b873a57226811249d31099c31b0a983 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Wed, 21 Jul 2021 14:43:45 +0200 Subject: [PATCH 1/4] fix #3181 --- gensim/topic_coherence/text_analysis.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 83cbdc6471..bd440c38ad 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -300,18 +300,10 @@ def accumulate(self, texts, window_size): def _iter_texts(self, texts): dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: - if self.text_is_relevant(text): - yield np.fromiter(( - self.id2contiguous[self.token2id[w]] if w in self.relevant_words - else self._none_token - for w in text), dtype=dtype, count=len(text)) - - def text_is_relevant(self, text): - """Check if the text has any relevant words.""" - for word in text: - if word in self.relevant_words: - return True - return False + yield np.fromiter(( + self.id2contiguous[self.token2id[w]] if w in self.relevant_words + else self._none_token + for w in text), dtype=dtype, count=len(text)) class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): From 9c83e5193396610befd10f50baf840849fd7234b Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Wed, 21 Jul 2021 14:46:08 +0200 Subject: [PATCH 2/4] added tests --- gensim/test/test_coherencemodel.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 9396fe5ac0..5ac159ec87 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -70,8 +70,13 @@ def check_coherence_measure(self, coherence): cm2 = CoherenceModel(topics=self.topics2, **kwargs) cm3 = CoherenceModel(topics=self.topics3, **kwargs) cm4 = CoherenceModel(topics=self.topicIds1, **kwargs) + + # check if the same topic always returns the same coherence value + cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs)) self.assertEqual(cm1.get_coherence(), cm4.get_coherence()) + self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence()) self.assertIsInstance(cm3.get_coherence(), np.double) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) From 72debfbbf20c1a5425cc98612773303ce8d131b0 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Sat, 26 Feb 2022 12:13:27 +0100 Subject: [PATCH 3/4] improve readability --- gensim/topic_coherence/text_analysis.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 62a4b84df0..2c06185a0b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -300,10 +300,11 @@ def accumulate(self, texts, window_size): def _iter_texts(self, texts): dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: - yield np.fromiter(( - self.id2contiguous[self.token2id[w]] if w in self.relevant_words - else self._none_token - for w in text), dtype=dtype, count=len(text)) + ids = ( + self.id2contiguous[self.token2id[w]] if w in self.relevant_words else self._none_token + for w in text + ) + yield np.fromiter(ids, dtype=dtype, count=len(text)) class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): From 298880bbad2f4c582013157cf2354a1e6a2e8436 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Sat, 26 Feb 2022 12:15:21 +0100 Subject: [PATCH 4/4] add test for topics with unseen words --- gensim/test/test_coherencemodel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 5ac159ec87..2b111f7306 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -50,6 +50,11 @@ def setUp(self): ['not a token', 'not an id', 'tests using', "this list"], ['should raise', 'an error', 'to pass', 'correctly'] ] + # list of topics with unseen words in the dictionary + self.topics5 = [ + ['aaaaa', 'bbbbb', 'ccccc', 'eeeee'], + ['ddddd', 'fffff', 'ggggh', 'hhhhh'] + ] self.topicIds1 = [] for topic in self.topics1: self.topicIds1.append([self.dictionary.token2id[token] for token in topic]) @@ -75,6 +80,7 @@ def check_coherence_measure(self, coherence): cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs) self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs)) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics5, **kwargs)) self.assertEqual(cm1.get_coherence(), cm4.get_coherence()) self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence()) self.assertIsInstance(cm3.get_coherence(), np.double)