From 5348a6945b873a57226811249d31099c31b0a983 Mon Sep 17 00:00:00 2001
From: Silvia Terragni <s.terragni4@campus.unimib.it>
Date: Wed, 21 Jul 2021 14:43:45 +0200
Subject: [PATCH 1/4] fix #3181: stop skipping texts without relevant words

---
 gensim/topic_coherence/text_analysis.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 83cbdc6471..bd440c38ad 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -300,18 +300,10 @@ def accumulate(self, texts, window_size):
     def _iter_texts(self, texts):
         dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32
         for text in texts:
-            if self.text_is_relevant(text):
-                yield np.fromiter((
-                    self.id2contiguous[self.token2id[w]] if w in self.relevant_words
-                    else self._none_token
-                    for w in text), dtype=dtype, count=len(text))
-
-    def text_is_relevant(self, text):
-        """Check if the text has any relevant words."""
-        for word in text:
-            if word in self.relevant_words:
-                return True
-        return False
+            yield np.fromiter((
+                self.id2contiguous[self.token2id[w]] if w in self.relevant_words
+                else self._none_token
+                for w in text), dtype=dtype, count=len(text))
 
 
 class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased):

From 9c83e5193396610befd10f50baf840849fd7234b Mon Sep 17 00:00:00 2001
From: Silvia Terragni <s.terragni4@campus.unimib.it>
Date: Wed, 21 Jul 2021 14:46:08 +0200
Subject: [PATCH 2/4] add test for per-topic coherence consistency

---
 gensim/test/test_coherencemodel.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 9396fe5ac0..5ac159ec87 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -70,8 +70,13 @@ def check_coherence_measure(self, coherence):
         cm2 = CoherenceModel(topics=self.topics2, **kwargs)
         cm3 = CoherenceModel(topics=self.topics3, **kwargs)
         cm4 = CoherenceModel(topics=self.topicIds1, **kwargs)
+
+        # a topic's coherence must be the same whether it is scored alone or alongside other topics
+        cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs)
+
         self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs))
         self.assertEqual(cm1.get_coherence(), cm4.get_coherence())
+        self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence())
         self.assertIsInstance(cm3.get_coherence(), np.double)
         self.assertGreater(cm1.get_coherence(), cm2.get_coherence())
 

From 72debfbbf20c1a5425cc98612773303ce8d131b0 Mon Sep 17 00:00:00 2001
From: Silvia Terragni <s.terragni4@campus.unimib.it>
Date: Sat, 26 Feb 2022 12:13:27 +0100
Subject: [PATCH 3/4] improve readability

---
 gensim/topic_coherence/text_analysis.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 62a4b84df0..2c06185a0b 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -300,10 +300,11 @@ def accumulate(self, texts, window_size):
     def _iter_texts(self, texts):
         dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32
         for text in texts:
-            yield np.fromiter((
-                self.id2contiguous[self.token2id[w]] if w in self.relevant_words
-                else self._none_token
-                for w in text), dtype=dtype, count=len(text))
+            ids = (
+                self.id2contiguous[self.token2id[w]] if w in self.relevant_words else self._none_token
+                for w in text
+            )
+            yield np.fromiter(ids, dtype=dtype, count=len(text))
 
 
 class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased):

From 298880bbad2f4c582013157cf2354a1e6a2e8436 Mon Sep 17 00:00:00 2001
From: Silvia Terragni <s.terragni4@campus.unimib.it>
Date: Sat, 26 Feb 2022 12:15:21 +0100
Subject: [PATCH 4/4] add test for topics with unseen words

---
 gensim/test/test_coherencemodel.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 5ac159ec87..2b111f7306 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -50,6 +50,11 @@ def setUp(self):
             ['not a token', 'not an id', 'tests using', "this list"],
             ['should raise', 'an error', 'to pass', 'correctly']
         ]
+        # list of topics whose words are not present in the dictionary
+        self.topics5 = [
+            ['aaaaa', 'bbbbb', 'ccccc', 'eeeee'],
+            ['ddddd', 'fffff', 'ggggh', 'hhhhh']
+        ]
         self.topicIds1 = []
         for topic in self.topics1:
             self.topicIds1.append([self.dictionary.token2id[token] for token in topic])
@@ -75,6 +80,7 @@ def check_coherence_measure(self, coherence):
         cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs)
 
         self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs))
+        self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics5, **kwargs))
         self.assertEqual(cm1.get_coherence(), cm4.get_coherence())
         self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence())
         self.assertIsInstance(cm3.get_coherence(), np.double)