piskvorky · tmylk · Oct 5, 2015 · Sep 13, 2015 · Sep 13, 2015 · Sep 13, 2015
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,8 +1,17 @@
 Changes
 =======
 
+NEXT
 
-0.12.2
+* Make show_topics return value consistent across models (Christopher Corley, #448)
+  - All models with the `show_topics` method should return a list of
+    `(topic_number, topic)` tuples, where `topic` is a list of
+    `(word, probability)` tuples.
+  - This is a breaking change that affects users of the `LsiModel`, `LdaModel`,
+    and `LdaMulticore` who may rely on the old tuple layout of
+    `(probability, word)`.
+
+0.12.2, 19/09/2015
 
 * tutorial on text summarization (Ólavur Mortensen, #436)
 * more flexible vocabulary construction in word2vec & doc2vec (Philipp Dowling, #434)

diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
@@ -606,7 +606,7 @@ def show_topics(self, topics=10, topn=10, log=False, formatted=True):
                 if log:
                     logger.info(topic)
             else:
-                topic = [k, topic_terms]
+                topic = (k, topic_terms)
             shown.append(topic)
 
         return shown

diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -688,7 +688,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
         (10 words per topic, by default).
 
         The topics are returned as a list -- a list of strings if `formatted` is
-        True, or a list of (probability, word) 2-tuples if False.
+        True, or a list of `(word, probability)` 2-tuples if False.
 
         If `log` is True, also output this result to log.
 
@@ -716,29 +716,38 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
             else:
                 topic = self.show_topic(i, topn=num_words)
 
-            shown.append(topic)
+            shown.append((i, topic))
             if log:
                 logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)
 
         return shown
 
     def show_topic(self, topicid, topn=10):
         """
-        Return a list of `(words_probability, word)` 2-tuples for the most probable
+        Return a list of `(word, probability)` 2-tuples for the most probable
         words in topic `topicid`.
 
         Only return 2-tuples for the topn most probable words (ignore the rest).
 
+        """
+        return [(self.id2word[word_id], prob) for word_id, prob in self.get_topic_terms(topicid, topn)]
+
+    def get_topic_terms(self, topicid, topn=10):
+        """
+        Return the most probable words of topic `topicid` as a list of
+        `(word_id, probability)` 2-tuples.
+
+        At most `topn` 2-tuples are returned; all other words are ignored.
+
         """
         topic = self.state.get_lambda()[topicid]
         topic = topic / topic.sum()  # normalize to probability distribution
         bestn = matutils.argsort(topic, topn, reverse=True)
-        beststr = [(topic[id], self.id2word[id]) for id in bestn]
-        return beststr
+        return [(word_id, topic[word_id]) for word_id in bestn]
 
     def print_topic(self, topicid, topn=10):
         """Return the result of `show_topic`, but formatted as a single string."""
-        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, topn)])
+        return ' + '.join('%.3f*%s' % (prob, word) for word, prob in self.show_topic(topicid, topn))
 
     def top_topics(self, corpus, num_words=20):
         """

diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -474,7 +474,7 @@ def show_topic(self, topicno, topn=10):
         of the topic (both negative and positive).
 
         >>> lsimodel.show_topic(10, topn=5)
-        [(-0.340, "category"), (0.298, "$M$"), (0.183, "algebra"), (-0.174, "functor"), (-0.168, "operator")]
+        [("category", -0.340), ("$M$", 0.298), ("algebra", 0.183), ("functor", -0.174), ("operator", -0.168)]
 
         """
         # size of the projection matrix can actually be smaller than `self.num_topics`,
@@ -485,7 +485,7 @@ def show_topic(self, topicno, topn=10):
         c = numpy.asarray(self.projection.u.T[topicno, :]).flatten()
         norm = numpy.sqrt(numpy.sum(numpy.dot(c, c)))
         most = matutils.argsort(numpy.abs(c), topn, reverse=True)
-        return [(1.0 * c[val] / norm, self.id2word[val]) for val in most]
+        return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]
 
     def print_topic(self, topicno, topn=10):
         """
@@ -495,15 +495,15 @@ def print_topic(self, topicno, topn=10):
         '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'
 
         """
-        return ' + '.join(['%.3f*"%s"' % v for v in self.show_topic(topicno, topn)])
+        return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])
 
     def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
         """
         Return `num_topics` most significant topics (return all by default).
         For each topic, show `num_words` most significant words (10 words by default).
 
         The topics are returned as a list -- a list of strings if `formatted` is
-        True, or a list of (weight, word) 2-tuples if False.
+        True, or a list of `(word, weight)` 2-tuples if False.
 
         If `log` is True, also output this result to log.
 
@@ -517,7 +517,7 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
                     topic = self.print_topic(i, topn=num_words)
                 else:
                     topic = self.show_topic(i, topn=num_words)
-                shown.append(topic)
+                shown.append((i, topic))
                 if log:
                     logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
         return shown

diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <[email protected]>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+"""
+
+
+import logging
+import unittest
+import os
+import os.path
+import tempfile
+
+import six
+import numpy
+import scipy.linalg
+
+from gensim.corpora import mmcorpus, Dictionary
+from gensim.models import hdpmodel
+from gensim import matutils
+
+
+module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
+datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+# set up vars used in testing ("Deerwester" from the web tutorial)
+texts = [['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']]
+dictionary = Dictionary(texts)
+corpus = [dictionary.doc2bow(text) for text in texts]
+
+
+def testfile():
+    """Return the path of the file, inside the system temp directory, reserved for temporary test data."""
+    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
+
+
+
+class TestHdpModel(unittest.TestCase):
+    """Check the layout of the topics returned by `HdpModel.show_topics`."""
+    def setUp(self):
+        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
+        self.class_ = hdpmodel.HdpModel
+        self.model = self.class_(corpus, id2word=dictionary)  # trained on the module-level `corpus`
+
+    def testShowTopics(self):
+        """Unformatted topics must be `(int, [(string, float), ...])` tuples."""
+        for topic_id, terms in self.model.show_topics(formatted=False):
+            self.assertTrue(isinstance(topic_id, int))
+            self.assertTrue(isinstance(terms, list))
+            for word, weight in terms:
+                self.assertTrue(isinstance(word, six.string_types))
+                self.assertTrue(isinstance(weight, float))
+
+
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()
diff --git a/gensim/test/test_ldamallet_wrapper.py b/gensim/test/test_ldamallet_wrapper.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <[email protected]>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+"""
+
+
+import logging
+import unittest
+import os
+import os.path
+import tempfile
+
+import six
+import numpy
+import scipy.linalg
+
+from gensim.corpora import mmcorpus, Dictionary
+from gensim.models.wrappers import ldamallet
+from gensim import matutils
+
+
+module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
+datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+# set up vars used in testing ("Deerwester" from the web tutorial)
+texts = [['human', 'interface', 'computer'],
+ ['survey', 'user', 'computer', 'system', 'response', 'time'],
+ ['eps', 'user', 'interface', 'system'],
+ ['system', 'human', 'system', 'eps'],
+ ['user', 'response', 'time'],
+ ['trees'],
+ ['graph', 'trees'],
+ ['graph', 'minors', 'trees'],
+ ['graph', 'minors', 'survey']]
+dictionary = Dictionary(texts)
+corpus = [dictionary.doc2bow(text) for text in texts]
+
+
+def testfile():
+    """Return the path of the file, inside the system temp directory, reserved for temporary test data."""
+    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
+
+
+class TestLdaMallet(unittest.TestCase):
+    """Tests of the `LdaMallet` wrapper; every test is a silent no-op unless MALLET_HOME is set."""
+    def setUp(self):
+        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
+        mallet_home = os.environ.get('MALLET_HOME', None)
+        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None
+
+    def testTransform(self):
+        """A trained 2-topic model must map the first document to the expected topic mixture."""
+        if not self.mallet_path:
+            return
+        passed = False
+        for i in range(5): # restart at most 5 times
+            # create the transformation model
+            model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)
+            # transform one document
+            doc = list(corpus)[0]
+            transformed = model[doc]
+
+            vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
+            expected = [0.49, 0.51]
+            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering
+            if passed:
+                break
+            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)",
+                            i, sorted(vec), sorted(expected))
+        self.assertTrue(passed)
+
+    def testPersistence(self):
+        """A saved and re-loaded model must match the original."""
+        if not self.mallet_path:
+            return
+        fname = testfile()
+        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
+        model.save(fname)
+        model2 = ldamallet.LdaMallet.load(fname)
+        self.assertEqual(model.num_topics, model2.num_topics)
+        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
+        tstvec = []
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+
+    def testPersistenceCompressed(self):
+        """Same as `testPersistence`, but the model is saved under a `.gz` file name."""
+        if not self.mallet_path:
+            return
+        fname = testfile() + '.gz'
+        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
+        model.save(fname)
+        model2 = ldamallet.LdaMallet.load(fname, mmap=None)
+        self.assertEqual(model.num_topics, model2.num_topics)
+        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
+        tstvec = []
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+
+    def testLargeMmap(self):
+        """A model saved with `sep_limit=0` must load with `mmap='r'` and expose `wordtopics` as a memmap."""
+        if not self.mallet_path:
+            return
+        fname = testfile()
+        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
+        # simulate storing large arrays separately
+        model.save(fname, sep_limit=0)
+        # test loading the large model arrays with mmap (via LdaMallet: `ldamodel` is not imported here)
+        model2 = ldamallet.LdaMallet.load(fname, mmap='r')
+        self.assertEqual(model.num_topics, model2.num_topics)
+        self.assertTrue(isinstance(model2.wordtopics, numpy.memmap))
+        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
+        tstvec = []
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+
+    def testLargeMmapCompressed(self):
+        """Loading a model saved under a `.gz` file name with `mmap='r'` must raise IOError."""
+        if not self.mallet_path:
+            return
+        fname = testfile() + '.gz'
+        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
+        # simulate storing large arrays separately
+        model.save(fname, sep_limit=0)
+        # test loading the large model arrays with mmap (via LdaMallet: `ldamodel` is not imported here)
+        self.assertRaises(IOError, ldamallet.LdaMallet.load, fname, mmap='r')
+#endclass TestLdaMallet
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()