Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make show_topics more consistent across models #448

Merged
merged 5 commits into from
Oct 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
Changes
=======

NEXT

0.12.2
* Make show_topics return value consistent across models (Christopher Corley, #448)
- All models with the `show_topics` method should return a list of
`(topic_number, topic)` tuples, where `topic` is a list of
`(word, probability)` tuples.
- This is a breaking change for users of `LsiModel`, `LdaModel`, and
`LdaMulticore` who rely on the old tuple layout of
`(probability, word)`.

0.12.2, 19/09/2015

* tutorial on text summarization (Ólavur Mortensen, #436)
* more flexible vocabulary construction in word2vec & doc2vec (Philipp Dowling, #434)
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ def show_topics(self, topics=10, topn=10, log=False, formatted=True):
if log:
logger.info(topic)
else:
topic = [k, topic_terms]
topic = (k, topic_terms)
shown.append(topic)

return shown
Expand Down
21 changes: 15 additions & 6 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
(10 words per topic, by default).

The topics are returned as a list -- a list of strings if `formatted` is
True, or a list of (probability, word) 2-tuples if False.
True, or a list of `(word, probability)` 2-tuples if False.

If `log` is True, also output this result to log.

Expand Down Expand Up @@ -716,29 +716,38 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
else:
topic = self.show_topic(i, topn=num_words)

shown.append(topic)
shown.append((i, topic))
if log:
logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)

return shown

def show_topic(self, topicid, topn=10):
"""
Return a list of `(words_probability, word)` 2-tuples for the most probable
Return a list of `(word, probability)` 2-tuples for the most probable
words in topic `topicid`.

Only return 2-tuples for the topn most probable words (ignore the rest).

"""
return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)]

def get_topic_terms(self, topicid, topn=10):
"""
Return a list of `(word_id, probability)` 2-tuples for the most
probable words in topic `topicid`.

Only return 2-tuples for the topn most probable words (ignore the rest).

"""
topic = self.state.get_lambda()[topicid]
topic = topic / topic.sum() # normalize to probability distribution
bestn = matutils.argsort(topic, topn, reverse=True)
beststr = [(topic[id], self.id2word[id]) for id in bestn]
return beststr
return [(id, topic[id]) for id in bestn]

def print_topic(self, topicid, topn=10):
"""Return the result of `show_topic`, but formatted as a single string."""
return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, topn)])
return ' + '.join(['%.3f*%s' % (v, k) for k, v in self.show_topic(topicid, topn)])

def top_topics(self, corpus, num_words=20):
"""
Expand Down
10 changes: 5 additions & 5 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def show_topic(self, topicno, topn=10):
of the topic (both negative and positive).

>>> lsimodel.show_topic(10, topn=5)
[(-0.340, "category"), (0.298, "$M$"), (0.183, "algebra"), (-0.174, "functor"), (-0.168, "operator")]
[("category", -0.340), ("$M$", 0.298), ("algebra", 0.183), ("functor", -0.174), ("operator", -0.168)]

"""
# size of the projection matrix can actually be smaller than `self.num_topics`,
Expand All @@ -485,7 +485,7 @@ def show_topic(self, topicno, topn=10):
c = numpy.asarray(self.projection.u.T[topicno, :]).flatten()
norm = numpy.sqrt(numpy.sum(numpy.dot(c, c)))
most = matutils.argsort(numpy.abs(c), topn, reverse=True)
return [(1.0 * c[val] / norm, self.id2word[val]) for val in most]
return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]

def print_topic(self, topicno, topn=10):
"""
Expand All @@ -495,15 +495,15 @@ def print_topic(self, topicno, topn=10):
'-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + -0.174 * "functor" + -0.168 * "operator"'

"""
return ' + '.join(['%.3f*"%s"' % v for v in self.show_topic(topicno, topn)])
return ' + '.join(['%.3f*"%s"' % (v, k) for k, v in self.show_topic(topicno, topn)])

def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
"""
Return `num_topics` most significant topics (return all by default).
For each topic, show `num_words` most significant words (10 words by default).

The topics are returned as a list -- a list of strings if `formatted` is
True, or a list of (weight, word) 2-tuples if False.
True, or a list of `(word, probability)` 2-tuples if False.

If `log` is True, also output this result to log.

Expand All @@ -517,7 +517,7 @@ def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
topic = self.print_topic(i, topn=num_words)
else:
topic = self.show_topic(i, topn=num_words)
shown.append(topic)
shown.append((i, topic))
if log:
logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
return shown
Expand Down
71 changes: 71 additions & 0 deletions gensim/test/test_hdpmodel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""


import logging
import unittest
import os
import os.path
import tempfile

import six
import numpy
import scipy.linalg

from gensim.corpora import mmcorpus, Dictionary
from gensim.models import hdpmodel
from gensim import matutils


# sample data files are located next to this module, in `test_data/`
module_path = os.path.dirname(__file__)


def datapath(fname):
    """Return the path of sample data file `fname` inside the `test_data` folder."""
    return os.path.join(module_path, 'test_data', fname)


# toy documents used in testing ("Deerwester" from the web tutorial)
texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
]
# token <-> id mapping built from the toy documents
dictionary = Dictionary(texts)
# one `doc2bow` result per document, in the same order as `texts`
corpus = list(map(dictionary.doc2bow, texts))


def testfile():
    """Return the path, inside the system temp directory, where temporary test data is stored."""
    tmp_dir = tempfile.gettempdir()
    return os.path.join(tmp_dir, 'gensim_models.tst')



class TestHdpModel(unittest.TestCase):
    """Checks the layout of the value returned by `HdpModel.show_topics`."""

    def setUp(self):
        """Load the sample corpus and build a model from the module-level `corpus`."""
        self.class_ = hdpmodel.HdpModel
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model = self.class_(corpus, id2word=dictionary)

    def testShowTopics(self):
        """Unformatted topics must be `(int, [(string, float), ...])` pairs."""
        for entry in self.model.show_topics(formatted=False):
            topic_no, topic = entry
            self.assertTrue(isinstance(topic_no, int))
            self.assertTrue(isinstance(topic, list))
            for pair in topic:
                word, weight = pair
                self.assertTrue(isinstance(word, six.string_types))
                self.assertTrue(isinstance(weight, float))



if __name__ == '__main__':
    # verbose logging when the tests are run as a script
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    unittest.main()
134 changes: 134 additions & 0 deletions gensim/test/test_ldamallet_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""


import logging
import unittest
import os
import os.path
import tempfile

import six
import numpy
import scipy.linalg

from gensim.corpora import mmcorpus, Dictionary
from gensim.models.wrappers import ldamallet
from gensim import matutils


# sample data files live in the `test_data` folder beside this module
module_path = os.path.dirname(__file__)


def datapath(fname):
    """Build the full path to `fname` within the `test_data` folder."""
    return os.path.join(module_path, 'test_data', fname)


# toy documents used in testing ("Deerwester" from the web tutorial)
texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
]
# token <-> id mapping built from the toy documents
dictionary = Dictionary(texts)
# one `doc2bow` result per document, in the same order as `texts`
corpus = list(map(dictionary.doc2bow, texts))


def testfile():
    """Return the location in the system temp directory used for temporary test data."""
    scratch_dir = tempfile.gettempdir()
    return os.path.join(scratch_dir, 'gensim_models.tst')


class TestLdaMallet(unittest.TestCase):
    """Tests for the `ldamallet.LdaMallet` wrapper.

    The wrapper is given the path to the `mallet` executable, which is derived
    from the `MALLET_HOME` environment variable. When that variable is not
    set, every test returns immediately (and is therefore reported as passed).
    """

    def setUp(self):
        """Load the sample corpus and work out where the `mallet` executable is."""
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        mallet_home = os.environ.get('MALLET_HOME', None)
        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None

    def testTransform(self):
        """Transforming the first document must give the expected topic weights."""
        if not self.mallet_path:
            return
        passed = False
        for i in range(5):  # restart at most 5 times
            # create the transformation model
            model = ldamallet.LdaMallet(self.mallet_path, corpus, id2word=dictionary, num_topics=2, iterations=200)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
            expected = [0.49, 0.51]
            # must contain the same values, up to re-ordering
            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2)
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)",
                            i, sorted(vec), sorted(expected))
        self.assertTrue(passed)

    def testPersistence(self):
        """A saved and re-loaded model must match the original."""
        if not self.mallet_path:
            return
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testPersistenceCompressed(self):
        """Same as `testPersistence`, but saving to a `.gz` file name."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)
        model.save(fname)
        model2 = ldamallet.LdaMallet.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmap(self):
        """Arrays stored separately must come back as `numpy.memmap` when loaded with mmap."""
        if not self.mallet_path:
            return
        fname = testfile()
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap; load through the wrapper
        # class itself -- this module imports `ldamallet`, not `ldamodel`
        model2 = ldamallet.LdaMallet.load(fname, mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(isinstance(model2.wordtopics, numpy.memmap))
        self.assertTrue(numpy.allclose(model.wordtopics, model2.wordtopics))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def testLargeMmapCompressed(self):
        """Loading a `.gz` model with mmap is expected to raise IOError."""
        if not self.mallet_path:
            return
        fname = testfile() + '.gz'
        model = ldamallet.LdaMallet(self.mallet_path, self.corpus, num_topics=2, iterations=100)

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap; load through the wrapper
        # class itself -- this module imports `ldamallet`, not `ldamodel`
        self.assertRaises(IOError, ldamallet.LdaMallet.load, fname, mmap='r')
#endclass TestLdaMallet

if __name__ == '__main__':
    # verbose logging when the tests are run as a script
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    unittest.main()
Loading