Merge remote-tracking branch 'upstream/develop' into develop
droudy committed Jun 24, 2016
2 parents f6e5771 + 225fa67 commit 35a45e4
Showing 9 changed files with 170 additions and 127 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
Changes
=======
0.13.2

* `topics` and `topn` parameters renamed to `num_topics` and `num_words` in show_topics() and print_topics() (@droudy, #747)
- In hdpmodel and dtmmodel
- NOT BACKWARDS COMPATIBLE!
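Why this is a breaking change can be illustrated with a small pure-Python stand-in (the `show_topics` below is a hypothetical sketch, not gensim code): positional calls survive a parameter rename, but calls spelled with the old keyword names do not.

```python
# Hypothetical stand-in mirroring the renamed signature; not gensim itself.
def show_topics(num_topics=10, num_words=10):
    return (num_topics, num_words)

# Positional calls are unaffected by the rename:
assert show_topics(5, 3) == (5, 3)

# Keyword calls spelled with the old names now raise TypeError:
try:
    show_topics(topics=5, topn=3)
except TypeError:
    pass  # old keyword names are gone -- the breaking part of the change
else:
    raise AssertionError("expected TypeError")
```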

0.13.1, 2016-06-22

2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -2,7 +2,7 @@ recursive-include docs *
recursive-include gensim/test/test_data *
recursive-include . *.sh
prune docs/src*
-include README.rst
+include README.md
include CHANGELOG.md
include COPYING
include COPYING.LESSER
149 changes: 149 additions & 0 deletions README.md
@@ -0,0 +1,149 @@
gensim – Topic Modelling in Python
==================================

![Travis] ![Wheel]

Gensim is a Python library for *topic modelling*, *document indexing*
and *similarity retrieval* with large corpora. Target audience is the
*natural language processing* (NLP) and *information retrieval* (IR)
community.

Features
--------

- All algorithms are **memory-independent** w.r.t. the corpus size
(can process input larger than RAM, streamed, out-of-core),
- **Intuitive interfaces**
- easy to plug in your own input corpus/datastream (trivial
streaming API)
- easy to extend with other Vector Space algorithms (trivial
transformation API)
- Efficient multicore implementations of popular algorithms, such as
online **Latent Semantic Analysis (LSA/LSI/SVD)**, **Latent
Dirichlet Allocation (LDA)**, **Random Projections (RP)**,
**Hierarchical Dirichlet Process (HDP)** or **word2vec deep
learning**.
- **Distributed computing**: can run *Latent Semantic Analysis* and
*Latent Dirichlet Allocation* on a cluster of computers.
- Extensive [documentation and Jupyter Notebook tutorials].

If this feature list left you scratching your head, you can first read
more about the [Vector Space Model] and [unsupervised document analysis]
on Wikipedia.

Installation
------------

This software depends on [NumPy and Scipy], two Python packages for
scientific computing. You must have them installed prior to installing
gensim.

It is also recommended you install a fast BLAS library before installing
NumPy. This is optional, but using an optimized BLAS such as [ATLAS] or
[OpenBLAS] is known to improve performance by as much as an order of
magnitude. On OS X, NumPy picks up the BLAS that comes with it
automatically, so you don’t need to do anything special.

The simple way to install gensim is:

pip install -U gensim

Or, if you have instead downloaded and unzipped the [source tar.gz]
package, you’d run:

python setup.py test
python setup.py install

For alternative modes of installation (without root privileges,
development installation, optional install features), see the
[documentation].

This version has been tested under Python 2.6, 2.7, 3.3, 3.4 and 3.5
(support for Python 2.5 was dropped in gensim 0.10.0; install gensim
0.9.1 if you *must* use Python 2.5). Gensim’s github repo is hooked
against [Travis CI for automated testing] on every commit push and pull
request.

How come gensim is so fast and memory efficient? Isn’t it pure Python, and isn’t Python slow and greedy?
--------------------------------------------------------------------------------------------------------

Many scientific algorithms can be expressed in terms of large matrix
operations (see the BLAS note above). Gensim taps into these low-level
BLAS libraries, by means of its dependency on NumPy. So while
gensim-the-top-level-code is pure Python, it actually executes highly
optimized Fortran/C under the hood, including multithreading (if your
BLAS is so configured).
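One quick way to see which BLAS your NumPy build actually links against (an optional aside, not part of the README):

```python
# Print the BLAS/LAPACK build information NumPy was compiled against;
# this is the low-level library gensim's matrix operations end up calling.
import numpy

numpy.show_config()
```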

Memory-wise, gensim makes heavy use of Python’s built-in generators and
iterators for streamed data processing. Memory efficiency was one of
gensim’s [design goals], and is a central feature of gensim, rather than
something bolted on as an afterthought.
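The streamed-iteration idea can be sketched in plain Python; `MyCorpus` here is an illustrative toy, not a gensim class, but gensim's own corpus classes follow the same one-document-per-`yield` pattern.

```python
# Toy corpus that streams documents one at a time instead of loading them
# all into memory; in practice self.lines could be an open file handle.
class MyCorpus:
    def __init__(self, lines):
        self.lines = lines

    def __iter__(self):
        for line in self.lines:
            yield line.lower().split()  # one tokenized document per yield

corpus = MyCorpus(["Human machine interface", "Graph minors survey"])
print([doc for doc in corpus])
# [['human', 'machine', 'interface'], ['graph', 'minors', 'survey']]
```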

Documentation
-------------

- [QuickStart]
- [Tutorials]
- [Tutorial Videos]
- [Official Documentation and Walkthrough]

[QuickStart]: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/gensim%20Quick%20Start.ipynb
[Tutorials]: https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials
[Tutorial Videos]: https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#videos
[Official Documentation and Walkthrough]: http://radimrehurek.com/gensim/

---------

Adopters
--------



| Name | Logo | URL | Description |
|----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| RaRe Technologies | <img src="http://rare-technologies.com/wp-content/uploads/2016/02/rare_image_only.png" width="100"> | [rare-technologies.com](http://rare-technologies.com) | Machine Learning NLP consulting and training |
| Talentpair | ![Talentpair](https://avatars3.githubusercontent.com/u/8418395?v=3&s=100) | [talentpair.com](http://talentpair.com) | Data science driving high-touch recruiting |



Citing gensim
------------

When [citing gensim in academic papers and theses], please use this
BibTeX entry:

@inproceedings{rehurek_lrec,
title = {{Software Framework for Topic Modelling with Large Corpora}},
author = {Radim {\v R}eh{\r u}{\v r}ek and Petr Sojka},
booktitle = {{Proceedings of the LREC 2010 Workshop on New
Challenges for NLP Frameworks}},
pages = {45--50},
year = 2010,
month = May,
day = 22,
publisher = {ELRA},
address = {Valletta, Malta},
note={\url{http://is.muni.cz/publication/884893/en}},
language={English}
}

[citing gensim in academic papers and theses]: https://scholar.google.cz/citations?view_op=view_citation&hl=en&user=9vG_kV0AAAAJ&citation_for_view=9vG_kV0AAAAJ:u-x6o8ySG0sC

[Travis CI for automated testing]: https://travis-ci.org/RaRe-Technologies/gensim
[design goals]: http://radimrehurek.com/gensim/about.html
[RaRe Technologies]: http://rare-technologies.com/wp-content/uploads/2016/02/rare_image_only.png%20=10x20
[rare\_tech]: //rare-technologies.com
[Talentpair]: https://avatars3.githubusercontent.com/u/8418395?v=3&s=100

[Travis]: https://img.shields.io/travis/RaRe-Technologies/gensim/develop.svg
[Wheel]: https://img.shields.io/pypi/wheel/gensim.svg
[documentation and Jupyter Notebook tutorials]: https://github.com/RaRe-Technologies/gensim/#documentation
[Vector Space Model]: http://en.wikipedia.org/wiki/Vector_space_model
[unsupervised document analysis]: http://en.wikipedia.org/wiki/Latent_semantic_indexing
[NumPy and Scipy]: http://www.scipy.org/Download
[ATLAS]: http://math-atlas.sourceforge.net/
[OpenBLAS]: http://xianyi.github.io/OpenBLAS/
[source tar.gz]: http://pypi.python.org/pypi/gensim
[documentation]: http://radimrehurek.com/gensim/install.html
108 changes: 0 additions & 108 deletions README.rst

This file was deleted.

5 changes: 3 additions & 2 deletions gensim/models/hdpmodel.py
@@ -586,9 +586,10 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
if num_topics < 0:
num_topics = len(self.data)

-        num_topics = min(topics, len(self.data))
-
-        for k in xrange(topics):
+        num_topics = min(num_topics, len(self.data))
+
+        for k in xrange(num_topics):
lambdak = list(self.data[k, :])
lambdak = lambdak / sum(lambdak)

18 changes: 9 additions & 9 deletions gensim/models/wrappers/dtmmodel.py
@@ -235,23 +235,23 @@ def train(self, corpus, time_slices, mode, model):
# influence[2,5] influence of document 2 on topic 5
self.influences_time.append(influence)

-    def print_topics(self, topics=10, times=5, topn=10):
-        return self.show_topics(topics, times, topn, log=True)
+    def print_topics(self, num_topics=10, times=5, num_words=10):
+        return self.show_topics(num_topics, times, num_words, log=True)

-    def show_topics(self, topics=10, times=5, topn=10, log=False, formatted=True):
+    def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted=True):
"""
-        Print the `topn` most probable words for `topics` number of topics at 'times' time slices.
+        Print the `num_words` most probable words for `num_topics` number of topics at 'times' time slices.
Set `topics=-1` to print all topics.
Set `formatted=True` to return the topics as a list of strings, or `False` as lists of (weight, word) pairs.
"""
-        if topics < 0 or topics >= self.num_topics:
-            topics = self.num_topics
-            chosen_topics = range(topics)
+        if num_topics < 0 or num_topics >= self.num_topics:
+            num_topics = self.num_topics
+            chosen_topics = range(num_topics)
         else:
-            topics = min(topics, self.num_topics)
-            chosen_topics = range(topics)
+            num_topics = min(num_topics, self.num_topics)
+            chosen_topics = range(num_topics)
# add a little random jitter, to randomize results around the same
# alpha
# sort_alpha = self.alpha + 0.0001 * \
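The topic-count selection logic in the hunk above can be isolated into a standalone sketch (`choose_topics` is a hypothetical helper written for illustration, not part of gensim): a negative count, or one at least as large as the total, means "all topics"; anything else is clamped to the total.

```python
# Mirror of the renamed show_topics() selection logic: -1 (or any value
# >= the total) selects every topic; otherwise clamp to the total count.
def choose_topics(num_topics, total_topics):
    if num_topics < 0 or num_topics >= total_topics:
        num_topics = total_topics
    else:
        num_topics = min(num_topics, total_topics)
    return list(range(num_topics))

assert choose_topics(-1, 4) == [0, 1, 2, 3]
assert choose_topics(2, 4) == [0, 1]
```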
2 changes: 1 addition & 1 deletion gensim/test/test_hdpmodel.py
@@ -55,7 +55,7 @@ def setUp(self):
self.model = self.class_(corpus, id2word=dictionary)

def testShowTopics(self):
-        topics = self.model.show_topics(formatted=False)
+        topics = self.model.show_topics(formatted=False, num_topics=20, num_words=20)

for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
6 changes: 1 addition & 5 deletions gensim/test/test_ldamodel.py
@@ -47,10 +47,6 @@ def testfile():
# temporary data will be stored to this file
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')

-def testRandomState():
-    testcases = [numpy.random.seed(0), None, numpy.random.RandomState(0), 0]
-    for testcase in testcases:
-        assert(isinstance(ldamodel.check_random_state(testcase), numpy.random.RandomState))

class TestLdaModel(unittest.TestCase):
def setUp(self):
@@ -97,7 +93,7 @@ def testAlpha(self):
expected_shape = (2,)

# should not raise anything
        self.class_(**kwargs)

kwargs['alpha'] = 'symmetric'
model = self.class_(**kwargs)
2 changes: 1 addition & 1 deletion setup.py
@@ -125,7 +125,7 @@ def readfile(fname):
name='gensim',
version='0.13.1',
description='Python framework for fast Vector Space Modelling',
-    long_description=readfile('README.rst'),
+    long_description=readfile('README.md'),

ext_modules=[
Extension('gensim.models.word2vec_inner',
