Fix documentation for gensim.corpora. Partial fix #1671 (#1729)

* Fix typo * Make `save_corpus` private * Annotate `bleicorpus.py` * Make __save_corpus weakly private * Fix _save_corpus in tests * Fix _save_corpus[2] * Document bleicorpus in Numpy style * Document indexedcorpus * Annotate csvcorpus * Add "Yields" section * Make `_save_corpus` public * Annotate bleicorpus * Fix indentation in bleicorpus * `_save_corpus` -> `save_corpus` * Annotate bleicorpus * Convert dictionary docs to numpy style * Convert hashdictionary docs to numpy style * Convert indexedcorpus docs to numpy style * Convert lowcorpus docs to numpy style * Convert malletcorpus docs to numpy style * Convert mmcorpus docs to numpy style * Convert sharded_corpus docs to numpy style * Convert svmlightcorpus docs to numpy style * Convert textcorpus docs to numpy style * Convert ucicorpus docs to numpy style * Convert wikicorpus docs to numpy style * Add sphinx tweaks * Remove trailing whitespaces * Annotate wikicorpus * SVMLight Corpus annotated * Fix TODO * Fix grammar mistake * Undo changes to dictionary * Undo changes to hashdictionary * Document indexedcorpus * Document indexedcorpus[2] Fix identation * Remove redundant files * Add more dots. :) * Fix monospace * remove useless method * fix bleicorpus * fix csvcorpus * fix indexedcorpus * fix svmlightcorpus * fix wikicorpus[1] * fix wikicorpus[2] * fix wikicorpus[3] * fix review comments
piskvorky · Jan 22, 2018 · c5f487d · c5f487d
1 parent 74dae4d
commit c5f487d
Show file tree

Hide file tree

Showing 5 changed files with 553 additions and 202 deletions.
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -5,9 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Blei's LDA-C format.
-"""
+"""Corpus in Blei's LDA-C format."""
 
 from __future__ import with_statement
 
@@ -19,30 +17,44 @@
 from six.moves import xrange
 
 
-logger = logging.getLogger('gensim.corpora.bleicorpus')
+logger = logging.getLogger(__name__)
 
 
 class BleiCorpus(IndexedCorpus):
-    """
-    Corpus in Blei's LDA-C format.
+    """Corpus in Blei's LDA-C format.
 
     The corpus is represented as two files: one describing the documents, and another
     describing the mapping between words and their ids.
 
     Each document is one line::
 
-      N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
+        N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
+
+
+    The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`.
 
-    The vocabulary is a file with words, one word per line; word at line K has an
-    implicit ``id=K``.
     """
 
     def __init__(self, fname, fname_vocab=None):
         """
-        Initialize the corpus from a file.
 
-        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
-        `fname.vocab`.
+        Parameters
+        ----------
+        fname : str
+            Path to corpus.
+        fname_vocab : str, optional
+            Vocabulary file. If `fname_vocab` is None, one of the following variants is searched for:
+
+            * `fname`.vocab
+            * `fname`/vocab.txt
+            * `fname_without_ext`.vocab
+            * `fname_folder`/vocab.txt
+
+        Raises
+        ------
+        IOError
+            If vocabulary file doesn't exist.
+
         """
         IndexedCorpus.__init__(self, fname)
         logger.info("loading corpus from %s", fname)
@@ -67,8 +79,13 @@ def __init__(self, fname, fname_vocab=None):
         self.id2word = dict(enumerate(words))
 
     def __iter__(self):
-        """
-        Iterate over the corpus, returning one sparse vector at a time.
+        """Iterate over the corpus, returning one sparse (BoW) vector at a time.
+
+        Yields
+        ------
+        list of (int, float)
+            Document's BoW representation.
+
         """
         lineno = -1
         with utils.smart_open(self.fname) as fin:
@@ -77,6 +94,19 @@ def __iter__(self):
         self.length = lineno + 1
 
     def line2doc(self, line):
+        """Convert line in Blei LDA-C format to document (BoW representation).
+
+        Parameters
+        ----------
+        line : str
+            Line in Blei's LDA-C format.
+
+        Returns
+        -------
+        list of (int, float)
+            Document's BoW representation.
+
+        """
         parts = utils.to_unicode(line).split()
         if int(parts[0]) != len(parts) - 1:
             raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
@@ -86,14 +116,28 @@ def line2doc(self, line):
 
     @staticmethod
     def save_corpus(fname, corpus, id2word=None, metadata=False):
-        """
-        Save a corpus in the LDA-C format.
-
-        There are actually two files saved: `fname` and `fname.vocab`, where
-        `fname.vocab` is the vocabulary file.
+        """Save a corpus in the LDA-C format.
+
+        Notes
+        -----
+        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.
+
+        Parameters
+        ----------
+        fname : str
+            Path to output file.
+        corpus : iterable of iterable of (int, float)
+            Input corpus in BoW format.
+        id2word : dict of (str, str), optional
+            Mapping id -> word for `corpus`.
+        metadata : bool, optional
+            THIS PARAMETER WILL BE IGNORED.
+
+        Returns
+        -------
+        list of int
+            Offsets for each line in file (in bytes).
 
-        This function is automatically called by `BleiCorpus.serialize`; don't
-        call it directly, call `serialize` instead.
         """
         if id2word is None:
             logger.info("no word id mapping provided; initializing from corpus")
@@ -121,8 +165,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         return offsets
 
     def docbyoffset(self, offset):
-        """
-        Return the document stored at file position `offset`.
+        """Get document corresponding to `offset`.
+        The offset can be obtained from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`.
+
+        Parameters
+        ----------
+        offset : int
+            Position of the document in the file (in bytes).
+
+        Returns
+        -------
+        list of (int, float)
+            Document in BoW format.
+
         """
         with utils.smart_open(self.fname) as f:
             f.seek(offset)

diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py
@@ -4,10 +4,7 @@
 # Copyright (C) 2013 Zygmunt Zając <[email protected]>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Corpus in CSV format.
-
-"""
+"""Corpus in CSV format."""
 
 
 from __future__ import with_statement
@@ -18,22 +15,28 @@
 
 from gensim import interfaces, utils
 
-logger = logging.getLogger('gensim.corpora.csvcorpus')
+logger = logging.getLogger(__name__)
 
 
 class CsvCorpus(interfaces.CorpusABC):
-    """
-    Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
-    based on the file content.
+    """Corpus in CSV format.
 
+    Notes
+    -----
+    The CSV delimiter, headers etc. are guessed automatically based on the file content.
     All row values are expected to be ints/floats.
 
     """
 
     def __init__(self, fname, labels):
         """
-        Initialize the corpus from a file.
-        `labels` = are class labels present in the input file? => skip the first column
+
+        Parameters
+        ----------
+        fname : str
+            Path to corpus.
+        labels : bool
+            If True, ignore the first column (class labels).
 
         """
         logger.info("loading corpus from %s", fname)
@@ -48,8 +51,12 @@ def __init__(self, fname, labels):
         logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
 
     def __iter__(self):
-        """
-        Iterate over the corpus, returning one sparse vector at a time.
+        """Iterate over the corpus, returning one BoW vector at a time.
+
+        Yields
+        ------
+        list of (int, float)
+            Document in BoW format.
 
         """
         reader = csv.reader(utils.smart_open(self.fname), self.dialect)