Fix docstrings for gensim.sklearn_api. Fix #1667 #1895

Merged
52 commits merged, Mar 15, 2018

Changes from 4 commits

Commits
4cee8fa
fixed docstring for `sklearn_api.lsimodel`
steremma Feb 10, 2018
ab0303c
removed duplicated comment
steremma Feb 10, 2018
4dc001f
Fixed docstring for `sklearn_api.text2bow`
steremma Feb 10, 2018
69faf41
Fixed docstrings for `sklearn_api.phrases`
steremma Feb 10, 2018
5052dfb
Applied code review corrections in sklearn wrappers for:
steremma Feb 12, 2018
c027203
constructor docstrings now only mention the type of each argument. Fo…
steremma Feb 12, 2018
3815605
Brought back parameter explanation in the wrappers for easier lookup
steremma Feb 13, 2018
c1e05df
added examples to __doc__, work still in progress
steremma Feb 15, 2018
4cfbf5c
added simple and executable examples to `__doc__`
steremma Feb 15, 2018
f2615ef
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Feb 19, 2018
3581a46
temp work on some more wrappers
steremma Feb 19, 2018
8ef1105
finished docstrings for LDA wrapper, examples pending
steremma Feb 19, 2018
add7420
finished doc2vec wrapper with example
steremma Feb 20, 2018
38a610f
completed LDA wrapper including example
steremma Feb 20, 2018
5f00f34
finished the tfidf wrapper including example
steremma Feb 20, 2018
1d8c63c
PEP-8 corrections
steremma Feb 20, 2018
f8fffd6
w2v documentation - example result pending
steremma Feb 21, 2018
c866af0
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 21, 2018
3cf28a3
fixed w2v example
steremma Feb 21, 2018
b55a2a2
added documentation for the lda sequential model - examples pending
steremma Feb 22, 2018
6c1aeb8
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Feb 24, 2018
b0600cd
added documentation for the author topic sklearn wrapper including ex…
steremma Feb 24, 2018
e2ca72f
improved example by presenting a way to get a pipeline score
steremma Feb 24, 2018
f66abbb
improved example using similarities
steremma Feb 24, 2018
e4dc868
added documentation and examples for the rp and hdp models
steremma Feb 24, 2018
8df7ce5
minor example improvements
steremma Feb 25, 2018
dc33b91
fixed reference
steremma Feb 25, 2018
836af6f
removed reference
steremma Feb 25, 2018
4a3ce08
fix doc building
menshikh-iv Feb 27, 2018
ef5d7ab
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 27, 2018
4285741
unidented examples and fixed paper references
steremma Feb 27, 2018
2f02cfe
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 28, 2018
0c56ae9
finalized ldaseq wrapper
steremma Feb 28, 2018
64f8d4f
fix __init__
menshikh-iv Mar 13, 2018
9b4c375
Merge remote-tracking branch 'upstream/develop' into sklearn-api-docs
menshikh-iv Mar 13, 2018
7a204e1
resolve merge-conflict with pivot norm
menshikh-iv Mar 13, 2018
39bbe31
fix atmodel
menshikh-iv Mar 15, 2018
20ea33e
fix atmodel[2]
menshikh-iv Mar 15, 2018
31fb94e
fix d2vmodel
menshikh-iv Mar 15, 2018
4432b77
fix hdp + small fixes
menshikh-iv Mar 15, 2018
e729a26
fix ldamodel + small fixes
menshikh-iv Mar 15, 2018
14fcf22
small fixes
menshikh-iv Mar 15, 2018
07a8cba
fix ldaseqmodel
menshikh-iv Mar 15, 2018
5325d05
small fixes (again)
menshikh-iv Mar 15, 2018
b250ca4
fix lsimodel
menshikh-iv Mar 15, 2018
3fc3bef
fix phrases
menshikh-iv Mar 15, 2018
dc9f659
fix rpmodel
menshikh-iv Mar 15, 2018
4ec4619
fix text2bow
menshikh-iv Mar 15, 2018
36a263a
fix tfidf
menshikh-iv Mar 15, 2018
ae4a5b4
fix word2vec
menshikh-iv Mar 15, 2018
0ad6580
cleanup
menshikh-iv Mar 15, 2018
8a45bef
cleanup[2]
menshikh-iv Mar 15, 2018
84 changes: 65 additions & 19 deletions gensim/sklearn_api/lsimodel.py
@@ -5,11 +5,6 @@
# Copyright (C) 2017 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Contributor: Some __doc__ definitely needed

Follows scikit-learn API conventions
"""

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
@@ -20,14 +15,36 @@


class LsiTransformer(TransformerMixin, BaseEstimator):
"""
Base LSI module
"""Base LSI module.

Scikit learn interface for `gensim.models.lsimodel` for easy use of gensim with scikit-learn.
Contributor: please use links

:class:`~gensim.model.lsimodel.LsiModel`

here and everywhere

Contributor: Also, explicit mention "if you want to read more about it, please look into original class :class:..."

Follows scikit-learn API conventions.

"""

def __init__(self, num_topics=200, id2word=None, chunksize=20000,
decay=1.0, onepass=True, power_iters=2, extra_samples=100):
"""
Sklearn wrapper for LSI model. See gensim.model.LsiModel for parameter details.
"""Sklearn wrapper for LSI model.

Parameters
----------
num_topics : int, optional
Contributor: Wdyt about the link to original method only (for avoiding duplication)?

Contributor Author: I also thought about that and I am not sure what is better. On one hand we now have duplication, but on the other hand it's easier for the developer and user to see the documentation in one tab. Because not all parameters are propagated to the inner model, some of the parameters would be visible in the wrapper and some in the original model (you would need 2 tabs open). I am a bit in favor of duplicating, but not 100% sure, so if you prefer I will remove the duplication.

Contributor: So, maybe combine both approaches: mention the parameter & type here, but for the description - send the user to the parameter of the original class?

Contributor Author: Ok, that sounds reasonable, I will apply asap.

Contributor: @steremma we discussed this question again and this isn't a good idea, because it's OK if the user looks into the documentation online (and has a link), but if the user uses python/jupyter, he will call something like help(model) or model? and in this case links don't work :( (and this is the main problem). For this reason - can you return the descriptions for the parameters? Copy-paste is the lesser evil than a docstring that exists but is useless if you can't read it in your interpreter.

Also, the link to the original class must be in any case too.

Number of requested factors (latent dimensions).
id2word : dict of {int: str}, optional
ID to word mapping.
chunksize : int, optional
Number of documents to be used in each training chunk.
decay : float, optional
Weight of existing observations relative to new ones.
onepass : bool, optional
Whether the one-pass algorithm should be used for training.
Pass `False` to force a multi-pass stochastic algorithm.
power_iters : int, optional
Number of power iteration steps to be used.
Increasing the number of power iterations improves accuracy, but lowers performance.
extra_samples : int, optional
Extra samples to be used besides the rank `k`. Can improve accuracy.

"""
self.gensim_model = None
self.num_topics = num_topics
@@ -42,6 +59,17 @@ def fit(self, X, y=None):
"""
Fit the model according to the given training data.
Calls gensim.models.LsiModel.

Parameters
----------
X : iterable of iterable of (int, float)
Stream of document vectors or sparse matrix of shape: [num_terms, num_documents].

Returns
-------
LsiTransformer
The trained model.

"""
if sparse.issparse(X):
corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
@@ -55,14 +83,18 @@
return self

def transform(self, docs):
"""
Takes a list of documents as input ('docs').
Returns a matrix of topic distribution for the given document bow, where a_ij
indicates (topic_i, topic_probability_j).
The input `docs` should be in BOW format and can be a list of documents like
[[(4, 1), (7, 1)],
[(9, 1), (13, 1)], [(2, 1), (6, 1)]]
or a single document like : [(4, 1), (7, 1)]
"""Computes the topic distribution matrix

Parameters
----------
docs : iterable of iterable of (int, float)
Stream of document vectors or sparse matrix of shape: [num_terms, num_documents].

Returns
-------
numpy.ndarray of shape [num_docs, num_topics]
Topic distribution matrix.

"""
if self.gensim_model is None:
raise NotFittedError(
@@ -78,8 +110,22 @@ def transform(self, docs):
return np.reshape(np.array(distribution), (len(docs), self.num_topics))

def partial_fit(self, X):
"""
Train model over X.
"""Train model over a potentially incomplete set of documents.

This method can be used in two ways:
1. On an unfitted model in which case the model is initialized and trained on `X`.
2. On an already fitted model in which case the model is **further** trained on `X`.

Parameters
----------
X : iterable of iterable of (int, float)
Stream of document vectors or sparse matrix of shape: [num_terms, num_documents].

Returns
-------
LsiTransformer
The trained model.

"""
if sparse.issparse(X):
X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
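For quick reference, a minimal usage sketch of the LsiTransformer documented above (a sketch only: it assumes gensim 3.x, where gensim.sklearn_api ships with the library, and the toy corpus, topic count and resulting shapes are illustrative):

from gensim.corpora import Dictionary
from gensim.sklearn_api import LsiTransformer

# Toy corpus converted to the BOW format that fit/transform expect.
texts = [
    ["human", "computer", "interface"],
    ["graph", "trees", "minors"],
    ["graph", "minors", "survey"],
]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

model = LsiTransformer(num_topics=2, id2word=id2word)
model.fit(corpus)                  # train on the full corpus ...
vectors = model.transform(corpus)  # ... then project it: ndarray of shape (3, 2)

# partial_fit trains incrementally, e.g. when documents arrive in batches.
model.partial_fit([id2word.doc2bow(["human", "graph", "survey"])])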
105 changes: 97 additions & 8 deletions gensim/sklearn_api/phrases.py
@@ -17,14 +17,62 @@


class PhrasesTransformer(TransformerMixin, BaseEstimator):
"""
Base Phrases module
"""Base Phrases module

Scikit learn interface for `gensim.models.phrases` for easy use of gensim with scikit-learn.
Follows scikit-learn API conventions.

"""

def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
delimiter=b'_', progress_per=10000, scoring='default'):
"""
Sklearn wrapper for Phrases model.
"""Sklearn wrapper for Phrases model.

Parameters
----------
min_count : int
Terms with a count lower than this will be ignored.
threshold : float
Only phrases scoring above this will be accepted, see `scoring` below.
max_vocab_size : int
Maximum size of the vocabulary.
Used to control pruning of less common words, to keep memory under control.
The default of 40M needs about 3.6GB of RAM.
delimiter : str
Character used to join collocation tokens. Should be a byte string (e.g. b'_').
progress_per : int
Training will report to the logger every time that many phrases have been learned.
scoring : str or callable
Specifies how potential phrases are scored for comparison to the `threshold`
setting. `scoring` can be set with either a string that refers to a built-in scoring function,
or with a function with the expected parameter names. Two built-in scoring functions are available
by setting `scoring` to a string:

'default': from [1]_.
'npmi': normalized pointwise mutual information, from [2]_.

'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default.

To use a custom scoring function, create a function with the following parameters and set the `scoring`
parameter to the custom function. The function must accept all of these parameters, even if it
does not use them all.

worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
bigram_count: number of occurrences in `sentences` of the phrase being scored
len_vocab: the number of unique tokens in `sentences`
min_count: the `min_count` setting of the Phrases class
corpus_word_count: the total number of (non-unique) tokens in `sentences`

A scoring function missing any of these parameters (even if the parameters are not used) will
raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable.

References
----------
.. [1] "Efficient Estimaton of Word Representations in Vector Space" by Mikolov, et. al.
.. [2] "Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma.

"""
self.gensim_model = None
self.min_count = min_count
@@ -35,8 +83,18 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
self.scoring = scoring

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
"""Fit the model according to the given training data.

Parameters
----------
X : iterable of list of str
Sequence of sentences to be used for training the model.

Returns
-------
PhrasesTransformer
The trained model.

"""
self.gensim_model = models.Phrases(
sentences=X, min_count=self.min_count, threshold=self.threshold,
@@ -46,9 +104,22 @@
return self

def transform(self, docs):
"""Transform the input documents into phrase tokens.

Words forming a detected phrase will be joined by the `delimiter` character (`_` by default).

Parameters
----------
docs : iterable of list of str
Sequence of sentences to be transformed.

Returns
-------
list of list of str
Phrase representation for each of the input sentences.

"""
Return the input documents to return phrase tokens.
"""

if self.gensim_model is None:
raise NotFittedError(
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
@@ -60,6 +131,24 @@ def transform(self, docs):
return [self.gensim_model[doc] for doc in docs]

def partial_fit(self, X):
"""Train model over a potentially incomplete set of sentences.

This method can be used in two ways:
1. On an unfitted model in which case the model is initialized and trained on `X`.
2. On an already fitted model in which case the X sentences are **added** to the vocabulary.

Parameters
----------
X : iterable of list of str
Sequence of sentences to be used for training the model.

Returns
-------
PhrasesTransformer
The trained model.

"""

if self.gensim_model is None:
self.gensim_model = models.Phrases(
sentences=X, min_count=self.min_count, threshold=self.threshold,
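A minimal usage sketch for the PhrasesTransformer documented above, including a custom scoring function with the six-parameter signature described in the docstring (a sketch only: the tiny corpus, the lowered thresholds and the detected bigram are illustrative, not guaranteed output):

from gensim.sklearn_api.phrases import PhrasesTransformer

sentences = [
    ["new", "york", "is", "big"],
    ["new", "york", "was", "founded", "long", "ago"],
    ["i", "love", "new", "york"],
]

# The defaults (min_count=5, threshold=10.0) would reject everything
# in a corpus this small, so both are lowered for the example.
model = PhrasesTransformer(min_count=1, threshold=0.5)
model.fit(sentences)
print(model.transform([["new", "york", "is", "big"]]))
# e.g. [['new_york', 'is', 'big']]

# A custom scorer must accept all six parameters, even unused ones,
# and must be pickleable.
def frequency_scorer(worda_count, wordb_count, bigram_count,
                     len_vocab, min_count, corpus_word_count):
    return bigram_count  # rank bigrams by raw corpus frequency

custom = PhrasesTransformer(min_count=1, threshold=1.0, scoring=frequency_scorer)
custom.fit(sentences)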
68 changes: 55 additions & 13 deletions gensim/sklearn_api/text2bow.py
@@ -4,11 +4,6 @@
# Copyright (C) 2011 Radim Rehurek <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""

from six import string_types
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
@@ -18,29 +13,59 @@


class Text2BowTransformer(TransformerMixin, BaseEstimator):
"""
Base Text2Bow module
"""Base Text2Bow module

Scikit learn interface for `gensim.corpora.Dictionary` for easy use of gensim with scikit-learn.
Follows scikit-learn API conventions.

"""

def __init__(self, prune_at=2000000, tokenizer=tokenize):
"""
Sklearn wrapper for Text2Bow model.
"""Sklearn wrapper for Text2Bow model.

Parameters
----------
prune_at : int, optional
Total number of unique words; the dictionary will keep no more than `prune_at` words.
tokenizer : callable (str -> list of str), optional
A callable to split a document into a list of terms.

"""
self.gensim_model = None
self.prune_at = prune_at
self.tokenizer = tokenizer

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
"""Fit the model according to the given training data.

Parameters
----------
X : iterable of str
A collection of documents used for training the model.

Returns
-------
Text2BowTransformer
The trained model.

"""
tokenized_docs = [list(self.tokenizer(x)) for x in X]
self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
return self

def transform(self, docs):
"""
Return the BOW format for the input documents.
"""Return the BOW format for the input documents.

Parameters
----------
docs : iterable of str
A collection of documents to be transformed.

Returns
-------
iterable of list of (int, int)
The BOW representation of each document.

"""
if self.gensim_model is None:
raise NotFittedError(
@@ -54,6 +79,23 @@ def transform(self, docs):
return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

def partial_fit(self, X):
"""Train model over a potentially incomplete set of documents.

This method can be used in two ways:
1. On an unfitted model in which case the dictionary is initialized and trained on `X`.
2. On an already fitted model in which case the dictionary is **expanded** by `X`.

Parameters
----------
X : iterable of str
A collection of documents used to train the model.

Returns
-------
Text2BowTransformer
The trained model.

"""
if self.gensim_model is None:
self.gensim_model = Dictionary(prune_at=self.prune_at)

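A minimal usage sketch for the Text2BowTransformer documented above (a sketch only: token IDs depend on dictionary construction order, so the printed pairs are illustrative):

from gensim.sklearn_api.text2bow import Text2BowTransformer

docs = [
    "the quick brown fox jumps",
    "the lazy dog sleeps",
    "a quick brown dog barks",
]

model = Text2BowTransformer()
model.fit(docs)
print(model.transform(["quick brown dog"]))
# e.g. [[(1, 1), (2, 1), (7, 1)]] -- a list of (token_id, count) pairs per document

# partial_fit expands the dictionary with previously unseen documents.
model.partial_fit(["a quick red fox"])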