piskvorky · menshikh-iv · Nov 20, 2017 · Nov 16, 2017 · Nov 16, 2017 · Nov 16, 2017
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -173,6 +173,40 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
         else:
             return result
 
+    def doc2idx(self, document, unknown_word_index=-1):
+        """Convert `document` (a list of words) into a list of indexes = list of `token_id`.
+
+        Each word is assumed to be a **tokenized and normalized** string
+        (either unicode or utf8-encoded).
+        No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before calling
+        this method.
+
+        Replace all unknown words, i.e. words not in the dictionary, with the index set via `unknown_word_index`
+        (defaults to -1).
+
+        Notes
+        -----
+        This method is read-only: it looks words up in `token2id` and never modifies the dictionary.
+
+        Parameters
+        ----------
+        document : list
+            List of tokenized, normalized and preprocessed words.
+        unknown_word_index : int, optional
+            Index to use for words not in the dictionary.
+
+        Returns
+        -------
+        list
+            List of dictionary indexes for the words in `document`, in the same order.
+
+        """
+        if isinstance(document, string_types):
+            raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")
+
+        document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
+        return [self.token2id.get(word, unknown_word_index) for word in document]
+
     def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
         """
         Filter out tokens that appear in

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
@@ -65,7 +65,7 @@ def strip_multiple_whitespaces(s):
 
 class TextCorpus(interfaces.CorpusABC):
     """Helper class to simplify the pipeline of getting bag-of-words vectors (= a
-    gensim corpus) from plain text.
+    gensim corpus) or an index vector from plain text.
 
     This is an abstract base class: override the `get_texts()` and `__len__()`
     methods to match your particular input.
@@ -76,7 +76,9 @@ class TextCorpus(interfaces.CorpusABC):
     this class via subclassing or by construction with different preprocessing arguments.
 
     The `iter` method converts the lists of tokens produced by `get_texts` to BoW format
-    using `Dictionary.doc2bow`. `get_texts` does the following:
+    using `Dictionary.doc2bow` if `bow_format` is True (the default), or to an index vector
+    using `Dictionary.doc2idx` if `bow_format` is False.
+    `get_texts` does the following:
 
     1.  Calls `getstream` to get a generator over the texts. It yields each document in
         turn from the underlying text file or files.
@@ -112,7 +114,8 @@ class TextCorpus(interfaces.CorpusABC):
     6.  remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords
 
     """
-    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None):
+    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None,
+                 token_filters=None, bow_format=True, unknown_word_index=-1):
         """
         Args:
             input (str): path to top-level directory to traverse for corpus documents.
@@ -134,6 +137,10 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
                 remove, or replace tokens, or do nothing at all. The default token filters
                 remove tokens less than 3 characters long and remove stopwords using the list
                 in `gensim.parsing.preprocessing.STOPWORDS`.
+            bow_format (bool): if True (default), yield documents in BoW format; if False, yield
+                index vectors built from the `dictionary`.
+            unknown_word_index (int): index used for unknown words, i.e. words not in the
+                `dictionary` (default -1); only used when `bow_format` is False.
         """
         self.input = input
         self.metadata = metadata
@@ -154,6 +161,9 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
         self.dictionary = None
         self.init_dictionary(dictionary)
 
+        self.bow_format = bow_format
+        self.unknown_word_index = unknown_word_index
+
     def init_dictionary(self, dictionary):
         """If `dictionary` is None, initialize to an empty Dictionary, and then if there
         is an `input` for the corpus, add all documents from that `input`. If the
@@ -179,10 +189,16 @@ def __iter__(self):
         """
         if self.metadata:
             for text, metadata in self.get_texts():
-                yield self.dictionary.doc2bow(text, allow_update=False), metadata
+                if self.bow_format:
+                    yield self.dictionary.doc2bow(text, allow_update=False), metadata
+                else:
+                    yield self.dictionary.doc2idx(text, unknown_word_index=self.unknown_word_index), metadata
         else:
             for text in self.get_texts():
-                yield self.dictionary.doc2bow(text, allow_update=False)
+                if self.bow_format:
+                    yield self.dictionary.doc2bow(text, allow_update=False)
+                else:
+                    yield self.dictionary.doc2idx(text, unknown_word_index=self.unknown_word_index)
 
     def getstream(self):
         """Yield documents from the underlying plain text collection (of one or more files).

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -20,7 +20,7 @@
 import numpy as np
 
 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
-                            ucicorpus, malletcorpus, textcorpus, indexedcorpus)
+                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, dictionary)
 from gensim.interfaces import TransformedCorpus
 from gensim.utils import to_unicode
 from gensim.test.utils import datapath, get_tmpfile
@@ -323,6 +323,23 @@ def test_load_with_metadata(self):
             doc, metadata = docmeta
             self.assertEqual(metadata[0], i)
 
+    def test_load_with_index_vector_mode(self):
+        dictionary_obj = dictionary.Dictionary()
+        dictionary_obj.token2id = {
+            'word': 0, 'computer': 1, 'human': 2, 'response': 3, 'survey': 4,
+            'system': 5, 'time': 6, 'user': 7, 'graph': 8, 'eps': 9, 'trees': 10,
+            'minors': 11
+        }
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        corpus = self.corpus_class(
+            fname, dictionary=dictionary_obj, token_filters=[], bow_format=False, unknown_word_index=-1
+        )
+
+        docs = list(corpus)
+        first_doc, last_doc = docs[0], docs[-1]
+        self.assertEqual(first_doc, [1, 2, -1])
+        self.assertEqual(last_doc, [4, 8, 11])
+
     def test_default_preprocessing(self):
         lines = [
             "Šéf chomutovských komunistů dostal poštou bílý prášek",