def doc2idx(self, document, unknown_word_index=-1):
    """Map every word of `document` to its `token_id`, keeping the original word order.

    The words are expected to be **tokenized and normalized** strings already (unicode or
    utf8-encoded). Nothing beyond decoding non-unicode words as utf-8 happens here, so run
    tokenization, stemming etc. before calling this method.

    A word that is not a key of `self.token2id` is represented by `unknown_word_index`.

    Notes
    -----
    The dictionary is only read by this method, never updated.

    Parameters
    ----------
    document : list of str
        Tokenized, normalized and preprocessed words.
    unknown_word_index : int, optional
        Value placed in the result for every word missing from the dictionary, -1 by default.

    Returns
    -------
    list of int
        One index per word of `document`, in the same order as the words.

    Raises
    ------
    TypeError
        If `document` is a single string rather than a collection of words.

    Examples
    --------
    >>> dictionary_obj = Dictionary()
    >>> dictionary_obj.token2id = {'computer': 0, 'human': 1, 'response': 2, 'survey': 3}
    >>> dictionary_obj.doc2idx(document=['human', 'computer', 'interface'], unknown_word_index=-1)
    [1, 0, -1]

    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(document, string_types):
        raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

    # Phase 1: bring every word to unicode so it can match the dictionary keys.
    tokens = []
    for token in document:
        if not isinstance(token, unicode):
            token = unicode(token, 'utf-8')
        tokens.append(token)

    # Phase 2: translate words to ids, falling back to the caller-supplied index.
    return [self.token2id.get(token, unknown_word_index) for token in tokens]