diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index ec032067f1..ccf4345246 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -257,11 +257,15 @@ def init_to_ignore_interrupt(): class WikiCorpus(TextCorpus): """ - Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. + Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. + **Note:** "multistream" archives are *not* supported in Python 2 due to + `limitations in the core bz2 library + <https://bugs.python.org/issue1625>`_. + >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word