Add comment explaining lack of multistream support (#1515)

* Add comment explaining lack of multistream support. See #1496; it looks like this has confused some people. -POLM
* Add file patterns to documentation
piskvorky · Sep 18, 2017 · 6b8f1c0 · 6b8f1c0
1 parent e667069
commit 6b8f1c0
Showing 1 changed file with 5 additions and 1 deletion.
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
@@ -259,11 +259,15 @@ def init_to_ignore_interrupt():
 
 class WikiCorpus(TextCorpus):
     """
-    Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus.
+    Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.
 
     The documents are extracted on-the-fly, so that the whole (massive) dump
     can stay compressed on disk.
 
+    **Note:** "multistream" archives are *not* supported in Python 2 due to
+    `limitations in the core bz2 library
+    <https://docs.python.org/2/library/bz2.html#de-compression-of-files>`_.
+
     >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
     >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word