
Fix segment-wiki script #1694

Merged: 8 commits, Nov 6, 2017
gensim/scripts/segment_wiki.py (26 additions, 1 deletion)

@@ -14,6 +14,29 @@
'section_titles' (list) - list of titles of sections,
'section_texts' (list) - list of content from sections.
@piskvorky (Owner) commented on Nov 5, 2017:

I'd prefer to include a concrete hands-on example, something like this:


Process a raw Wikipedia dump (XML.bz2 format, for example https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 for the English Wikipedia) and extract all articles and their sections as plain text::

python -m gensim.scripts.segment_wiki -f enwiki-20171001-pages-articles.xml.bz2 -o enwiki-20171001-pages-articles.json.gz

The output format of the parsed plain text Wikipedia is json-lines: one article per line, serialized into JSON. Here's an example of how to work with it from Python::

from smart_open import smart_open
import json

# iterate over the plain text file we just created
for line in smart_open('enwiki-20171001-pages-articles.json.gz'):
    # decode JSON into a Python object
    article = json.loads(line)

    # each article has "title", "section_titles" and "section_texts" fields
    print("Article title: %s" % article['title'])
    for section_title, section_text in zip(article['section_titles'], article['section_texts']):
        print("Section title: %s" % section_title)
        print("Section text: %s" % section_text)


The English Wikipedia dump is available
`here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_.
Processing it takes approximately 2.5 hours (i7-6700HQ, SSD).

Examples
--------

Convert wiki to json-lines format:
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`

Read the json-lines dump:

>>> from smart_open import smart_open
>>> import json
>>>
>>> # iterate over the plain text file we just created
>>> for line in smart_open('enwiki-latest.json.gz'):
>>>     # decode JSON into a Python object
>>>     article = json.loads(line)
>>>
>>>     # each article has "title", "section_titles" and "section_texts" fields
>>>     print("Article title: %s" % article['title'])
>>>     for section_title, section_text in zip(article['section_titles'], article['section_texts']):
>>>         print("Section title: %s" % section_title)
>>>         print("Section text: %s" % section_text)

"""

import argparse
@@ -226,7 +249,9 @@ def get_texts_with_sections(self):
for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
for article_title, sections in pool.imap(segment, group): # chunksize=10):
# article redirects are pruned here
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES) \
or len(sections) == 0 \
@piskvorky (Owner) commented:

`not sections` is more Pythonic.

or sections[0][1].lstrip().startswith("#REDIRECT"):
continue
articles += 1
yield (article_title, sections)
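
For illustration, here is a minimal self-contained sketch of the pruning rule with the reviewer's `not sections` suggestion applied. The helper name `keep_article` and the namespace list are made up for this example; only the condition itself mirrors the diff above:

IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template']  # illustrative subset

def keep_article(article_title, sections):
    # sections is a list of (section_title, section_text) pairs, as yielded above
    if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES) \
            or not sections \
            or sections[0][1].lstrip().startswith("#REDIRECT"):
        return False
    return True

# quick sanity checks
assert not keep_article('Category:Physics', [('Intro', 'Some text.')])            # ignored namespace
assert not keep_article('Some article', [])                                       # no sections at all
assert not keep_article('Some article', [('Intro', '#REDIRECT [[Other page]]')])  # redirect stub
assert keep_article('Some article', [('Intro', 'Actual article text.')])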