Fix segment-wiki script #1694
Changes from 5 commits
@@ -5,12 +5,37 @@
 # Copyright (C) 2016 RaRe Technologies

 """
-Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it
-and save to json-line format.
-
-If you have the `pattern` package installed, this module will use a fancy
-lemmatization to get a lemma of each token (instead of plain alphabetic
-tokenizer). The package is available at https://github.com/clips/pattern .
+Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump (typical filename
+is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2),
+extract titles, section names, section content and save to json-line format,
+that contains 3 fields ::
+
+    'title' (str) - title of article,
+    'section_titles' (list) - list of titles of sections,
+    'section_texts' (list) - list of content from sections.

[Review comment] I'd prefer to include a concrete hands-on example, something like this: process a raw
Wikipedia dump (XML.bz2 format, for example
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 for the English Wikipedia)
and extract all articles and their sections as plain text. The output format of the parsed plain text
Wikipedia is json-lines = one article per line, serialized into JSON. Here's an example of how to work
with it from Python.
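A minimal sketch of the kind of example the comment describes, assuming the
`gensim.scripts.segment_wiki` command line and the `title` / `section_titles` / `section_texts` field
names used elsewhere in this PR; the file names are placeholders::

    # Convert the raw dump to gzipped json-lines (one article per line):
    #
    #     python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz
    #
    # then iterate over the parsed articles from Python:

    import json

    from smart_open import smart_open

    for line in smart_open('enwiki-latest.json.gz'):
        article = json.loads(line)  # one JSON-serialized article per line
        print(article['title'])
        for section_title, section_text in zip(article['section_titles'], article['section_texts']):
            print(section_title)        # section heading
            print(section_text[:100])   # first 100 characters of the section body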

+
+English Wikipedia dump available
+`here <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2>`_. Approximate time
+for processing is 2.5 hours (i7-6700HQ, SSD).
+
+Examples
+--------
+
+Convert wiki to json-lines format:
+`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 | gzip > enwiki-latest.json.gz`
+
+Read json-lines dump
+
+>>> # iterate over the plain text file we just created
+>>> for line in smart_open('enwiki-latest.json.gz'):
+>>>     # decode JSON into a Python object
+>>>     article = json.loads(line)
+>>>
+>>>     # each article has a "title", "section_titles" and "section_texts" fields
+>>>     print("Article title: %s" % article['title'])
+>>>     for section_title, section_text in zip(article['section_titles'], article['section_texts']):
+>>>         print("Section title: %s" % section_title)
+>>>         print("Section text: %s" % section_text)
+
 """

@@ -22,8 +47,7 @@
 import sys
 from xml.etree import cElementTree

-from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \
-    filter_wiki, get_namespace, tokenize, utils
+from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, get_namespace, utils
 from smart_open import smart_open


@@ -46,20 +70,20 @@ def segment_all_articles(file_path):

     """
     with smart_open(file_path, 'rb') as xml_fileobj:
-        wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj)
+        wiki_sections_corpus = _WikiSectionsCorpus(xml_fileobj)
         wiki_sections_corpus.metadata = True
         wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
         for article_title, article_sections in wiki_sections_text:
             yield article_title, article_sections


-def segment_and_print_all_articles(file_path, output_file):
+def segment_and_write_all_articles(file_path, output_file):
     """Write article title and sections to output_file,
     output_file is json-line file with 3 fields::

-        'tl' - title of article,
-        'st' - list of titles of sections,
-        'sc' - list of content from sections.
+        'title' - title of article,
+        'section_titles' - list of titles of sections,
+        'section_texts' - list of content from sections.

     Parameters
     ----------
@@ -68,18 +92,25 @@ def segment_and_print_all_articles(file_path, output_file):
         or <LANG>wiki-latest-pages-articles.xml.bz2.

     output_file : str
-        Path to output file.
+        Path to output file in json-lines format.

     """
-    with smart_open(output_file, 'w') as outfile:
+    if output_file is None:
+        outfile = sys.stdout
+    else:
+        outfile = smart_open(output_file, 'wb')
+
+    try:
         for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)):
-            output_data = {"tl": article_title, "st": [], "sc": []}
+            output_data = {"title": article_title, "section_titles": [], "section_texts": []}
             for section_heading, section_content in article_sections:
-                output_data["st"].append(section_heading)
-                output_data["sc"].append(section_content)
+                output_data["section_titles"].append(section_heading)
+                output_data["section_texts"].append(section_content)
             if (idx + 1) % 100000 == 0:
                 logger.info("Processed #%d articles", idx + 1)
             outfile.write(json.dumps(output_data) + "\n")
+    finally:
+        outfile.close()


 def extract_page_xmls(f):
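To make the renamed schema concrete: each line that segment_and_write_all_articles emits is one
standalone JSON object. A hypothetical output line (article content invented for illustration; only the
field names come from this diff) looks like::

    {"title": "Example article",
     "section_titles": ["Introduction", "History"],
     "section_texts": ["Opening paragraph of the article ...", "Body of the History section ..."]}

It is wrapped here for readability; the script writes each object on a single line, so the file can be
streamed with a plain for loop as in the module docstring above.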
@@ -160,7 +191,7 @@ def segment(page_xml):
     return title, sections


-class WikiSectionsCorpus(WikiCorpus):
+class _WikiSectionsCorpus(WikiCorpus):
     """Treat a wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
     or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.

@@ -217,15 +248,10 @@ def get_texts_with_sections(self):
         # is dumb and would load the entire input into RAM at once...
         for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
             for article_title, sections in pool.imap(segment, group):  # chunksize=10):
-                # article redirects and short stubs are pruned here
-                num_total_tokens = 0
-                for section_title, section_content in sections:
-                    if self.lemmatize:
-                        num_total_tokens += len(utils.lemmatize(section_content))
-                    else:
-                        num_total_tokens += len(tokenize(section_content))
-                if num_total_tokens < ARTICLE_MIN_WORDS or \
-                        any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
+                # article redirects are pruned here
+                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES) \
+                        or len(sections) == 0 \
+                        or sections[0][1].lstrip().startswith("#REDIRECT"):
                     continue
                 articles += 1
                 yield (article_title, sections)

[Review comment] Including redirects and stubs is a bad idea. That's typically not (never?) what people
want out of Wikipedia dumps. We want to keep only meaningful articles, such as at least 500 plain text
characters (~1 paragraph) or something.

[Reply] I don't think so (about short articles), because we provide the parsed Wikipedia dump "as-is" and
short articles can be useful for users in special cases (and are easy to filter later if needed); for this
reason, I removed this part.

[Reply] But it's needed to filter trash (like the redirects), I'll add a fix for this.

[Reply] Sounds good 👍 Stubs are not really articles though; most of the text is something like "this
article is a stub, help Wikipedia by expanding it" or something. Not terribly useful, potentially messing
up corpus statistics for people who would be unaware of this.
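In the state shown here, short articles are kept and only namespace pages, empty articles and redirects
are pruned. For users who prefer the stricter behaviour discussed above (e.g. a minimum amount of plain
text per article), a sketch of a post-filter over the json-lines output; the threshold and file names are
illustrative, only the field names come from this PR::

    import json

    from smart_open import smart_open

    MIN_CHARS = 500  # roughly one paragraph of plain text, as suggested in the review

    with smart_open('enwiki-latest.json.gz', 'rb') as fin, smart_open('enwiki-filtered.json.gz', 'wb') as fout:
        for line in fin:
            article = json.loads(line)
            # total amount of plain text across all sections of this article
            if sum(len(text) for text in article['section_texts']) >= MIN_CHARS:
                fout.write((json.dumps(article) + '\n').encode('utf8'))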
@@ -239,8 +265,8 @@

     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__'])
     parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True)
-    parser.add_argument('-o', '--output', help='Path to output file', required=True)
+    parser.add_argument('-o', '--output', help='Path to output file (stdout if not specified)')
     args = parser.parse_args()
-    segment_and_print_all_articles(args.file, args.output)
+    segment_and_write_all_articles(args.file, args.output)

     logger.info("finished running %s", sys.argv[0])
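With `-o` now optional, the output can either go to a file, e.g.
`python -m gensim.scripts.segment_wiki -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json`,
or stream to stdout and be compressed on the fly as in the docstring example above
(`... | gzip > enwiki-latest.json.gz`). The file names here are illustrative.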
[Review comment] I'd add a link to the actual place to download those, because it's not obvious. For
example, the English Wiki dump is here:
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2