Skip to content

Commit

Permalink
pep8 formatting fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
piskvorky committed Nov 30, 2015
1 parent b8224fe commit 602c0c7
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions gensim/corpora/wikicorpus.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -180,9 +180,10 @@ def get_namespace(tag):
return namespace
_get_namespace = get_namespace


def extract_pages(f, filter_namespaces=False):
"""
Extract pages from MediaWiki database dump.
Extract pages from a MediaWiki database dump, given as an open file-like object `f`.
Return an iterable over (str, str, str) which generates (title, content, pageid) triplets.
Expand Down Expand Up @@ -224,6 +225,7 @@ def extract_pages(f, filter_namespaces=False):
elem.clear()
_extract_pages = extract_pages # for backward compatibility


def process_article(args):
"""
Parse a wikipedia article, returning its content as a list of tokens
Expand Down Expand Up @@ -293,7 +295,7 @@ def get_texts(self):
# is dumb and would load the entire input into RAM at once...
ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10):
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10):
articles_all += 1
positions_all += len(tokens)
# article redirects and short stubs are pruned here
Expand All @@ -307,8 +309,9 @@ def get_texts(self):
yield tokens
pool.terminate()

logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles, %i positions before pruning articles shorter than %i words)" %
(articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
self.length = articles # cache corpus length
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
self.length = articles # cache corpus length
# endclass WikiCorpus

0 comments on commit 602c0c7

Please sign in to comment.