
Add article interlinks to the output of gensim.scripts.segment_wiki.
Fix piskvorky#1712 (piskvorky#1839)

* promoting the markup gives up information needed to find the interlinks

* Add interlinks to the output of `segment_wiki`

* New output format is (str, list of (str, str), list of str), reflecting the
structure (title, [(section_heading, section_content), ...], [interlink, ...]); see the sketch after this list

* `filter_wiki` in WikiCorpus will not promote uncaught markup to plain text,
as this would discard information valuable for interlink discovery

* Fixed PEP 8

* Refactored indentation and variable names

* Removed debugging code from script

* Fixed a bug where interlinks with a description or multiple names were disregarded

* Due to preprocessing in `filter_wiki`, interlinks containing alternative names had
one of their two `[` and `]` characters removed. The regex now takes that into account.

* Now stripping whitespace off section titles

* Unit test `gensim.scripts.segment_wiki`

* Initiate unit testing for all scripts.

* Check for the expected length given article filtering (namespace, size in characters and redirects).

* Check the yielded title, section headings and texts, as well as the interlinks yielded from the generator.

* Check that the same is correctly persisted in JSON.

* Fix PEP 8

* Fix Python 3.5 compatibility

* Section text is now completely clean of wiki markup

* Refactored filtering functions in `wikicorpus.py` so that
uncaught markup can optionally be promoted to plain text

* Interlink extraction logic moved to `wikicorpus.py`

* Unit tests modified accordingly

* Added extra logging info to troubleshoot weird Travis behavior

* Fix PEP 8

* Pin workers for `segment_and_write_all_articles`

* Get rid of debugging stuff

* Get rid of global logger

* Interlinks are now mapping from the linked article's title to the actual interlink text

* Used a boolean argument with a default value in `filter_wiki`. The default value keeps the old functionality
so that existing code does not break

* Overriding the default argument causes interlinks to not be simplified and lets `find_interlinks` create the mappings

* Moved regex outside function

* Interlink extraction is now optional and controlled with the `-i` command line argument

* PEP 8 long lines

* Made script tests aware of the optional interlinks argument

* Updated script help output for interlinks
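
A minimal sketch of the new usage, assuming the flag names described above and in the script help (`-i` enables interlink extraction; `-f`/`-o` name the input dump and output file; file names are illustrative):

python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz

Each line of the gzipped output is one article serialized as JSON, reflecting the (title, sections, interlinks) structure above. A sketch of reading it back, with field names assumed from that structure:

import json
from smart_open import smart_open

with smart_open('enwiki-latest.json.gz') as f:
    for line in f:
        article = json.loads(line)  # one article per line
        print(article['title'], article['interlinks'])
        break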
steremma authored and sj29-innovate committed Feb 21, 2018
1 parent aac7a75 commit 7b056ec
Showing 4 changed files with 536 additions and 335 deletions.
62 changes: 51 additions & 11 deletions gensim/corpora/wikicorpus.py
@@ -22,7 +22,6 @@
"""


import bz2
import logging
import multiprocessing
@@ -45,7 +44,6 @@
TOKEN_MIN_LEN = 2
TOKEN_MAX_LEN = 15


RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE)
"""Comments."""
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE)
@@ -78,6 +76,8 @@
"""Categories."""
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
"""Remove File and Image templates."""
+RE_P16 = re.compile(r'\[{2}(.*?)\]{2}', re.UNICODE)
+"""Capture interlinks text and article linked"""

IGNORED_NAMESPACES = [
    'Wikipedia', 'Category', 'File', 'Portal', 'Template',
@@ -93,34 +93,70 @@
"""


-def filter_wiki(raw):
+def find_interlinks(raw):
+    """Find all interlinks to other articles in the dump.
+
+    Parameters
+    ----------
+    raw : str
+        Unicode or utf-8 encoded string.
+
+    Returns
+    -------
+    dict
+        Mapping from the linked article to the actual text found.
+    """
+    filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False)
+    interlinks_raw = re.findall(RE_P16, filtered)
+
+    interlinks = {}
+    for parts in [i.split('|') for i in interlinks_raw]:
+        actual_title = parts[0]
+        try:
+            interlink_text = parts[1]
+            interlinks[actual_title] = interlink_text
+        except IndexError:
+            interlinks[actual_title] = actual_title
+
+    legit_interlinks = {i: j for i, j in interlinks.items() if '[' not in i and ']' not in i}
+    return legit_interlinks


+def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
+    promote_remaining : bool
+        Whether uncaught markup should be promoted to plain text.
+    simplify_links : bool
+        Whether links should be simplified keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
-    return remove_markup(text)
+    return remove_markup(text, promote_remaining, simplify_links)
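
A short sketch of the two flags (sample markup illustrative): the defaults reproduce the old behaviour, while overriding them preserves the link markup that `find_interlinks` needs.

from gensim.corpora.wikicorpus import filter_wiki

raw = "See [[Python (programming language)|Python]] for details."
print(filter_wiki(raw))
# 'See Python for details.'
print(filter_wiki(raw, promote_remaining=False, simplify_links=False))
# 'See [[Python (programming language)|Python]] for details.'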


-def remove_markup(text):
+def remove_markup(text, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `text`, leaving only text.

    Parameters
    ----------
    text : str
        String containing markup.
+    promote_remaining : bool
+        Whether uncaught markup should be promoted to plain text.
+    simplify_links : bool
+        Whether links should be simplified keeping only their description text.

    Returns
    -------
@@ -145,8 +181,11 @@ def remove_markup(text):
        text = re.sub(RE_P11, '', text)  # remove all remaining tags
        text = re.sub(RE_P14, '', text)  # remove categories
        text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
-        text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only
+
+        if simplify_links:
+            text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only
+
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = re.sub(RE_P12, '\n', text)  # remove formatting lines
        text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
@@ -156,9 +195,9 @@
        if old == text or iters > 2:
            break

-    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
-    # TODO is this really desirable?
-    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
+    if promote_remaining:
+        text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text

    return text
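
A sketch using the sample from the removed inline comment: with promotion the tokenizer sees '[[socialist]]s' as the single word 'socialists'; without it the markup survives for interlink discovery.

from gensim.corpora.wikicorpus import remove_markup

print(remove_markup("[[socialist]]s"))
# 'socialists'
print(remove_markup("[[socialist]]s", promote_remaining=False, simplify_links=False))
# '[[socialist]]s'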


@@ -333,7 +372,7 @@ def extract_pages(f, filter_namespaces=False):
                    text = None

            pageid = elem.find(pageid_path).text
-            yield title, text or "", pageid # empty page will yield None
+            yield title, text or "", pageid  # empty page will yield None

            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
@@ -461,6 +500,7 @@ class WikiCorpus(TextCorpus):
    >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping
    """
+
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
