Skip to content

Commit

Permalink
Change find_interlinks return type to list of tuples (#2636)
Browse files Browse the repository at this point in the history
* Change interlinks format to list of tuples. Fixes #2635

This commit fixes the issue in #2635

This commit changes the interlinks storage in the `segment_wiki` script from a dictionary to a list of tuples.

We can process the shortened Wikipedia dump used in gensim's test suite to inspect the new behavior.
```
python gensim/scripts/segment_wiki.py -i \
    -f ~/Downloads/enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2 \
    -o ~/Downloads/enwiki-latest.json.gz
```

We get the following output:

```
$ cat ~/Downloads/enwiki-latest.json.gz | zcat | head -1 | jq -r '.interlinks[] | [.[0], .[1]] | @TSV' | sort | head
-ism	-ism
1848 Revolution	1848 Revolution
1917 October Revolution	1917 October Revolution
6 February 1934 crisis	February 1934 riots
A. S. Neill	A. S. Neill
AK Press	AK Press
Abu Hanifa	Abu Hanifa
Adolf Brand	Adolf Brand
Adolf Brand	Adolf Brand
Adolf Hitler	Hitler
```

All tests pass for the related test file.

```
python -m unittest gensim.test.test_scripts
/Users/smishra/miniconda3/envs/TwitterNER/lib/python3.7/bz2.py:131: ResourceWarning: unclosed file <_io.BufferedReader name='/Users/smishra/workspace/codes/python/gensim/gensim/test/test_data/enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2'>
  self._buffer = None
ResourceWarning: Enable tracemalloc to get the object allocation traceback
.....
----------------------------------------------------------------------
Ran 5 tests in 6.298s

OK
```

* Updated docstrings

* Fixed flake8 issue of long line in docstring

* Fixed comments and replaced assertTrue with assertEqual

* Fixed unittest comment and checks for wikicorpus
  • Loading branch information
napsternxg authored and mpenkov committed Oct 19, 2019
1 parent 3e027c2 commit e102574
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 19 deletions.
13 changes: 7 additions & 6 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,23 +164,24 @@ def find_interlinks(raw):
Returns
-------
dict
Mapping from the linked article to the actual text found.
list
List of tuples in format [(linked article, the actual text found), ...].
"""
filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False)
interlinks_raw = re.findall(RE_P16, filtered)

interlinks = {}
interlinks = []
for parts in [i.split('|') for i in interlinks_raw]:
actual_title = parts[0]
try:
interlink_text = parts[1]
interlinks[actual_title] = interlink_text
except IndexError:
interlinks[actual_title] = actual_title
interlink_text = actual_title
interlink_tuple = (actual_title, interlink_text)
interlinks.append(interlink_tuple)

legit_interlinks = {i: j for i, j in interlinks.items() if '[' not in i and ']' not in i}
legit_interlinks = [(i, j) for i, j in interlinks if '[' not in i and ']' not in i]
return legit_interlinks


Expand Down
17 changes: 10 additions & 7 deletions gensim/scripts/segment_wiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ def segment_all_articles(file_path, min_article_character=200, workers=None, inc
Yields
------
(str, list of (str, str), (Optionally) dict of str: str)
Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).
(str, list of (str, str), (Optionally) list of (str, str))
Structure contains (title, [(section_heading, section_content), ...],
(Optionally) [(interlink_article, interlink_text), ...]).
"""
with gensim.utils.open(file_path, 'rb') as xml_fileobj:
Expand Down Expand Up @@ -215,8 +216,9 @@ def segment(page_xml, include_interlinks=False):
Returns
-------
(str, list of (str, str), (Optionally) dict of (str: str))
Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).
(str, list of (str, str), (Optionally) list of (str, str))
Structure contains (title, [(section_heading, section_content), ...],
(Optionally) [(interlink_article, interlink_text), ...]).
"""
elem = cElementTree.fromstring(page_xml)
Expand Down Expand Up @@ -313,8 +315,9 @@ def get_texts_with_sections(self):
Yields
------
(str, list of (str, str), dict of (str: str))
Structure contains (title, [(section_heading, section_content), ...], (Optionally){interlinks}).
(str, list of (str, str), list of (str, str))
Structure contains (title, [(section_heading, section_content), ...],
(Optionally)[(interlink_article, interlink_text), ...]).
"""
skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
Expand Down Expand Up @@ -378,7 +381,7 @@ def get_texts_with_sections(self):
parser.add_argument(
'-i', '--include-interlinks',
help='Include a mapping for interlinks to other articles in the dump. The mappings format is: '
'"interlinks": {"article_title_1": "interlink_text_1", "article_title_2": "interlink_text_2", ...}',
'"interlinks": [("article_title_1", "interlink_text_1"), ("article_title_2", "interlink_text_2"), ...]',
action='store_true'
)
args = parser.parse_args()
Expand Down
15 changes: 9 additions & 6 deletions gensim/test/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ def test_segment_all_articles(self):
self.assertTrue(first_sentence in first_section_text)

# Check interlinks
self.assertTrue(interlinks['self-governance'] == 'self-governed')
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical')
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon')
self.assertEqual(len(interlinks), 685)
self.assertTrue(interlinks[0] == ("political philosophy", "political philosophy"))
self.assertTrue(interlinks[1] == ("self-governance", "self-governed"))
self.assertTrue(interlinks[2] == ("stateless society", "stateless societies"))

def test_generator_len(self):
expected_num_articles = 106
Expand Down Expand Up @@ -105,9 +106,11 @@ def test_segment_and_write_all_articles(self):
self.assertEqual(section_titles, self.expected_section_titles)

# Check interlinks
self.assertTrue(interlinks['self-governance'] == 'self-governed')
self.assertTrue(interlinks['Hierarchy'] == 'hierarchical')
self.assertTrue(interlinks['Pierre-Joseph Proudhon'] == 'Proudhon')
# JSON has no tuples, only lists. So, we convert lists to tuples explicitly before comparison.
self.assertEqual(len(interlinks), 685)
self.assertEqual(tuple(interlinks[0]), ("political philosophy", "political philosophy"))
self.assertEqual(tuple(interlinks[1]), ("self-governance", "self-governed"))
self.assertEqual(tuple(interlinks[2]), ("stateless society", "stateless societies"))


class TestWord2Vec2Tensor(unittest.TestCase):
Expand Down

0 comments on commit e102574

Please sign in to comment.