Test and refactor WikiCorpus #1821

Merged (9 commits), Jan 11, 2018

14 changes: 6 additions & 8 deletions gensim/corpora/wikicorpus.py
@@ -3,6 +3,7 @@
#
# Copyright (C) 2010 Radim Rehurek <[email protected]>
# Copyright (C) 2012 Lars Buitinck <[email protected]>
# Copyright (C) 2018 Emmanouil Stergiadis <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


@@ -56,8 +57,8 @@
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template


# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
@@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
self.token_min_len = token_min_len
self.token_max_len = token_max_len
self.lower = lower

if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
self.dictionary = dictionary or Dictionary(self.get_texts())

def get_texts(self):
"""
Iterate over the dump, returning text version of each article as a list
of tokens.

Only articles of sufficient length are returned (short articles & redirects
etc are ignored). This is control by `article_min_tokens` on the class instance.
etc are ignored). This is controlled by `article_min_tokens` on the class instance.

Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
@@ -380,6 +377,7 @@ def get_texts(self):
yield (tokens, (pageid, title))
else:
yield tokens

except KeyboardInterrupt:
logger.warn(
"user terminated iteration over Wikipedia corpus after %i documents with %i positions "
167 changes: 166 additions & 1 deletion gensim/test/test_corpora.py
@@ -20,7 +20,7 @@
import numpy as np

from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
ucicorpus, malletcorpus, textcorpus, indexedcorpus)
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
from gensim.interfaces import TransformedCorpus
from gensim.utils import to_unicode
from gensim.test.utils import datapath, get_tmpfile
@@ -400,6 +400,171 @@ def test_indexing(self):
pass


# Needed for test_custom_tokenizer in the TestWikiCorpus class.
# Cannot be nested inside the test class because it must be serializable (picklable).
def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
return [
to_unicode(token.lower()) if lower else to_unicode(token) for token in content.split()
if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
]


class TestWikiCorpus(TestTextCorpus):
def setUp(self):
self.corpus_class = wikicorpus.WikiCorpus
self.file_extension = '.xml.bz2'
self.fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
self.enwiki = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')

def test_default_preprocessing(self):
expected = ['computer', 'human', 'interface']
corpus = self.corpus_class(self.fname, article_min_tokens=0)
first_text = next(corpus.get_texts())
self.assertEqual(expected, first_text)

def test_len(self):
# When there is no min_token limit all 9 articles must be registered.
corpus = self.corpus_class(self.fname, article_min_tokens=0)
all_articles = corpus.get_texts()
assert (len(list(all_articles)) == 9)

# With a huge min_token limit, all articles should be filtered out.
corpus = self.corpus_class(self.fname, article_min_tokens=100000)
all_articles = corpus.get_texts()
assert (len(list(all_articles)) == 0)

def test_load_with_metadata(self):
corpus = self.corpus_class(self.fname, article_min_tokens=0)
corpus.metadata = True
self.assertEqual(len(corpus), 9)

docs = list(corpus)
self.assertEqual(len(docs), 9)

for i, docmeta in enumerate(docs):
doc, metadata = docmeta
article_no = i + 1 # Counting IDs from 1
self.assertEqual(metadata[0], str(article_no))
self.assertEqual(metadata[1], 'Article%d' % article_no)

def test_load(self):
corpus = self.corpus_class(self.fname, article_min_tokens=0)

docs = list(corpus)
# the deerwester corpus always has nine documents
self.assertEqual(len(docs), 9)

def test_first_element(self):
"""
First two articles in this sample are
1) anarchism
2) autism
"""
corpus = self.corpus_class(self.enwiki, processes=1)

texts = corpus.get_texts()
self.assertTrue(u'anarchism' in next(texts))
self.assertTrue(u'autism' in next(texts))

def test_unicode_element(self):
"""
First unicode article in this sample is
1) папа
"""
bgwiki = datapath('bgwiki-latest-pages-articles-shortened.xml.bz2')
corpus = self.corpus_class(bgwiki)
texts = corpus.get_texts()
self.assertTrue(u'папа' in next(texts))

def test_custom_tokenizer(self):
"""
Define a custom tokenizer function and use it.
"""
wc = self.corpus_class(self.enwiki, processes=1, lemmatize=False, tokenizer_func=custom_tokenizer,
token_max_len=16, token_min_len=1, lower=False)
row = wc.get_texts()
list_tokens = next(row)
self.assertTrue(u'Anarchism' in list_tokens)
self.assertTrue(u'collectivization' in list_tokens)
self.assertTrue(u'a' in list_tokens)
self.assertTrue(u'i.e.' in list_tokens)

def test_lower_case_set_true(self):
"""
Set the parameter lower to True and check that the upper-case 'Anarchism' token doesn't exist
"""
corpus = self.corpus_class(self.enwiki, processes=1, lower=True, lemmatize=False)
row = corpus.get_texts()
list_tokens = next(row)
self.assertTrue(u'Anarchism' not in list_tokens)
self.assertTrue(u'anarchism' in list_tokens)

def test_lower_case_set_false(self):
"""
Set the parameter lower to False and check that the upper-case 'Anarchism' token exists
"""
corpus = self.corpus_class(self.enwiki, processes=1, lower=False, lemmatize=False)
row = corpus.get_texts()
list_tokens = next(row)
self.assertTrue(u'Anarchism' in list_tokens)
self.assertTrue(u'anarchism' in list_tokens)

def test_min_token_len_not_set(self):
"""
Don't set the parameter token_min_len and check that 'a' as a token doesn't exist
Default token_min_len=2
"""
corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
self.assertTrue(u'a' not in next(corpus.get_texts()))

def test_min_token_len_set(self):
"""
Set the parameter token_min_len to 1 and check that 'a' as a token exists
"""
corpus = self.corpus_class(self.enwiki, processes=1, token_min_len=1, lemmatize=False)
self.assertTrue(u'a' in next(corpus.get_texts()))

def test_max_token_len_not_set(self):
"""
Don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist
Default token_max_len=15
"""
corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
self.assertTrue(u'collectivization' not in next(corpus.get_texts()))

def test_max_token_len_set(self):
"""
Set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
"""
corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
self.assertTrue(u'collectivization' in next(corpus.get_texts()))

    # TODO: sporadic failure to be investigated
# def test_get_texts_returns_generator_of_lists(self):
# corpus = self.corpus_class(self.enwiki)
# l = corpus.get_texts()
# self.assertEqual(type(l), types.GeneratorType)
# first = next(l)
# self.assertEqual(type(first), list)
# self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))

def test_sample_text(self):
Review comment (Contributor):

Probably better to skip this test (not silently pass). What do you think @steremma?
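
For reference, the explicit skip suggested here would look roughly like this (a sketch only; the decorator placement and the reason string are illustrative, not part of this PR):

    import unittest

    class TestWikiCorpus(unittest.TestCase):
        @unittest.skip("WikiCorpus cannot be instantiated from plain-text lines")
        def test_sample_text(self):
            pass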

Reply from the author (@steremma, Contributor, Jan 11, 2018):

The problem is that this test overrides the one defined in TestTextCorpus. If we just skip it, the parent definition will be called and it will fail, because plain text is not legit XML (nor is it compressed). In that sense, passing it silently practically ignores it. The same idea is followed in the tests:

    def test_save(self):
        pass

    def test_serialize(self):
        pass

    def test_serialize_compressed(self):
        pass

    def test_indexing(self):
        pass

of TestTextCorpus for these 4 tests defined in the parent class CorpusTestCase. So I think it's better to keep it as it is.

Reply (Contributor):

The best variant would be to change the class hierarchy and interfaces: we shouldn't inherit "useless" methods from the parent class (change from TestWikiCorpus -> TestTextCorpus to TestTextCorpus -> BaseTestTextCorpus <- TestWikiCorpus). But right now this isn't really needed, so let's stay with the current variant using "pass".
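
A minimal sketch of that restructuring, assuming a hypothetical BaseTestTextCorpus (this class does not exist in the PR; the names and method bodies are illustrative only):

    import unittest

    class BaseTestTextCorpus(object):
        # Assertions that every text corpus should satisfy live here.
        def test_load(self):
            pass

    class TestTextCorpus(BaseTestTextCorpus, unittest.TestCase):
        # Line-based tests (sample_text, empty input, ...) stay only here,
        # so TestWikiCorpus never inherits them and needs no "pass" stubs.
        def test_sample_text(self):
            pass

    class TestWikiCorpus(BaseTestTextCorpus, unittest.TestCase):
        # XML-dump-specific tests go here.
        pass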

# Cannot instantiate WikiCorpus from lines
pass

def test_sample_text_length(self):
# Cannot instantiate WikiCorpus from lines
pass

def test_sample_text_seed(self):
# Cannot instantiate WikiCorpus from lines
pass

def test_empty_input(self):
# An empty file is not legit XML
pass


class TestTextDirectoryCorpus(unittest.TestCase):

def write_one_level(self, *args):
Binary file added gensim/test/test_data/testcorpus.xml.bz2
135 changes: 0 additions & 135 deletions gensim/test/test_wikicorpus.py

This file was deleted.