Merge pull request #528 from ziky90/fix_461
Removed the pattern dependency when it is not needed, in order to fix #461
piskvorky committed Nov 15, 2015
2 parents d2befe9 + 95914f4 commit 463b94d
Showing 3 changed files with 56 additions and 45 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.txt
@@ -1,6 +1,9 @@
Changes
=======

+* Loading of the pattern library in utils.py now happens only in the lemmatize function (Jan Zikes, #461)
+  - utils.HAS_PATTERN has also changed to utils.has_pattern()
+
0.12.3, 05/11/2015

* Make show_topics return value consistent across models (Christopher Corley, #448)
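For downstream code, this entry amounts to a small API migration: the module-level utils.HAS_PATTERN flag is gone, and callers probe for the optional dependency via utils.has_pattern() instead. A minimal before/after sketch (the surrounding script is illustrative, not part of this commit):

    from gensim import utils

    text = 'The study ranks high.'

    # Before this change: a module-level flag, set once when gensim.utils was imported.
    #     if utils.HAS_PATTERN:
    #         tokens = utils.lemmatize(text)

    # After this change: check for the optional 'pattern' dependency at call time.
    if utils.has_pattern():
        tokens = utils.lemmatize(text)  # e.g. ['study/NN', 'rank/VB', 'high/JJ']
    else:
        tokens = list(utils.tokenize(text, lower=True))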
2 changes: 1 addition & 1 deletion gensim/corpora/wikicorpus.py
@@ -249,7 +249,7 @@ class WikiCorpus(TextCorpus):
    >>> MmCorpus.serialize('wiki_en_vocab200k', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
-    def __init__(self, fname, processes=None, lemmatize=utils.HAS_PATTERN, dictionary=None, filter_namespaces=('0',)):
+    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
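One subtlety in this change: the default value utils.has_pattern() is evaluated once, when gensim.corpora.wikicorpus is imported, so lemmatization is enabled by default exactly when the optional pattern package was importable at that moment. A hedged usage sketch (the dump filename is a placeholder):

    from gensim.corpora import WikiCorpus

    # Default: lemmatize follows utils.has_pattern(), evaluated at module import time.
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')

    # Pass lemmatize explicitly to pin the behaviour regardless of the environment:
    wiki_plain = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', lemmatize=False)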
96 changes: 52 additions & 44 deletions gensim/utils.py
@@ -76,14 +76,6 @@ def smart_open(fname, mode='rb'):
    return open(fname, mode)


-try:
-    from pattern.en import parse
-    logger.info("'pattern' package found; utils.lemmatize() is available for English")
-    HAS_PATTERN = True
-except ImportError:
-    HAS_PATTERN = False
-
-
PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)

@@ -1022,48 +1014,64 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None):
    daemon.requestLoop()


-if HAS_PATTERN:
-    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
-                  stopwords=frozenset(), min_length=2, max_length=15):
-        """
-        This function is only available when the optional 'pattern' package is installed.
-
-        Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
-        their base form=lemma, e.g. "are, is, being" -> "be" etc.
-        This is a smarter version of stemming, taking word context into account.
-
-        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
-
-        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
-        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
-        >>> lemmatize('The study ranks high.')
-        ['study/NN', 'rank/VB', 'high/JJ']
-
-        >>> lemmatize('The ranks study hard.')
-        ['rank/NN', 'study/VB', 'hard/RB']
-
-        """
-        if light:
-            import warnings
-            warnings.warn("The light flag is no longer supported by pattern.")
-
-        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
-        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
-        # FIXME this throws away all fancy parsing cues, including sentence structure,
-        # abbreviations etc.
-        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
-
-        parsed = parse(content, lemmata=True, collapse=False)
-        result = []
-        for sentence in parsed:
-            for token, tag, _, _, lemma in sentence:
-                if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
-                    if allowed_tags.match(tag):
-                        lemma += "/" + tag[:2]
-                        result.append(lemma.encode('utf8'))
-        return result
-#endif HAS_PATTERN
+def has_pattern():
+    """
+    Check whether the optional `pattern` package is installed.
+    """
+    pattern = False
+    try:
+        from pattern.en import parse
+        pattern = True
+    except ImportError:
+        logger.warning("Pattern library is not installed.")
+    return pattern
+
+
+def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+              stopwords=frozenset(), min_length=2, max_length=15):
+    """
+    This function is only available when the optional 'pattern' package is installed.
+
+    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
+    their base form=lemma, e.g. "are, is, being" -> "be" etc.
+    This is a smarter version of stemming, taking word context into account.
+
+    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+
+    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
+    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
+
+    >>> lemmatize('The study ranks high.')
+    ['study/NN', 'rank/VB', 'high/JJ']
+
+    >>> lemmatize('The ranks study hard.')
+    ['rank/NN', 'study/VB', 'hard/RB']
+
+    """
+    if not has_pattern():
+        raise ImportError("Pattern library is not installed; it is needed "
+                          "in order to use the lemmatize function")
+    from pattern.en import parse
+
+    if light:
+        import warnings
+        warnings.warn("The light flag is no longer supported by pattern.")
+
+    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
+    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
+    # FIXME this throws away all fancy parsing cues, including sentence structure,
+    # abbreviations etc.
+    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
+
+    parsed = parse(content, lemmata=True, collapse=False)
+    result = []
+    for sentence in parsed:
+        for token, tag, _, _, lemma in sentence:
+            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
+                if allowed_tags.match(tag):
+                    lemma += "/" + tag[:2]
+                    result.append(lemma.encode('utf8'))
+    return result


def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
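With the lazy import above, a missing pattern package now surfaces as an ImportError raised inside lemmatize(), instead of the function simply not being defined on the module. A small illustrative sketch of handling that failure mode (the tokenize fallback is an assumption, not part of this commit):

    from gensim import utils

    text = 'Hello World! How is it going?!'
    try:
        print(utils.lemmatize(text))
    except ImportError:
        # lemmatize() now raises ImportError when 'pattern' is missing; before this
        # commit the name was only defined if the import succeeded at module load.
        print(list(utils.tokenize(text, lower=True)))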
