removed pattern dependency when it is not needed in order to fix #461
ziky90 committed Nov 13, 2015
1 parent d2befe9 commit e453fb0
1 changed file: gensim/utils.py (39 additions, 44 deletions)
@@ -76,14 +76,6 @@ def smart_open(fname, mode='rb'):
     return open(fname, mode)
 
 
-try:
-    from pattern.en import parse
-    logger.info("'pattern' package found; utils.lemmatize() is available for English")
-    HAS_PATTERN = True
-except ImportError:
-    HAS_PATTERN = False
-
-
 PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
 RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
 
@@ -1022,48 +1014,51 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None):
     daemon.requestLoop()
 
 
-if HAS_PATTERN:
-    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
-                  stopwords=frozenset(), min_length=2, max_length=15):
-        """
-        This function is only available when the optional 'pattern' package is installed.
-
-        Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
-        their base form=lemma, e.g. "are, is, being" -> "be" etc.
-        This is a smarter version of stemming, taking word context into account.
-
-        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
-
-        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
-        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
-        >>> lemmatize('The study ranks high.')
-        ['study/NN', 'rank/VB', 'high/JJ']
-
-        >>> lemmatize('The ranks study hard.')
-        ['rank/NN', 'study/VB', 'hard/RB']
-
-        """
-        if light:
-            import warnings
-            warnings.warn("The light flag is no longer supported by pattern.")
-
-        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
-        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
-        # FIXME this throws away all fancy parsing cues, including sentence structure,
-        # abbreviations etc.
-        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
-
-        parsed = parse(content, lemmata=True, collapse=False)
-        result = []
-        for sentence in parsed:
-            for token, tag, _, _, lemma in sentence:
-                if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
-                    if allowed_tags.match(tag):
-                        lemma += "/" + tag[:2]
-                        result.append(lemma.encode('utf8'))
-        return result
-#endif HAS_PATTERN
+def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+              stopwords=frozenset(), min_length=2, max_length=15):
+    """
+    This function is only available when the optional 'pattern' package is installed.
+
+    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
+    their base form=lemma, e.g. "are, is, being" -> "be" etc.
+    This is a smarter version of stemming, taking word context into account.
+
+    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+
+    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
+    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
+
+    >>> lemmatize('The study ranks high.')
+    ['study/NN', 'rank/VB', 'high/JJ']
+
+    >>> lemmatize('The ranks study hard.')
+    ['rank/NN', 'study/VB', 'hard/RB']
+
+    """
+    try:
+        from pattern.en import parse
+    except ImportError:
+        logger.error("Pattern library is not installed. It needs to be installed for calling lemmatize()")
+
+    if light:
+        import warnings
+        warnings.warn("The light flag is no longer supported by pattern.")
+
+    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
+    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
+    # FIXME this throws away all fancy parsing cues, including sentence structure,
+    # abbreviations etc.
+    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
+
+    parsed = parse(content, lemmata=True, collapse=False)
+    result = []
+    for sentence in parsed:
+        for token, tag, _, _, lemma in sentence:
+            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
+                if allowed_tags.match(tag):
+                    lemma += "/" + tag[:2]
+                    result.append(lemma.encode('utf8'))
+    return result
 
 
 def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
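For context, a minimal usage sketch of the behaviour this commit aims at: importing gensim.utils no longer requires the optional `pattern` package, because the import is only attempted inside lemmatize() at call time. Names are taken from the diff above; the expected output comes from the docstring doctests, and the call assumes `pattern` is actually installed.

    # Minimal sketch: importing utils works even without `pattern` installed.
    from gensim import utils

    # `pattern.en` is only imported when lemmatize() itself is called.
    print(utils.lemmatize('The study ranks high.'))
    # expected (per the doctests): ['study/NN', 'rank/VB', 'high/JJ']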
