removed pattern dependency when it is not needed in order to fix #461
ziky90 committed Nov 13, 2015
1 parent d2befe9 commit e453fb0
1 changed file: gensim/utils.py (39 additions, 44 deletions)
@@ -76,14 +76,6 @@ def smart_open(fname, mode='rb'):
     return open(fname, mode)
 
 
-try:
-    from pattern.en import parse
-    logger.info("'pattern' package found; utils.lemmatize() is available for English")
-    HAS_PATTERN = True
-except ImportError:
-    HAS_PATTERN = False
-
-
 PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
 RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
 
@@ -1022,48 +1014,51 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None):
     daemon.requestLoop()
 
 
-if HAS_PATTERN:
-    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
-                  stopwords=frozenset(), min_length=2, max_length=15):
-        """
-        This function is only available when the optional 'pattern' package is installed.
-
-        Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
-        their base form=lemma, e.g. "are, is, being" -> "be" etc.
-        This is a smarter version of stemming, taking word context into account.
-
-        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
-
-        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
-        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
-        >>> lemmatize('The study ranks high.')
-        ['study/NN', 'rank/VB', 'high/JJ']
-
-        >>> lemmatize('The ranks study hard.')
-        ['rank/NN', 'study/VB', 'hard/RB']
-
-        """
-        if light:
-            import warnings
-            warnings.warn("The light flag is no longer supported by pattern.")
-
-        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
-        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
-        # FIXME this throws away all fancy parsing cues, including sentence structure,
-        # abbreviations etc.
-        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
-
-        parsed = parse(content, lemmata=True, collapse=False)
-        result = []
-        for sentence in parsed:
-            for token, tag, _, _, lemma in sentence:
-                if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
-                    if allowed_tags.match(tag):
-                        lemma += "/" + tag[:2]
-                        result.append(lemma.encode('utf8'))
-        return result
-#endif HAS_PATTERN
+def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+              stopwords=frozenset(), min_length=2, max_length=15):
+    """
+    This function is only available when the optional 'pattern' package is installed.
+
+    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
+    their base form=lemma, e.g. "are, is, being" -> "be" etc.
+    This is a smarter version of stemming, taking word context into account.
+
+    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+
+    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
+    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
+
+    >>> lemmatize('The study ranks high.')
+    ['study/NN', 'rank/VB', 'high/JJ']
+
+    >>> lemmatize('The ranks study hard.')
+    ['rank/NN', 'study/VB', 'hard/RB']
+
+    """
+    try:
+        from pattern.en import parse
+    except ImportError:
+        logger.error("Pattern library is not installed. It needs to be installed for calling lemmatize()")
+
+    if light:
+        import warnings
+        warnings.warn("The light flag is no longer supported by pattern.")
+
+    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
+    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
+    # FIXME this throws away all fancy parsing cues, including sentence structure,
+    # abbreviations etc.
+    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
+
+    parsed = parse(content, lemmata=True, collapse=False)
+    result = []
+    for sentence in parsed:
+        for token, tag, _, _, lemma in sentence:
+            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
+                if allowed_tags.match(tag):
+                    lemma += "/" + tag[:2]
+                    result.append(lemma.encode('utf8'))
+    return result
 
 
 def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
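For context, a minimal usage sketch of the behaviour this commit aims at: importing gensim.utils no longer requires the optional `pattern` package, because the import is only attempted inside lemmatize() at call time. Names are taken from the diff above; the expected output comes from the docstring doctests, and the call assumes `pattern` is actually installed.

    # Minimal sketch: importing utils works even without `pattern` installed.
    from gensim import utils

    # `pattern.en` is only imported when lemmatize() itself is called.
    print(utils.lemmatize('The study ranks high.'))
    # expected (per the doctests): ['study/NN', 'rank/VB', 'high/JJ']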
