Merge pull request #528 from ziky90/fix_461
Removed the pattern dependency when it is not needed, in order to fix #461
piskvorky committed Nov 15, 2015
2 parents d2befe9 + 95914f4 commit 463b94d
Showing 3 changed files with 56 additions and 45 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.txt
@@ -1,6 +1,9 @@
Changes
=======

+* Loading of the pattern library in utils.py now happens only in the lemmatize function (Jan Zikes, #461)
+  - utils.HAS_PATTERN has also changed to utils.has_pattern()
+
0.12.3, 05/11/2015

* Make show_topics return value consistent across models (Christopher Corley, #448)
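For downstream code, this entry amounts to a small API migration: the module-level utils.HAS_PATTERN flag is gone, and callers probe for the optional dependency via utils.has_pattern() instead. A minimal before/after sketch (the surrounding script is illustrative, not part of this commit):

    from gensim import utils

    text = 'The study ranks high.'

    # Before this change: a module-level flag, set once when gensim.utils was imported.
    #     if utils.HAS_PATTERN:
    #         tokens = utils.lemmatize(text)

    # After this change: check for the optional 'pattern' dependency at call time.
    if utils.has_pattern():
        tokens = utils.lemmatize(text)  # e.g. ['study/NN', 'rank/VB', 'high/JJ']
    else:
        tokens = list(utils.tokenize(text, lower=True))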
2 changes: 1 addition & 1 deletion gensim/corpora/wikicorpus.py
@@ -249,7 +249,7 @@ class WikiCorpus(TextCorpus):
    >>> MmCorpus.serialize('wiki_en_vocab200k', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word

    """
-    def __init__(self, fname, processes=None, lemmatize=utils.HAS_PATTERN, dictionary=None, filter_namespaces=('0',)):
+    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.
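One subtlety in this change: the default value utils.has_pattern() is evaluated once, when gensim.corpora.wikicorpus is imported, so lemmatization is enabled by default exactly when the optional pattern package was importable at that moment. A hedged usage sketch (the dump filename is a placeholder):

    from gensim.corpora import WikiCorpus

    # Default: lemmatize follows utils.has_pattern(), evaluated at module import time.
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')

    # Pass lemmatize explicitly to pin the behaviour regardless of the environment:
    wiki_plain = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', lemmatize=False)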
96 changes: 52 additions & 44 deletions gensim/utils.py
@@ -76,14 +76,6 @@ def smart_open(fname, mode='rb'):
    return open(fname, mode)


-try:
-    from pattern.en import parse
-    logger.info("'pattern' package found; utils.lemmatize() is available for English")
-    HAS_PATTERN = True
-except ImportError:
-    HAS_PATTERN = False
-
-
PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)

@@ -1022,48 +1014,64 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None):
    daemon.requestLoop()


-if HAS_PATTERN:
-    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
-                  stopwords=frozenset(), min_length=2, max_length=15):
-        """
-        This function is only available when the optional 'pattern' package is installed.
-
-        Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
-        their base form=lemma, e.g. "are, is, being" -> "be" etc.
-        This is a smarter version of stemming, taking word context into account.
-
-        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
-
-        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
-        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
-        >>> lemmatize('The study ranks high.')
-        ['study/NN', 'rank/VB', 'high/JJ']
-
-        >>> lemmatize('The ranks study hard.')
-        ['rank/NN', 'study/VB', 'hard/RB']
-
-        """
-        if light:
-            import warnings
-            warnings.warn("The light flag is no longer supported by pattern.")
-
-        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
-        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
-        # FIXME this throws away all fancy parsing cues, including sentence structure,
-        # abbreviations etc.
-        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
-
-        parsed = parse(content, lemmata=True, collapse=False)
-        result = []
-        for sentence in parsed:
-            for token, tag, _, _, lemma in sentence:
-                if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
-                    if allowed_tags.match(tag):
-                        lemma += "/" + tag[:2]
-                        result.append(lemma.encode('utf8'))
-        return result
-#endif HAS_PATTERN
+def has_pattern():
+    """
+    Check whether the optional `pattern` package is installed.
+    """
+    pattern = False
+    try:
+        from pattern.en import parse
+        pattern = True
+    except ImportError:
+        logger.warning("Pattern library is not installed.")
+    return pattern
+
+
+def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False,
+              stopwords=frozenset(), min_length=2, max_length=15):
+    """
+    This function is only available when the optional 'pattern' package is installed.
+
+    Use the English lemmatizer from `pattern` to extract UTF8-encoded tokens in
+    their base form=lemma, e.g. "are, is, being" -> "be" etc.
+    This is a smarter version of stemming, taking word context into account.
+
+    Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
+
+    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
+    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
+
+    >>> lemmatize('The study ranks high.')
+    ['study/NN', 'rank/VB', 'high/JJ']
+
+    >>> lemmatize('The ranks study hard.')
+    ['rank/NN', 'study/VB', 'hard/RB']
+
+    """
+    if not has_pattern():
+        raise ImportError("Pattern library is not installed; it is needed "
+                          "in order to use the lemmatize function")
+    from pattern.en import parse
+
+    if light:
+        import warnings
+        warnings.warn("The light flag is no longer supported by pattern.")
+
+    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
+    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
+    # FIXME this throws away all fancy parsing cues, including sentence structure,
+    # abbreviations etc.
+    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
+
+    parsed = parse(content, lemmata=True, collapse=False)
+    result = []
+    for sentence in parsed:
+        for token, tag, _, _, lemma in sentence:
+            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
+                if allowed_tags.match(tag):
+                    lemma += "/" + tag[:2]
+                    result.append(lemma.encode('utf8'))
+    return result


def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
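With the lazy import above, a missing pattern package now surfaces as an ImportError raised inside lemmatize(), instead of the function simply not being defined on the module. A small illustrative sketch of handling that failure mode (the tokenize fallback is an assumption, not part of this commit):

    from gensim import utils

    text = 'Hello World! How is it going?!'
    try:
        print(utils.lemmatize(text))
    except ImportError:
        # lemmatize() now raises ImportError when 'pattern' is missing; before this
        # commit the name was only defined if the import succeeded at module load.
        print(list(utils.tokenize(text, lower=True)))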
