models.Phrases multiple scoring methods (#1363) #1464
@@ -64,6 +64,8 @@
 import warnings
 from collections import defaultdict
 import itertools as it
+from functools import partial
+from math import log

 from six import iteritems, string_types, next

@@ -106,7 +108,8 @@ class Phrases(interfaces.TransformationABC):

     """
     def __init__(self, sentences=None, min_count=5, threshold=10.0,
-                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
+                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+                 scoring='default'):
         """
         Initialize the model from an iterable of `sentences`. Each sentence must be
         a list of words (unicode strings) that will be used for training.

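Note (annotation, not part of the diff): the constructor keeps its old defaults, so existing calls are unaffected. A minimal usage sketch of the new parameter, on an invented toy corpus:

```python
from gensim.models.phrases import Phrases

# invented toy corpus; real input should be an iterable of token lists
sentences = [
    ['new', 'york', 'is', 'big'],
    ['new', 'york', 'never', 'sleeps'],
]

# old behaviour, unchanged: scoring defaults to 'default'
bigram = Phrases(sentences, min_count=1, threshold=1)

# new: npmi scores live in [-1, 1], so pick the threshold accordingly
bigram_npmi = Phrases(sentences, min_count=1, threshold=0.5, scoring='npmi')
```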
@@ -120,10 +123,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         `min_count` ignores all words and bigrams with total collected count lower
         than this.

-        `threshold` represents a threshold for forming the phrases (higher means
-        fewer phrases). A phrase of words `a` and `b` is accepted if
-        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
-        total vocabulary size.
+        `threshold` represents a score threshold for forming the phrases (higher means
+        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score
+        of the phrase is greater than `threshold`. See the `scoring` setting.

         `max_vocab_size` is the maximum size of the vocabulary. Used to control
         pruning of less common words, to keep memory under control. The default

@@ -133,12 +135,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         `delimiter` is the glue character used to join collocation tokens, and
         should be a byte string (e.g. b'_').

+        `scoring` specifies how potential phrases are scored for comparison to the
+        `threshold` setting. Two settings are available:
+        'default': from "Efficient Estimation of Word Representations in Vector Space"
+            by Mikolov et al.:
+            (count(worda followed by wordb) - min_count) * N /
+            (count(worda) * count(wordb)) > threshold, where N is the total vocabulary size.
+        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise)
+            Mutual Information in Collocation Extraction" by Gerlof Bouma:
+            ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
+            -ln(prop(worda followed by wordb)),
+            where prop(n) is the count of n divided by the count of everything in the
+            entire corpus.
+        'npmi' is more robust when dealing with common words that form part of common
+        bigrams, and ranges from -1 to 1, but is slower to calculate than the default.

         """
         if min_count <= 0:
             raise ValueError("min_count should be at least 1")

-        if threshold <= 0:
-            raise ValueError("threshold should be positive")
+        if threshold <= 0 and scoring == 'default':
+            raise ValueError("threshold should be positive for default scoring")
+        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
+            raise ValueError("threshold should be between -1 and 1 for npmi scoring")
+
+        if not (scoring == 'default' or scoring == 'npmi'):
+            raise ValueError('unknown scoring function "' + scoring + '" specified')

         self.min_count = min_count
         self.threshold = threshold

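Note (annotation, not part of the diff): to make the two formulas concrete, this sketch reimplements both scorers exactly as they are added later in this diff and evaluates them on invented counts:

```python
from math import log

# the two scorers as defined in this PR (static methods on Phrases)
def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb)) / -log(pab)

# invented counts: 'new' 100x, 'york' 50x, the bigram 40x,
# a vocabulary of 10000 entries, 100000 tokens in the corpus
print(original_scorer(100.0, 50.0, 40.0, len_vocab=10000.0, min_count=5.0))  # 70.0
print(npmi_scorer(100.0, 50.0, 40.0, corpus_word_count=100000.0))            # ~0.854
```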
@@ -147,6 +168,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.min_reduce = 1  # ignore any tokens with count smaller than this
         self.delimiter = delimiter
         self.progress_per = progress_per
+        self.scoring = scoring
+        self.corpus_word_count = 0

         if sentences is not None:
             self.add_vocab(sentences)

@@ -178,14 +201,15 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
             if sentence:  # add last word skipped by previous loop
                 word = sentence[-1]
                 vocab[word] += 1
+                total_words += 1

             if len(vocab) > max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce)
                 min_reduce += 1

         logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                     (len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
+        return min_reduce, vocab, total_words

     def add_vocab(self, sentences):
         """

@@ -197,8 +221,10 @@ def add_vocab(self, sentences):
         # directly, but gives the new sentences a fighting chance to collect
         # sufficient counts, before being pruned out by the (large) accumulated
         # counts collected in previous learn_vocab runs.
-        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
+        min_reduce, vocab, total_words = \
+            self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)

Review comment: Code style: bad indentation (unneeded line break).

Reply: What's the number of columns we cap at? I thought it was 100, which I believe this exceeded.

Reply: There's no hard limit; if the line becomes hard to read, we break it. If the break would be even harder to read than the original (for semantic/visual/clarity reasons), we don't break it. Line continuations are indented at one extra level (4 spaces to the right).

+        self.corpus_word_count += total_words
         if len(self.vocab) > 0:
             logger.info("merging %i counts into %s", len(vocab), self)
             self.min_reduce = max(self.min_reduce, min_reduce)

@@ -226,31 +252,47 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):

         then you can debug the threshold with generated tsv
         """
+        vocab = self.vocab
+        threshold = self.threshold
+        delimiter = self.delimiter  # delimiter used for lookup
+        min_count = self.min_count
+        scoring = self.scoring
+        corpus_word_count = self.corpus_word_count
+
+        if self.scoring == 'default':
+            scoring_function = \
+                partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))

Review comment: Indentation (unneeded line break).

+        elif self.scoring == 'npmi':
+            scoring_function = \
+                partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
+        # no else here to catch unknown scoring function, check is done in Phrases.__init__

         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]
             last_bigram = False
-            vocab = self.vocab
-            threshold = self.threshold
-            delimiter = self.delimiter  # delimiter used for lookup
-            min_count = self.min_count

             for word_a, word_b in zip(s, s[1:]):
-                if word_a in vocab and word_b in vocab:
+                # last bigram check was moved here to save a few CPU cycles
+                if word_a in vocab and word_b in vocab and not last_bigram:
                     bigram_word = delimiter.join((word_a, word_b))
-                    if bigram_word in vocab and not last_bigram:
-                        pa = float(vocab[word_a])
-                        pb = float(vocab[word_b])
-                        pab = float(vocab[bigram_word])
-                        score = (pab - min_count) / pa / pb * len(vocab)
+                    if bigram_word in vocab:
+                        count_a = float(vocab[word_a])
+                        count_b = float(vocab[word_b])
+                        count_ab = float(vocab[bigram_word])
+                        score = scoring_function(count_a, count_b, count_ab)

Review comment: A pluggable scoring function would have to be called with all corpus constants and Phrases settings used in any scoring function. Right now that would look like:

Reply: I think that's still preferable. This string-passing seems inflexible. We could support some common use-cases by passing a string, but the code underneath should simply translate that string into a callable. In other words, we could support both string and callable as param. If string, gensim converts that to a known callable (for easy-to-use common cases).

Reply: I will make this change, hopefully before the end of the week, and make it part of a PR.

-                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
-                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                        if score > threshold:
+                        # added mincount check because if the scorer doesn't contain min_count
+                        # it would not be enforced otherwise
+                        if score > threshold and count_ab >= min_count:
                             if as_tuples:
                                 yield ((word_a, word_b), score)
                             else:
                                 yield (out_delimiter.join((word_a, word_b)), score)
                             last_bigram = True
                             continue
-                    last_bigram = False
+                last_bigram = False

Review comment: Is this on purpose? What is this change about?

Reply: Yes, this is on purpose. Matches up to line 277. If that test fails we have to set `last_bigram` back to False.

Reply: Aha, so this is a bug fix at the same time. Thanks! CC @menshikh-iv

     def __getitem__(self, sentence):
         """
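Note (annotation, not part of the diff): the pluggable-scorer idea discussed in the thread above is not implemented here, but any callable matching the three-argument `scoring_function(count_a, count_b, count_ab)` call site would slot in. A hypothetical plain-PMI scorer, with the corpus constant bound via `functools.partial` like the built-ins:

```python
from functools import partial
from math import log

# hypothetical custom scorer: unnormalized PMI, ignoring min_count
def pmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=1.0):
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb))

# bind the corpus constant, leaving the three per-bigram counts free
scoring_function = partial(pmi_scorer, corpus_word_count=100000.0)
print(scoring_function(100.0, 50.0, 40.0))  # log(800) ~ 6.68
```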
@@ -311,6 +353,20 @@ def __getitem__(self, sentence):

         return [utils.to_unicode(w) for w in new_s]

+    # calculation of score based on original Mikolov word2vec paper
+    # len_vocab and min_count set to defaults so functools.partial works
+    @staticmethod
+    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
+        return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

Review comment: Beware of integer divisions - this code is brittle.

Reply: I didn't fix this in PR #1573. Rather, I just cast everything before calling the scoring method in Phrases and Phraser. I think that's the better place to do the casting, since it then fixes the problem for all custom scorers as well. Of course, I can do the casting in the scoring methods as well. Let me know if you still think I need it here and in `npmi_scorer`.

+    # normalized PMI, requires corpus size
+    @staticmethod
+    def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
+        pa = worda_count / corpus_word_count

Review comment: Is this meant to be an integer or float division? (ditto below)

+        pb = wordb_count / corpus_word_count
+        pab = bigram_count / corpus_word_count
+        return log(pab / (pa * pb)) / -log(pab)

 def pseudocorpus(source_vocab, sep):
     """Feeds source_vocab's compound keys back to it, to discover phrases"""
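Note (annotation, not part of the diff): the division concern is real under Python 2, which this module still targets (hence `six`): `int / int` truncates, so raw integer counts would drive `pab` to zero and make `log(pab)` raise. Casting at the call site, as `export_phrases` now does, sidesteps this for all scorers:

```python
# Python 2 semantics: integer division truncates toward zero
print(40 / 100000)    # 0 under Python 2 (0.0004 under Python 3) -> log(0.0) raises ValueError
print(40.0 / 100000)  # 0.0004 under both, as intended

# the fix used in export_phrases above: cast counts once at the call site
count_a, count_b, count_ab = float(100), float(50), float(40)
```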
@@ -329,15 +385,16 @@ class Phraser(interfaces.TransformationABC):
     After the one-time initialization, a Phraser will be much smaller and
     somewhat faster than using the full Phrases model.

-    Reflects the results of the source model's `min_count` and `threshold`
-    settings. (You can tamper with those & create a new Phraser to try
+    Reflects the results of the source model's `min_count`, `threshold`, and
+    `scoring` settings. (You can tamper with those & create a new Phraser to try
     other values.)

     """
     def __init__(self, phrases_model):
         self.threshold = phrases_model.threshold
         self.min_count = phrases_model.min_count
         self.delimiter = phrases_model.delimiter
+        self.scoring = phrases_model.scoring
         self.phrasegrams = {}
         corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter)
         logger.info('source_vocab length %i', len(phrases_model.vocab))
Review comment: Capitalize first word in sentence, end in full stop.
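Note (annotation, not part of the diff): an end-to-end sketch, on an invented toy corpus, of the new setting flowing from Phrases into Phraser:

```python
from gensim.models.phrases import Phrases, Phraser

sentences = [
    ['new', 'york', 'is', 'big'],
    ['new', 'york', 'never', 'sleeps'],
]

phrases = Phrases(sentences, min_count=1, threshold=0.5, scoring='npmi')
phraser = Phraser(phrases)  # now also carries over `scoring`

# bigrams scoring above the threshold get glued with the delimiter
print(phraser[['new', 'york', 'city']])  # e.g. ['new_york', 'city']
```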