Add multiple scoring methods for Phrases. Partial fix #1363 (#1464)
* initial commit of alternate scoring

now with a scoring parameter to initialize a Phrases object; defaults to
the Mikolov paper scoring, but also switchable to 'npmi', normalized
pointwise mutual information (see the usage sketch after this list)

moved the scoring calculation into a function; scoring functions are now
top-level functions in models.Phrases, called when calculating
scores in models.Phrases.export_phrases

* all existing tests now pass

fixed some bugs with the pluggable scoring that were causing tests to
fail.

* added testScoringOriginal to test default scoring

* better name for test for default scorer

* moved scoring parameter checking logic to initialization

* fixed bug in export_phrases scoring function creation

* test for npmi scoring

* typo in phrases docstring

* copy scoring setting to Phraser

* fixed Travis CI errors

* no need to specify long vs. int
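
A minimal usage sketch of the new parameter, assuming `sentences` is an iterable of tokenized documents already in hand; the parameter values here are illustrative:

from gensim.models import Phrases

# default scoring (Mikolov et al.); the threshold must be positive
bigram_default = Phrases(sentences, min_count=5, threshold=10.0)

# normalized PMI scoring; scores (and the threshold) range from -1 to 1
bigram_npmi = Phrases(sentences, min_count=5, threshold=0.5, scoring='npmi')

# inspect scored phrases, e.g. to settle on a threshold
for phrase, score in bigram_npmi.export_phrases(sentences):
    print(phrase, score)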
Michael W. Sherman authored and menshikh-iv committed Jul 20, 2017
1 parent f3bf792 commit 5f54b60
Showing 2 changed files with 111 additions and 24 deletions.
103 changes: 80 additions & 23 deletions gensim/models/phrases.py
@@ -64,6 +64,8 @@
import warnings
from collections import defaultdict
import itertools as it
from functools import partial
from math import log

from six import iteritems, string_types, next

@@ -106,7 +108,8 @@ class Phrases(interfaces.TransformationABC):
"""
def __init__(self, sentences=None, min_count=5, threshold=10.0,
max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
scoring='default'):
"""
Initialize the model from an iterable of `sentences`. Each sentence must be
a list of words (unicode strings) that will be used for training.
@@ -120,10 +123,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`min_count` ignore all words and bigrams with total collected count lower
than this.
`threshold` represents a threshold for forming the phrases (higher means
fewer phrases). A phrase of words `a` and `b` is accepted if
`(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
total vocabulary size.
`threshold` represents a score threshold for forming the phrases (higher means
fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
phrase is greater than `threshold`. See the `scoring` setting.
`max_vocab_size` is the maximum size of the vocabulary. Used to control
pruning of less common words, to keep memory under control. The default
@@ -133,12 +135,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`delimiter` is the glue character used to join collocation tokens, and
should be a byte string (e.g. b'_').
`scoring` specifies how potential phrases are scored for comparison to the `threshold`
setting. Two settings are available:
'default': from "Efficient Estimation of Word Representations in Vector Space" by
Mikolov et al.:
(count(worda followed by wordb) - min_count) * N /
(count(worda) * count(wordb)) > threshold, where N is the total vocabulary size.
'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
Information in Collocation Extraction" by Gerlof Bouma:
ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
-ln(prop(worda followed by wordb)),
where prop(n) is the count of n divided by the count of everything in the entire corpus.
'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default.
"""
if min_count <= 0:
raise ValueError("min_count should be at least 1")

if threshold <= 0:
raise ValueError("threshold should be positive")
if threshold <= 0 and scoring == 'default':
raise ValueError("threshold should be positive for default scoring")
if scoring == 'npmi' and (threshold < -1 or threshold > 1):
raise ValueError("threshold should be between -1 and 1 for npmi scoring")

if not (scoring == 'default' or scoring == 'npmi'):
raise ValueError('unknown scoring function "' + scoring + '" specified')

self.min_count = min_count
self.threshold = threshold
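
To make the two scoring formulas above concrete, a toy computation with invented counts (none of these numbers come from a real corpus):

from math import log

count_a, count_b, count_ab = 100.0, 80.0, 40.0  # toy unigram and bigram counts
len_vocab, min_count = 1000.0, 5.0
corpus_word_count = 10000.0

# 'default' (Mikolov): (count(a followed by b) - min_count) * N / (count(a) * count(b))
default_score = (count_ab - min_count) / count_a / count_b * len_vocab  # 4.375

# 'npmi': ln(prop(ab) / (prop(a) * prop(b))) / -ln(prop(ab)),
# where prop(x) = count(x) / corpus_word_count
pa, pb, pab = count_a / corpus_word_count, count_b / corpus_word_count, count_ab / corpus_word_count
npmi_score = log(pab / (pa * pb)) / -log(pab)  # ~0.709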
@@ -147,6 +168,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
self.min_reduce = 1 # ignore any tokens with count smaller than this
self.delimiter = delimiter
self.progress_per = progress_per
self.scoring = scoring
self.corpus_word_count = 0

if sentences is not None:
self.add_vocab(sentences)
@@ -178,14 +201,15 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
if sentence: # add last word skipped by previous loop
word = sentence[-1]
vocab[word] += 1
total_words += 1

if len(vocab) > max_vocab_size:
utils.prune_vocab(vocab, min_reduce)
min_reduce += 1

logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
(len(vocab), total_words, sentence_no + 1))
return min_reduce, vocab
return min_reduce, vocab, total_words

def add_vocab(self, sentences):
"""
@@ -197,8 +221,10 @@ def add_vocab(self, sentences):
# directly, but gives the new sentences a fighting chance to collect
# sufficient counts, before being pruned out by the (large) accummulated
# counts collected in previous learn_vocab runs.
min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
min_reduce, vocab, total_words = \
self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)

self.corpus_word_count += total_words
if len(self.vocab) > 0:
logger.info("merging %i counts into %s", len(vocab), self)
self.min_reduce = max(self.min_reduce, min_reduce)
@@ -226,31 +252,47 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
then you can debug the threshold with generated tsv
"""

vocab = self.vocab
threshold = self.threshold
delimiter = self.delimiter # delimiter used for lookup
min_count = self.min_count
scoring = self.scoring
corpus_word_count = self.corpus_word_count

if scoring == 'default':
scoring_function = \
partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
elif scoring == 'npmi':
scoring_function = \
partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
# no else here to catch unknown scoring function, check is done in Phrases.__init__

for sentence in sentences:
s = [utils.any2utf8(w) for w in sentence]
last_bigram = False
vocab = self.vocab
threshold = self.threshold
delimiter = self.delimiter # delimiter used for lookup
min_count = self.min_count

for word_a, word_b in zip(s, s[1:]):
if word_a in vocab and word_b in vocab:
# last bigram check was moved here to save a few CPU cycles
if word_a in vocab and word_b in vocab and not last_bigram:
bigram_word = delimiter.join((word_a, word_b))
if bigram_word in vocab and not last_bigram:
pa = float(vocab[word_a])
pb = float(vocab[word_b])
pab = float(vocab[bigram_word])
score = (pab - min_count) / pa / pb * len(vocab)
if bigram_word in vocab:
count_a = float(vocab[word_a])
count_b = float(vocab[word_b])
count_ab = float(vocab[bigram_word])
score = scoring_function(count_a, count_b, count_ab)
# logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
# bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
if score > threshold:
# added mincount check because if the scorer doesn't contain min_count
# it would not be enforced otherwise
if score > threshold and count_ab >= min_count:
if as_tuples:
yield ((word_a, word_b), score)
else:
yield (out_delimiter.join((word_a, word_b)), score)
last_bigram = True
continue
last_bigram = False
last_bigram = False

def __getitem__(self, sentence):
"""
@@ -311,6 +353,20 @@ def __getitem__(self, sentence):

return [utils.to_unicode(w) for w in new_s]

# calculation of score based on original mikolov word2vec paper
# len_vocab and min_count set so functools.partial works
@staticmethod
def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

# normalized PMI, requires corpus size
@staticmethod
def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
pa = worda_count / corpus_word_count
pb = wordb_count / corpus_word_count
pab = bigram_count / corpus_word_count
return log(pab / (pa * pb)) / -log(pab)


def pseudocorpus(source_vocab, sep):
"""Feeds source_vocab's compound keys back to it, to discover phrases"""
@@ -329,15 +385,16 @@ class Phraser(interfaces.TransformationABC):
After the one-time initialization, a Phraser will be much smaller and
somewhat faster than using the full Phrases model.
Reflects the results of the source model's `min_count` and `threshold`
settings. (You can tamper with those & create a new Phraser to try
Reflects the results of the source model's `min_count`, `threshold`, and
`scoring` settings. (You can tamper with those & create a new Phraser to try
other values.)
"""
def __init__(self, phrases_model):
self.threshold = phrases_model.threshold
self.min_count = phrases_model.min_count
self.delimiter = phrases_model.delimiter
self.scoring = phrases_model.scoring
self.phrasegrams = {}
corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter)
logger.info('source_vocab length %i', len(phrases_model.vocab))
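Since `scoring` is now copied over, a Phraser built from a trained model stays consistent with its source; a sketch, assuming the `bigram_npmi` model from the earlier example:

from gensim.models.phrases import Phraser

phraser = Phraser(bigram_npmi)  # copies min_count, threshold, delimiter and scoring
print(phraser[['graph', 'minors', 'survey']])  # e.g. ['graph_minors', 'survey']
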
32 changes: 31 additions & 1 deletion gensim/test/test_phrases.py
@@ -138,7 +138,7 @@ def testExportPhrases(self):
b'human interface'
])

def test_multiple_bigrams_single_entry(self):
def testMultipleBigramsSingleEntry(self):
""" a single entry should produce multiple bigrams. """
bigram = Phrases(sentences, min_count=1, threshold=1)

@@ -153,6 +153,36 @@ def test_multiple_bigrams_single_entry(self):
b'human interface'
])

def testScoringDefault(self):
""" test the default scoring, from the mikolov word2vec paper """
bigram = Phrases(sentences, min_count=1, threshold=1)

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])

def testScoringNpmi(self):
""" test normalized pointwise mutual information scoring """
bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
.882, # score for graph minors
.714 # score for human interface
])

def testBadParameters(self):
"""Test the phrases module with bad parameters."""
# should fail with something less than or equal to 0
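As the export_phrases docstring suggests, dumping all scores to a tsv is a practical way to settle on a threshold; a sketch with the npmi scorer (the file name and threshold are illustrative, and `sentences` is assumed as before):

from gensim.models import Phrases

bigram = Phrases(sentences, min_count=1, threshold=-1.0, scoring='npmi')  # -1 lets nearly everything through
with open('phrase_scores.tsv', 'w') as fout:
    for phrase, score in bigram.export_phrases(sentences):
        fout.write('%s\t%.3f\n' % (phrase.decode('utf8'), score))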
