Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix phrases load, for backward compatibility #1758

Merged
merged 8 commits into from
Dec 6, 2017
87 changes: 50 additions & 37 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,55 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer):
yield (word, None)


class Phrases(SentenceAnalyzer, interfaces.TransformationABC):
class PhrasesTransformation(interfaces.TransformationABC):

@classmethod
def load(cls, *args, **kwargs):
"""
Load a previously saved Phrases class. Handles backwards compatibility from
older Phrases versions which did not support pluggable scoring functions.
Otherwise, relies on utils.load
"""

model = super(PhrasesTransformation, cls).load(*args, **kwargs)
# update older models
# if no scoring parameter, use default scoring
if not hasattr(model, 'scoring'):
logger.info('older version of %s loaded without scoring function', cls.__name__)
logger.info('setting pluggable scoring method to original_scorer for compatibility')
model.scoring = original_scorer
# if there is a scoring parameter, and it's a text value, load the proper scoring function
if hasattr(model, 'scoring'):
if isinstance(model.scoring, six.string_types):
if model.scoring == 'default':
logger.info(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: we use 140 char limits for code, no need to split this call into several lines

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

argh travis failed, it was 120 not 140 @menshikh-iv, you joker ! ;-)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexgarel oh, sorry, my mistake (120 instead of 80, not 140)

'older version of %s loaded with "default" scoring parameter',
cls.__name__)
logger.info(
'setting scoring method to original_scorer pluggable scoring method ' +
'for compatibility')
model.scoring = original_scorer
elif model.scoring == 'npmi':
logger.info(
'older version of %s loaded with "npmi" scoring parameter',
cls.__name__)
logger.info(
'setting scoring method to npmi_scorer pluggable scoring method ' +
'for compatibility')
model.scoring = npmi_scorer
else:
raise ValueError(
'failed to load %s model with unknown scoring setting %s' %
(cls.__name__, model.scoring))
# if there is non common_terms attribute, initialize
if not hasattr(model, "common_terms"):
logger.info('older version of %s loaded without common_terms attribute', cls.__name__)
logger.info('setting common_terms to empty set')
model.common_terms = frozenset()
return model


class Phrases(SentenceAnalyzer, PhrasesTransformation):
"""
Detect phrases, based on collected collocation counts. Adjacent words that appear
together more frequently than expected are joined together with the `_` character.
Expand Down Expand Up @@ -461,41 +509,6 @@ def __getitem__(self, sentence):

return [utils.to_unicode(w) for w in new_s]

@classmethod
def load(cls, *args, **kwargs):
"""
Load a previously saved Phrases class. Handles backwards compatibility from
older Phrases versions which did not support pluggable scoring functions. Otherwise, relies on utils.load
"""

# for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string
try:
basestring
except NameError:
basestring = str

model = super(Phrases, cls).load(*args, **kwargs)
# update older models
# if no scoring parameter, use default scoring
if not hasattr(model, 'scoring'):
logger.info('older version of Phrases loaded without scoring function')
logger.info('setting pluggable scoring method to original_scorer for compatibility')
model.scoring = original_scorer
# if there is a scoring parameter, and it's a text value, load the proper scoring function
if hasattr(model, 'scoring'):
if isinstance(model.scoring, basestring):
if model.scoring == 'default':
logger.info('older version of Phrases loaded with "default" scoring parameter')
logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility')
model.scoring = original_scorer
elif model.scoring == 'npmi':
logger.info('older version of Phrases loaded with "npmi" scoring parameter')
logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility')
model.scoring = npmi_scorer
else:
raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring))
return model


# these two built-in scoring methods don't cast everything to float because the casting is done in the call
# to the scoring method in __getitem__ and export_phrases.
Expand Down Expand Up @@ -530,7 +543,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()):
yield components


class Phraser(SentenceAnalyzer, interfaces.TransformationABC):
class Phraser(SentenceAnalyzer, PhrasesTransformation):
"""
Minimal state & functionality to apply results of a Phrases model to tokens.

Expand Down
141 changes: 105 additions & 36 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,29 @@
"""


import contextlib
import logging
import unittest
import os
import sys
import shutil
import tempfile
import unittest

import six

from gensim import utils
from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus
from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser
from gensim.models.phrases import pseudocorpus, original_scorer
from gensim.test.utils import common_texts

if sys.version_info[0] >= 3:
unicode = str

@contextlib.contextmanager
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks useful, please move it to gensim.test.utils https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/utils.py

def temporary_file(name):
# note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
tmp = tempfile.mkdtemp()
try:
yield os.path.join(tmp, name)
finally:
shutil.rmtree(tmp, ignore_errors=True)


class TestUtils(unittest.TestCase):
Expand Down Expand Up @@ -230,7 +242,7 @@ def testEncoding(self):
self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
self.assertTrue(isinstance(transformed, unicode))
self.assertTrue(isinstance(transformed, six.text_type))


# scorer for testCustomScorer
Expand Down Expand Up @@ -335,15 +347,15 @@ def testPruning(self):
# endclass TestPhrasesModel


class TestPhrasesScoringPersistence(PhrasesData, unittest.TestCase):
class TestPhrasesPersistence(PhrasesData, unittest.TestCase):

def testSaveLoadCustomScorer(self):
""" saving and loading a Phrases object with a custom scorer """

try:
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
seen_scores = []
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
for phrase, score in bigram_loaded.export_phrases(test_sentences):
Expand All @@ -352,17 +364,13 @@ def testSaveLoadCustomScorer(self):
assert all(seen_scores) # all scores 1
assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system'

finally:
if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")

def testSaveLoad(self):
""" Saving and loading a Phrases object."""

try:
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=1)
bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
for phrase, score in bigram_loaded.export_phrases(test_sentences):
Expand All @@ -373,19 +381,15 @@ def testSaveLoad(self):
3.444 # score for human interface
])

finally:
if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
os.remove("test_phrases_testSaveLoad_temp_save.pkl")

def testSaveLoadStringScoring(self):
""" Saving and loading a Phrases object with a string scoring parameter.
This should ensure backwards compatibility with the previous version of Phrases"""

try:
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=1)
bigram.scoring = "default"
bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
for phrase, score in bigram_loaded.export_phrases(test_sentences):
Expand All @@ -396,19 +400,15 @@ def testSaveLoadStringScoring(self):
3.444 # score for human interface
])

finally:
if os.path.exists("test_phrases_testSaveLoadStringScoring_temp_save.pkl"):
os.remove("test_phrases_testSaveLoadStringScoring_temp_save.pkl")

def testSaveLoadNoScoring(self):
""" Saving and loading a Phrases object with no scoring parameter.
This should ensure backwards compatibility with old versions of Phrases"""

try:
with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=1)
del(bigram.scoring)
bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
seen_scores = set()
test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
for phrase, score in bigram_loaded.export_phrases(test_sentences):
Expand All @@ -419,9 +419,78 @@ def testSaveLoadNoScoring(self):
3.444 # score for human interface
])

finally:
if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
def testSaveLoadNoCommonTerms(self):
""" Saving and loading a Phrases objects without common_terms
This should ensure backwards compatibility with old versions of Phrases"""

with temporary_file("test.pkl") as fpath:
bigram = Phrases(self.sentences, min_count=1, threshold=1)
del(bigram.common_terms)
Copy link
Contributor

@menshikh-iv menshikh-iv Dec 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

better solution - checkout to the previous version, tran+save model and try to load old model here (in your current implementation, we possibly skip other bugs, that can be hidden due to the current bug)

FYI - del isn't a function, this is operator -> no needed brackets.

bigram.save(fpath)
bigram_loaded = Phrases.load(fpath)
self.assertEqual(bigram_loaded.common_terms, frozenset())
# can make a phraser, cf #1751
phraser = Phraser(bigram_loaded) # does not raise
phraser["some terms"] # does not raise


class TestPhraserPersistence(PhrasesData, unittest.TestCase):

def testSaveLoadCustomScorer(self):
"""Saving and loading a Phraser object with a custom scorer """

with temporary_file("test.pkl") as fpath:
bigram = Phraser(
Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
bigram.save(fpath)
bigram_loaded = Phraser.load(fpath)
# we do not much with scoring, just verify its the one expected
self.assertEqual(bigram_loaded.scoring, dumb_scorer)

def testSaveLoad(self):
""" Saving and loading a Phraser object."""
with temporary_file("test.pkl") as fpath:
bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
bigram.save(fpath)
bigram_loaded = Phraser.load(fpath)
self.assertEqual(
bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
['graph_minors', 'survey', 'human_interface', 'system'])

def testSaveLoadStringScoring(self):
""" Saving and loading a Phraser object with a string scoring parameter.
This should ensure backwards compatibility with the previous version of Phraser"""

with temporary_file("test.pkl") as fpath:
bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
bigram.scoring = "default"
bigram.save(fpath)
bigram_loaded = Phraser.load(fpath)
# we do not much with scoring, just verify its the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)

def testSaveLoadNoScoring(self):
""" Saving and loading a Phraser object with no scoring parameter.
This should ensure backwards compatibility with old versions of Phraser"""

with temporary_file("test.pkl") as fpath:
bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
del(bigram.scoring)
bigram.save(fpath)
bigram_loaded = Phraser.load(fpath)
# we do not much with scoring, just verify its the one expected
self.assertEqual(bigram_loaded.scoring, original_scorer)

def testSaveLoadNoCommonTerms(self):
""" Saving and loading a Phraser objects without common_terms
This should ensure backwards compatibility with old versions of Phraser"""

with temporary_file("test.pkl") as fpath:
bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
del(bigram.common_terms)
bigram.save(fpath)
bigram_loaded = Phraser.load(fpath)
self.assertEqual(bigram_loaded.common_terms, frozenset())


class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase):
Expand Down Expand Up @@ -486,7 +555,7 @@ def testEncoding(self):
self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
self.assertTrue(isinstance(transformed, unicode))
self.assertTrue(isinstance(transformed, six.text_type))

def testMultipleBigramsSingleEntry(self):
""" a single entry should produce multiple bigrams. """
Expand Down Expand Up @@ -593,7 +662,7 @@ def testEncoding(self):
self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
self.assertTrue(isinstance(transformed, unicode))
self.assertTrue(isinstance(transformed, six.text_type))


if __name__ == '__main__':
Expand Down