From 523ab11c27e7fc070255de040db4580872e1eb7f Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Mon, 4 Dec 2017 15:24:03 +0100 Subject: [PATCH 1/8] backward compatibility for Phrases models without common_terms --- gensim/models/phrases.py | 5 +++++ gensim/test/test_phrases.py | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 2f2b2c4b9a..070eddc42a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -494,6 +494,11 @@ def load(cls, *args, **kwargs): model.scoring = npmi_scorer else: raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring)) + # if there is non common_terms attribute, inizialize + if not hasattr(model, "common_terms"): + logger.info('older version of Phrases loaded without common_terms attribute') + logger.info('setting common_terms to empty set') + model.common_terms = frozenset() return model diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index e3a69760ca..fb9e0deb23 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -335,7 +335,7 @@ def testPruning(self): # endclass TestPhrasesModel -class TestPhrasesScoringPersistence(PhrasesData, unittest.TestCase): +class TestPhrasesPersistence(PhrasesData, unittest.TestCase): def testSaveLoadCustomScorer(self): """ saving and loading a Phrases object with a custom scorer """ @@ -423,6 +423,23 @@ def testSaveLoadNoScoring(self): if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + def testSaveLoadNoCommonTerms(self): + """ Saving and loading a Phrases objects without common_terms + This should ensure backwards compatibility with old versions of Phrases""" + + try: + bigram = Phrases(self.sentences, min_count=1, threshold=1) + del(bigram.common_terms) + bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + self.assertEqual(bigram_loaded.common_terms, frozenset()) + # can make a phraser, cf #1751 + phraser = Phraser(bigram_loaded) # does not raise + phraser["some terms"] # does not raise + finally: + if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): + os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): """ Test Phraser models.""" From aecc95daa134d3175fbf0892ba64d51827283ae1 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Mon, 4 Dec 2017 15:56:35 +0100 Subject: [PATCH 2/8] Phraser also needs compatible load for versions without scoring or common_terms --- gensim/models/phrases.py | 98 +++++++++++++++++++++---------------- gensim/test/test_phrases.py | 77 ++++++++++++++++++++++++++++- 2 files changed, 132 insertions(+), 43 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 070eddc42a..a4c8d34549 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -178,7 +178,61 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer): yield (word, None) -class Phrases(SentenceAnalyzer, interfaces.TransformationABC): +class PhrasesTransformation(interfaces.TransformationABC): + + @classmethod + def load(cls, *args, **kwargs): + """ + Load a previously saved Phrases class. Handles backwards compatibility from + older Phrases versions which did not support pluggable scoring functions. + Otherwise, relies on utils.load + """ + + # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string + try: + basestring + except NameError: + basestring = str + + model = super(PhrasesTransformation, cls).load(*args, **kwargs) + # update older models + # if no scoring parameter, use default scoring + if not hasattr(model, 'scoring'): + logger.info('older version of %s loaded without scoring function', cls.__name__) + logger.info('setting pluggable scoring method to original_scorer for compatibility') + model.scoring = original_scorer + # if there is a scoring parameter, and it's a text value, load the proper scoring function + if hasattr(model, 'scoring'): + if isinstance(model.scoring, basestring): + if model.scoring == 'default': + logger.info( + 'older version of %s loaded with "default" scoring parameter', + cls.__name__) + logger.info( + 'setting scoring method to original_scorer pluggable scoring method ' + + 'for compatibility') + model.scoring = original_scorer + elif model.scoring == 'npmi': + logger.info( + 'older version of %s loaded with "npmi" scoring parameter', + cls.__name__) + logger.info( + 'setting scoring method to npmi_scorer pluggable scoring method ' + + 'for compatibility') + model.scoring = npmi_scorer + else: + raise ValueError( + 'failed to load %s model with unknown scoring setting %s' % + (cls.__name__, model.scoring)) + # if there is non common_terms attribute, inizialize + if not hasattr(model, "common_terms"): + logger.info('older version of %s loaded without common_terms attribute', cls.__name__) + logger.info('setting common_terms to empty set') + model.common_terms = frozenset() + return model + + +class Phrases(SentenceAnalyzer, PhrasesTransformation): """ Detect phrases, based on collected collocation counts. Adjacent words that appear together more frequently than expected are joined together with the `_` character. @@ -461,46 +515,6 @@ def __getitem__(self, sentence): return [utils.to_unicode(w) for w in new_s] - @classmethod - def load(cls, *args, **kwargs): - """ - Load a previously saved Phrases class. Handles backwards compatibility from - older Phrases versions which did not support pluggable scoring functions. Otherwise, relies on utils.load - """ - - # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string - try: - basestring - except NameError: - basestring = str - - model = super(Phrases, cls).load(*args, **kwargs) - # update older models - # if no scoring parameter, use default scoring - if not hasattr(model, 'scoring'): - logger.info('older version of Phrases loaded without scoring function') - logger.info('setting pluggable scoring method to original_scorer for compatibility') - model.scoring = original_scorer - # if there is a scoring parameter, and it's a text value, load the proper scoring function - if hasattr(model, 'scoring'): - if isinstance(model.scoring, basestring): - if model.scoring == 'default': - logger.info('older version of Phrases loaded with "default" scoring parameter') - logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility') - model.scoring = original_scorer - elif model.scoring == 'npmi': - logger.info('older version of Phrases loaded with "npmi" scoring parameter') - logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility') - model.scoring = npmi_scorer - else: - raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring)) - # if there is non common_terms attribute, inizialize - if not hasattr(model, "common_terms"): - logger.info('older version of Phrases loaded without common_terms attribute') - logger.info('setting common_terms to empty set') - model.common_terms = frozenset() - return model - # these two built-in scoring methods don't cast everything to float because the casting is done in the call # to the scoring method in __getitem__ and export_phrases. @@ -535,7 +549,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()): yield components -class Phraser(SentenceAnalyzer, interfaces.TransformationABC): +class Phraser(SentenceAnalyzer, PhrasesTransformation): """ Minimal state & functionality to apply results of a Phrases model to tokens. diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index fb9e0deb23..b9d3312e40 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -14,7 +14,8 @@ import sys from gensim import utils -from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus +from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser +from gensim.models.phrases import pseudocorpus, original_scorer from gensim.test.utils import common_texts if sys.version_info[0] >= 3: @@ -441,6 +442,80 @@ def testSaveLoadNoCommonTerms(self): os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") +class TestPhraserPersistence(PhrasesData, unittest.TestCase): + + def testSaveLoadCustomScorer(self): + """Saving and loading a Phraser object with a custom scorer """ + + try: + bigram = Phraser( + Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)) + bigram.save("test_Phraser_testSaveLoadCustomScorer_temp_save.pkl") + bigram_loaded = Phraser.load("test_Phraser_testSaveLoadCustomScorer_temp_save.pkl") + # we do not much with scoring, just verify its the one expected + self.assertEqual(bigram_loaded.scoring, dumb_scorer) + finally: + if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): + os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + + def testSaveLoad(self): + """ Saving and loading a Phraser object.""" + try: + bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) + bigram.save("test_Phraser_testSaveLoad_temp_save.pkl") + bigram_loaded = Phraser.load("test_Phraser_testSaveLoad_temp_save.pkl") + self.assertEqual( + bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']], + ['graph_minors', 'survey', 'human_interface', 'system']) + finally: + if os.path.exists("test_Phraser_testSaveLoad_temp_save.pkl"): + os.remove("test_Phraser_testSaveLoad_temp_save.pkl") + + def testSaveLoadStringScoring(self): + """ Saving and loading a Phraser object with a string scoring parameter. + This should ensure backwards compatibility with the previous version of Phraser""" + + try: + bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) + bigram.scoring = "default" + bigram.save("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") + bigram_loaded = Phraser.load("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") + # we do not much with scoring, just verify its the one expected + self.assertEqual(bigram_loaded.scoring, original_scorer) + finally: + if os.path.exists("test_Phraser_testSaveLoadStringScoring_temp_save.pkl"): + os.remove("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") + + def testSaveLoadNoScoring(self): + """ Saving and loading a Phraser object with no scoring parameter. + This should ensure backwards compatibility with old versions of Phraser""" + + try: + bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) + del(bigram.scoring) + bigram.save("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + bigram_loaded = Phraser.load("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + # we do not much with scoring, just verify its the one expected + self.assertEqual(bigram_loaded.scoring, original_scorer) + finally: + if os.path.exists("test_Phraser_testSaveLoadNoScoring_temp_save.pkl"): + os.remove("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + + def testSaveLoadNoCommonTerms(self): + """ Saving and loading a Phraser objects without common_terms + This should ensure backwards compatibility with old versions of Phraser""" + + try: + bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) + del(bigram.common_terms) + bigram.save("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + bigram_loaded = Phraser.load("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + self.assertEqual(bigram_loaded.common_terms, frozenset()) + finally: + if os.path.exists("test_Phraser_testSaveLoadNoScoring_temp_save.pkl"): + os.remove("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + + class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): """ Test Phraser models.""" From 6297bf4572dc0cbbd9594358dbc930490fbff8d3 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Mon, 4 Dec 2017 16:55:10 +0100 Subject: [PATCH 3/8] minor: simplify persitence tests in test_phrases by using a context manager for temporary file management --- gensim/test/test_phrases.py | 109 +++++++++++++++--------------------- 1 file changed, 44 insertions(+), 65 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index b9d3312e40..1d16ee720c 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -8,10 +8,13 @@ """ +import contextlib import logging -import unittest import os import sys +import shutil +import tempfile +import unittest from gensim import utils from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser @@ -22,6 +25,16 @@ unicode = str +@contextlib.contextmanager +def temporary_file(name): + # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory + tmp = tempfile.mkdtemp() + try: + yield os.path.join(tmp, name) + finally: + shutil.rmtree(tmp, ignore_errors=True) + + class TestUtils(unittest.TestCase): def test_pseudocorpus_no_common_terms(self): @@ -341,10 +354,10 @@ class TestPhrasesPersistence(PhrasesData, unittest.TestCase): def testSaveLoadCustomScorer(self): """ saving and loading a Phrases object with a custom scorer """ - try: + with temporary_file("test.pkl") as fpath: bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer) - bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") - bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phrases.load(fpath) seen_scores = [] test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -353,17 +366,13 @@ def testSaveLoadCustomScorer(self): assert all(seen_scores) # all scores 1 assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system' - finally: - if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"): - os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") - def testSaveLoad(self): """ Saving and loading a Phrases object.""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phrases(self.sentences, min_count=1, threshold=1) - bigram.save("test_phrases_testSaveLoad_temp_save.pkl") - bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phrases.load(fpath) seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -374,19 +383,15 @@ def testSaveLoad(self): 3.444 # score for human interface ]) - finally: - if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"): - os.remove("test_phrases_testSaveLoad_temp_save.pkl") - def testSaveLoadStringScoring(self): """ Saving and loading a Phrases object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phrases""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phrases(self.sentences, min_count=1, threshold=1) bigram.scoring = "default" - bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl") - bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phrases.load(fpath) seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -397,19 +402,15 @@ def testSaveLoadStringScoring(self): 3.444 # score for human interface ]) - finally: - if os.path.exists("test_phrases_testSaveLoadStringScoring_temp_save.pkl"): - os.remove("test_phrases_testSaveLoadStringScoring_temp_save.pkl") - def testSaveLoadNoScoring(self): """ Saving and loading a Phrases object with no scoring parameter. This should ensure backwards compatibility with old versions of Phrases""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phrases(self.sentences, min_count=1, threshold=1) del(bigram.scoring) - bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl") - bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phrases.load(fpath) seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -420,26 +421,19 @@ def testSaveLoadNoScoring(self): 3.444 # score for human interface ]) - finally: - if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): - os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") - def testSaveLoadNoCommonTerms(self): """ Saving and loading a Phrases objects without common_terms This should ensure backwards compatibility with old versions of Phrases""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phrases(self.sentences, min_count=1, threshold=1) del(bigram.common_terms) - bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl") - bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phrases.load(fpath) self.assertEqual(bigram_loaded.common_terms, frozenset()) # can make a phraser, cf #1751 phraser = Phraser(bigram_loaded) # does not raise phraser["some terms"] # does not raise - finally: - if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): - os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") class TestPhraserPersistence(PhrasesData, unittest.TestCase): @@ -447,73 +441,58 @@ class TestPhraserPersistence(PhrasesData, unittest.TestCase): def testSaveLoadCustomScorer(self): """Saving and loading a Phraser object with a custom scorer """ - try: + with temporary_file("test.pkl") as fpath: bigram = Phraser( Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)) - bigram.save("test_Phraser_testSaveLoadCustomScorer_temp_save.pkl") - bigram_loaded = Phraser.load("test_Phraser_testSaveLoadCustomScorer_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phraser.load(fpath) # we do not much with scoring, just verify its the one expected self.assertEqual(bigram_loaded.scoring, dumb_scorer) - finally: - if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): - os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") def testSaveLoad(self): """ Saving and loading a Phraser object.""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) - bigram.save("test_Phraser_testSaveLoad_temp_save.pkl") - bigram_loaded = Phraser.load("test_Phraser_testSaveLoad_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phraser.load(fpath) self.assertEqual( bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']], ['graph_minors', 'survey', 'human_interface', 'system']) - finally: - if os.path.exists("test_Phraser_testSaveLoad_temp_save.pkl"): - os.remove("test_Phraser_testSaveLoad_temp_save.pkl") def testSaveLoadStringScoring(self): """ Saving and loading a Phraser object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phraser""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) bigram.scoring = "default" - bigram.save("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") - bigram_loaded = Phraser.load("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phraser.load(fpath) # we do not much with scoring, just verify its the one expected self.assertEqual(bigram_loaded.scoring, original_scorer) - finally: - if os.path.exists("test_Phraser_testSaveLoadStringScoring_temp_save.pkl"): - os.remove("test_Phraser_testSaveLoadStringScoring_temp_save.pkl") def testSaveLoadNoScoring(self): """ Saving and loading a Phraser object with no scoring parameter. This should ensure backwards compatibility with old versions of Phraser""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) del(bigram.scoring) - bigram.save("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") - bigram_loaded = Phraser.load("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phraser.load(fpath) # we do not much with scoring, just verify its the one expected self.assertEqual(bigram_loaded.scoring, original_scorer) - finally: - if os.path.exists("test_Phraser_testSaveLoadNoScoring_temp_save.pkl"): - os.remove("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") def testSaveLoadNoCommonTerms(self): """ Saving and loading a Phraser objects without common_terms This should ensure backwards compatibility with old versions of Phraser""" - try: + with temporary_file("test.pkl") as fpath: bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) del(bigram.common_terms) - bigram.save("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") - bigram_loaded = Phraser.load("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") + bigram.save(fpath) + bigram_loaded = Phraser.load(fpath) self.assertEqual(bigram_loaded.common_terms, frozenset()) - finally: - if os.path.exists("test_Phraser_testSaveLoadNoScoring_temp_save.pkl"): - os.remove("test_Phraser_testSaveLoadNoScoring_temp_save.pkl") class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): From f4fce12b7fe93f9601434c421f5da888c66a6058 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Mon, 4 Dec 2017 19:31:56 +0100 Subject: [PATCH 4/8] using six for python compatibility in phrases --- gensim/models/phrases.py | 10 ++-------- gensim/test/test_phrases.py | 12 +++++------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index a4c8d34549..188f9a645a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -188,12 +188,6 @@ def load(cls, *args, **kwargs): Otherwise, relies on utils.load """ - # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string - try: - basestring - except NameError: - basestring = str - model = super(PhrasesTransformation, cls).load(*args, **kwargs) # update older models # if no scoring parameter, use default scoring @@ -203,7 +197,7 @@ def load(cls, *args, **kwargs): model.scoring = original_scorer # if there is a scoring parameter, and it's a text value, load the proper scoring function if hasattr(model, 'scoring'): - if isinstance(model.scoring, basestring): + if isinstance(model.scoring, six.string_types): if model.scoring == 'default': logger.info( 'older version of %s loaded with "default" scoring parameter', @@ -224,7 +218,7 @@ def load(cls, *args, **kwargs): raise ValueError( 'failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring)) - # if there is non common_terms attribute, inizialize + # if there is non common_terms attribute, initialize if not hasattr(model, "common_terms"): logger.info('older version of %s loaded without common_terms attribute', cls.__name__) logger.info('setting common_terms to empty set') diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 1d16ee720c..e62b096e03 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -11,19 +11,17 @@ import contextlib import logging import os -import sys import shutil import tempfile import unittest +import six + from gensim import utils from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser from gensim.models.phrases import pseudocorpus, original_scorer from gensim.test.utils import common_texts -if sys.version_info[0] >= 3: - unicode = str - @contextlib.contextmanager def temporary_file(name): @@ -244,7 +242,7 @@ def testEncoding(self): self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) - self.assertTrue(isinstance(transformed, unicode)) + self.assertTrue(isinstance(transformed, six.text_type)) # scorer for testCustomScorer @@ -557,7 +555,7 @@ def testEncoding(self): self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) - self.assertTrue(isinstance(transformed, unicode)) + self.assertTrue(isinstance(transformed, six.text_type)) def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. """ @@ -664,7 +662,7 @@ def testEncoding(self): self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) - self.assertTrue(isinstance(transformed, unicode)) + self.assertTrue(isinstance(transformed, six.text_type)) if __name__ == '__main__': From 198fdf7e7d6bd999c223e70ad264e9acbedf7b0d Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 5 Dec 2017 16:28:44 +0100 Subject: [PATCH 5/8] better tests for phrases load backward compatibility (this also fix a bug in loading phrases model before scoring). Also moving temporary_file context manager in gensim.test.utils --- gensim/models/phrases.py | 37 +++--- .../test_data/phraser-no-common-terms.pkl | Bin 0 -> 561 bytes gensim/test/test_data/phraser-no-scoring.pkl | Bin 0 -> 506 bytes gensim/test/test_data/phraser-scoring-str.pkl | Bin 0 -> 534 bytes .../test_data/phrases-no-common-terms.pkl | Bin 0 -> 1348 bytes gensim/test/test_data/phrases-no-scoring.pkl | Bin 0 -> 1267 bytes gensim/test/test_data/phrases-scoring-str.pkl | Bin 0 -> 1321 bytes gensim/test/test_phrases.py | 111 ++++++------------ gensim/test/utils.py | 18 +++ 9 files changed, 73 insertions(+), 93 deletions(-) create mode 100644 gensim/test/test_data/phraser-no-common-terms.pkl create mode 100644 gensim/test/test_data/phraser-no-scoring.pkl create mode 100644 gensim/test/test_data/phraser-scoring-str.pkl create mode 100644 gensim/test/test_data/phrases-no-common-terms.pkl create mode 100644 gensim/test/test_data/phrases-no-scoring.pkl create mode 100644 gensim/test/test_data/phrases-scoring-str.pkl diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 188f9a645a..747d78e749 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -183,8 +183,8 @@ class PhrasesTransformation(interfaces.TransformationABC): @classmethod def load(cls, *args, **kwargs): """ - Load a previously saved Phrases class. Handles backwards compatibility from - older Phrases versions which did not support pluggable scoring functions. + Load a previously saved Phrases/Phraser class. Handles backwards compatibility from + older Phrases/Phraser versions which did not support pluggable scoring functions. Otherwise, relies on utils.load """ @@ -199,25 +199,15 @@ def load(cls, *args, **kwargs): if hasattr(model, 'scoring'): if isinstance(model.scoring, six.string_types): if model.scoring == 'default': - logger.info( - 'older version of %s loaded with "default" scoring parameter', - cls.__name__) - logger.info( - 'setting scoring method to original_scorer pluggable scoring method ' + - 'for compatibility') + logger.info('older version of %s loaded with "default" scoring parameter', cls.__name__) + logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility') model.scoring = original_scorer elif model.scoring == 'npmi': - logger.info( - 'older version of %s loaded with "npmi" scoring parameter', - cls.__name__) - logger.info( - 'setting scoring method to npmi_scorer pluggable scoring method ' + - 'for compatibility') + logger.info('older version of %s loaded with "npmi" scoring parameter', cls.__name__) + logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility') model.scoring = npmi_scorer else: - raise ValueError( - 'failed to load %s model with unknown scoring setting %s' % - (cls.__name__, model.scoring)) + raise ValueError('failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring)) # if there is non common_terms attribute, initialize if not hasattr(model, "common_terms"): logger.info('older version of %s loaded without common_terms attribute', cls.__name__) @@ -351,6 +341,19 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, if sentences is not None: self.add_vocab(sentences) + @classmethod + def load(cls, *args, **kwargs): + """ + Load a previously saved Phrases class. Handles backwards compatibility from + older Phrases versions which did not support pluggable scoring functions. + """ + model = super(Phrases, cls).load(*args, **kwargs) + if not hasattr(model, 'corpus_word_count'): + logger.info('older version of %s loaded without corpus_word_count', cls.__name__) + logger.info('Setting it to 0, do not use it in your scoring function.') + model.corpus_word_count = 0 + return model + def __str__(self): """Get short string representation of this phrase detector.""" return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( diff --git a/gensim/test/test_data/phraser-no-common-terms.pkl b/gensim/test/test_data/phraser-no-common-terms.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0254b29f18a5bce7078ad4d0656107b2d23d37c8 GIT binary patch literal 561 zcmZ`%$xg#C6l}}dlzrdJPRmku#IZLHy&#`*siKgY#8TpP?4(i;mEe*iAHr|(BRq#* zdcd+|Yu?P8@$ymfd@i+!8d2Qgp>A{oyO@haHTy zia8Pq>3DG}6O0owO&rIOY1Hj&Otdj+ujYv2UaGX{aYws79>#7*W2%kmp+I}W2+D0( z_7n-U7b_usRQ^dvRbNOqbPSPG${D*tvcK@YawCn|0p@BRMeA@+TU?rrh56i&&d`me z=2);NNfx?^khfUOu*9$&%-iDxCL-ck2?$(eSj({9D!nwTRG^VzgIE)^Bd>0NO+(va z*e298q&ta~#*Tq^8TK+%TVpTHmG69H*dMBcG;$>l4DFENh){FKLMB|jyB^1}shlvJ zW;iqUm9OU?ni*=NMs>qGXSf)`I#oU1$EAU<7#bO_N8~rDZIO}V2%my-dW+jO?s9Yb X1R@d?0Qc0aXjphKeI6N}(znJhRSl~c literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/phraser-no-scoring.pkl b/gensim/test/test_data/phraser-no-scoring.pkl new file mode 100644 index 0000000000000000000000000000000000000000..af483d45e3181df9024a6009ae1f2ef0aa3c68a9 GIT binary patch literal 506 zcmY+BJ5R$f6os3%KuLMO-{noqqdW&jHij(7QQD)mjQkM(7Jr26 zv|f*(kNXV2x02y6dG<$@3HsGNr|U zd)goHDDk@*l@6+It*~Vl_lFu&9Zb8kq@^WwgwE7} z53w4OyvDGeW5X%GzSO^;f9cDyN!%253bWXk*fQ8Q!w#t?zmO?cAAZ2GYn(lX{Tv7X z8Z;&iC~QB(p}~$Ajwjbo0-hN8l;JGLxl?(4S^Ftgjtk<<=x9v+L~4$vAzKWWWK_vW zMa*$!#A}9|9Bp%XgA7_ZZnGjLy`6{!Ma3PJS!8**H#H9oj|D+FXo8tc@#I+9d;1Sb CY?oC4 literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/phraser-scoring-str.pkl b/gensim/test/test_data/phraser-scoring-str.pkl new file mode 100644 index 0000000000000000000000000000000000000000..58e999828e40e1b9bf6771a9b2de09354c815aa1 GIT binary patch literal 534 zcmY+B$xg#C5Qf{*Qb^hNz3h~&?1&>b4!t0sa;c(_x{am8>2D{MI8=g5jywpj#fvaD zyaaWhUjJkrg6sC><}^J;hHpm6$$;t-{VTPqX_f8n0;V~w#P z%8kGaC^66um*(K0;#PD#UxGsGFAFpGVOS%b|n%#%7vs*ovHT|eMhFp))ur5wwx(#v!0>*GF**ZWELBj>o#s&#}J0$t`vV>&J|#S~@tD0ee*s^ao~i%< literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/phrases-no-common-terms.pkl b/gensim/test/test_data/phrases-no-common-terms.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7be7bc3c84edfd829447e9ccdec77cec2d3be745 GIT binary patch literal 1348 zcmZ{kS98-)5QUvWAfkoddrK&R(0lKuE=?43k)<13pmua6r_3+|5AeI$y^@{0@MO>V zcC}}V^lRAf=){Eabesk{GSgW%_l!2qtNIs>soxm-iQ!4tX*3#fn5bL_rLQs49CF85 zD!REg-82d?S}PLdr)5%L1uLsKMyaHXvju~(7FN1jtdeWL%uU$S%6L5;rCwmLs)ccP z17knUv(l)!G!N=WIn9IaN+yO;m~^oEKgOAM!o-V|1nS(u8h0J1#ooN?rM}lzCS255 z`?T??J?ySwlI3ZKJ1{EKIo3U&I95A0ekhLDw=m(Z;5eI&4Zfd7k@kx)O^g%hS+9(W zAoL3d8-1nPWf&FQzH*{8={VT*2b&37CVlQA(7thWB7YogbyrI>W!}>Zsx(}Bn>)rz z9gF4NtFJ(iAgQW3@)dqv(y*xwxHq!E^NW^h2% zgM>p?l}Si36()twXFXoyVF@@wIBK6J3teKzgiR8rnj`$kiRDb01ykr4$0g(h;iOGp z&mrA1_L5}seLLm=)Y1bS;)`4l+YQ-w8i#SvjqG z#(j5)BqAmxHd!M2$^G+8B_bnWP`!GS>VfBi4WVd`l=l0kRM_EFX#N(JE%fXMj=zgx W9LlNA@gWXz6@__8Tp%o#?dg9bOQXvG literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/phrases-no-scoring.pkl b/gensim/test/test_data/phrases-no-scoring.pkl new file mode 100644 index 0000000000000000000000000000000000000000..70163414c9fc5763506a2ccd1a1c0578f1d094fd GIT binary patch literal 1267 zcmYk6=X29Q5XEh$kSGZ;fzW$PLNSEidyNT%rHOz!WcgwX)SgZ1q2!}gfLVHU*e3L`Cy+T%Ks_7bUjk>A0lX4SS#j@J#NMCx;6EsWcfIu<8U zSLZ1a%LLYPea;@y(SGDQZA`dsQ-CqQ62LgU0@M2f3cOYty%5WjVQIQ zQW=O|7C0ehdm3*i>?mF2r}e3U_(9#HoN99sd=2;Fe63&?ue3ykqdJvpvc!6+nqfb8f zB?gxXSDM3m0v1qsc9p?3!gW)Q*T7gknCk}Pn}l1Y;O0VvD1F6kPPjw3YZBO+{QO47 zJ>mT?kQZJBr$ovuEvEjUOX(FY<6OKP6EE#3I#v3)8%$^aJ4dbqb34_vj z&KWNVD@NCZ&d66Kul1qZw(#NXW83#9JLAWOP%>ru5$bX;bZAw zZmq~$wHSXQ2ve-w{!-{?<_@84_{{%fZhRsxCv^zYB(X13`%q+f{q6;ZCouSnbiY`*!x`7X?BAe8q*B3Y`e owrl2n(T|oFc>FCZyfiSXf8@xIy7@>C-aOp&Qk9ewb|e3`-%P;cc@mYbraufV!-#4Fw}k? zB(ZT?dc`ZkycPJlgF#=ZW)XyW5G&<`N!)fY^amRV!!y3}Z4%!&I_5tPM%)n_Q>R81 zMrYXQZm^%BXSMTi{$iA{sovKa5pxv8NoKIQW5E`}Sbe~PN$VfPxz1KRU)$(WTG$Va zv(p4&Qq%)w4s2!XHp2FLzm;yX)=m+5u`;VB*AaFw-bvUcB5qbNcC&pCVXtJ|Z0(sg zscl@VJcu;*G2c%(AYqx8r7BZoUA>Bf%nuO`3+E1d38xqwAsnp__Ub7Ws?2+g@o~a* zRZlzf(hg?WK1(+2ab~YE^HZX>D+g?biu6WPYqH$6oCXH`(JB;kJ~u(R!@2 zJIwA9?p4#5ZmX8=Gk-vMC|S2yt?=~#kJ#Zc;fXkm{~vs9^H14jflwD0?p!9V%KD!% zT_ijg%EzXpy#x7z@e<*s6e#O0_f8C65nc-zv^qI1Z`k~n@J^cLaJ}PFW^)<8CoD@g z-d;7}4~#w%8kMp z`j%uZd#8#xN4q{HS5+zF2kELor-8ubP30Y1UndD$_)_l^i+x6dDBx@O)gJ3U>!T2I ziii;RqQphSQIZBymvkDU-!zbKk9YyJ$1*7nupVS{bja P!^E>;RvY+HH0S;S7RQvC literal 0 HcmV?d00001 diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index e62b096e03..862fa2475a 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -8,11 +8,8 @@ """ -import contextlib import logging import os -import shutil -import tempfile import unittest import six @@ -20,17 +17,10 @@ from gensim import utils from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser from gensim.models.phrases import pseudocorpus, original_scorer -from gensim.test.utils import common_texts +from gensim.test.utils import common_texts, temporary_file -@contextlib.contextmanager -def temporary_file(name): - # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory - tmp = tempfile.mkdtemp() - try: - yield os.path.join(tmp, name) - finally: - shutil.rmtree(tmp, ignore_errors=True) +TEST_DATA = os.path.join(os.path.dirname(__file__), 'test_data') class TestUtils(unittest.TestCase): @@ -384,54 +374,39 @@ def testSaveLoad(self): def testSaveLoadStringScoring(self): """ Saving and loading a Phrases object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phrases""" + bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-scoring-str.pkl")) + seen_scores = set() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) - with temporary_file("test.pkl") as fpath: - bigram = Phrases(self.sentences, min_count=1, threshold=1) - bigram.scoring = "default" - bigram.save(fpath) - bigram_loaded = Phrases.load(fpath) - seen_scores = set() - test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] - for phrase, score in bigram_loaded.export_phrases(test_sentences): - seen_scores.add(round(score, 3)) - - assert seen_scores == set([ - 5.167, # score for graph minors - 3.444 # score for human interface - ]) + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) def testSaveLoadNoScoring(self): """ Saving and loading a Phrases object with no scoring parameter. This should ensure backwards compatibility with old versions of Phrases""" - with temporary_file("test.pkl") as fpath: - bigram = Phrases(self.sentences, min_count=1, threshold=1) - del(bigram.scoring) - bigram.save(fpath) - bigram_loaded = Phrases.load(fpath) - seen_scores = set() - test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] - for phrase, score in bigram_loaded.export_phrases(test_sentences): - seen_scores.add(round(score, 3)) + bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-no-scoring.pkl")) + seen_scores = set() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) - assert seen_scores == set([ - 5.167, # score for graph minors - 3.444 # score for human interface - ]) + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) def testSaveLoadNoCommonTerms(self): - """ Saving and loading a Phrases objects without common_terms - This should ensure backwards compatibility with old versions of Phrases""" - - with temporary_file("test.pkl") as fpath: - bigram = Phrases(self.sentences, min_count=1, threshold=1) - del(bigram.common_terms) - bigram.save(fpath) - bigram_loaded = Phrases.load(fpath) - self.assertEqual(bigram_loaded.common_terms, frozenset()) - # can make a phraser, cf #1751 - phraser = Phraser(bigram_loaded) # does not raise - phraser["some terms"] # does not raise + """ Ensure backwards compatibility with old versions of Phrases, before common_terms""" + bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-no-common-terms.pkl")) + self.assertEqual(bigram_loaded.common_terms, frozenset()) + # can make a phraser, cf #1751 + phraser = Phraser(bigram_loaded) # does not raise + phraser[["human", "interface", "survey"]] # does not raise class TestPhraserPersistence(PhrasesData, unittest.TestCase): @@ -460,37 +435,21 @@ def testSaveLoad(self): def testSaveLoadStringScoring(self): """ Saving and loading a Phraser object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phraser""" - - with temporary_file("test.pkl") as fpath: - bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) - bigram.scoring = "default" - bigram.save(fpath) - bigram_loaded = Phraser.load(fpath) - # we do not much with scoring, just verify its the one expected - self.assertEqual(bigram_loaded.scoring, original_scorer) + bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-scoring-str.pkl")) + # we do not much with scoring, just verify its the one expected + self.assertEqual(bigram_loaded.scoring, original_scorer) def testSaveLoadNoScoring(self): """ Saving and loading a Phraser object with no scoring parameter. This should ensure backwards compatibility with old versions of Phraser""" - - with temporary_file("test.pkl") as fpath: - bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) - del(bigram.scoring) - bigram.save(fpath) - bigram_loaded = Phraser.load(fpath) - # we do not much with scoring, just verify its the one expected - self.assertEqual(bigram_loaded.scoring, original_scorer) + bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-no-scoring.pkl")) + # we do not much with scoring, just verify its the one expected + self.assertEqual(bigram_loaded.scoring, original_scorer) def testSaveLoadNoCommonTerms(self): - """ Saving and loading a Phraser objects without common_terms - This should ensure backwards compatibility with old versions of Phraser""" - - with temporary_file("test.pkl") as fpath: - bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1)) - del(bigram.common_terms) - bigram.save(fpath) - bigram_loaded = Phraser.load(fpath) - self.assertEqual(bigram_loaded.common_terms, frozenset()) + """ Ensure backwards compatibility with old versions of Phraser, before common_terms""" + bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-no-common-terms.pkl")) + self.assertEqual(bigram_loaded.common_terms, frozenset()) class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): diff --git a/gensim/test/utils.py b/gensim/test/utils.py index 89fae9226e..358133c321 100644 --- a/gensim/test/utils.py +++ b/gensim/test/utils.py @@ -5,8 +5,10 @@ """ Common utils for tests """ +import contextlib import tempfile import os +import shutil from gensim.corpora import Dictionary @@ -27,6 +29,22 @@ def get_tmpfile(suffix): return os.path.join(tempfile.gettempdir(), suffix) +@contextlib.contextmanager +def temporary_file(name=""): + """create a temporary directory and return a path to "name" in that directory + + At the end of the context, the directory is removed. + + The function doesn't create the file. + """ + # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory + tmp = tempfile.mkdtemp() + try: + yield os.path.join(tmp, name) + finally: + shutil.rmtree(tmp, ignore_errors=True) + + # set up vars used in testing ("Deerwester" from the web tutorial) common_texts = [ ['human', 'interface', 'computer'], From e023ac74355391f1037b8bce75afcfff4d3d3355 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 5 Dec 2017 17:39:43 +0100 Subject: [PATCH 6/8] pep8 fix --- gensim/models/phrases.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 747d78e749..973eee9be5 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -207,7 +207,8 @@ def load(cls, *args, **kwargs): logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility') model.scoring = npmi_scorer else: - raise ValueError('failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring)) + raise ValueError( + 'failed to load %s model with unknown scoring setting %s' % (cls.__name__, model.scoring)) # if there is non common_terms attribute, initialize if not hasattr(model, "common_terms"): logger.info('older version of %s loaded without common_terms attribute', cls.__name__) From cc623ca1c030c14f935bce40a5e429af17fc505f Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 6 Dec 2017 17:50:14 +0500 Subject: [PATCH 7/8] fix imports, reuse datapath --- gensim/test/test_phrases.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 862fa2475a..f26fd0daa8 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -14,13 +14,10 @@ import six -from gensim import utils +from gensim.utils import to_unicode from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser from gensim.models.phrases import pseudocorpus, original_scorer -from gensim.test.utils import common_texts, temporary_file - - -TEST_DATA = os.path.join(os.path.dirname(__file__), 'test_data') +from gensim.test.utils import common_texts, temporary_file, datapath class TestUtils(unittest.TestCase): @@ -140,7 +137,7 @@ class PhrasesData: sentences = common_texts + [ ['graph', 'minors', 'survey', 'human', 'interface'] ] - unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences] common_terms = frozenset() bigram1 = u'response_time' @@ -374,7 +371,7 @@ def testSaveLoad(self): def testSaveLoadStringScoring(self): """ Saving and loading a Phrases object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phrases""" - bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-scoring-str.pkl")) + bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl")) seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -389,7 +386,7 @@ def testSaveLoadNoScoring(self): """ Saving and loading a Phrases object with no scoring parameter. This should ensure backwards compatibility with old versions of Phrases""" - bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-no-scoring.pkl")) + bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl")) seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] for phrase, score in bigram_loaded.export_phrases(test_sentences): @@ -402,7 +399,7 @@ def testSaveLoadNoScoring(self): def testSaveLoadNoCommonTerms(self): """ Ensure backwards compatibility with old versions of Phrases, before common_terms""" - bigram_loaded = Phrases.load(os.path.join(TEST_DATA, "phrases-no-common-terms.pkl")) + bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl")) self.assertEqual(bigram_loaded.common_terms, frozenset()) # can make a phraser, cf #1751 phraser = Phraser(bigram_loaded) # does not raise @@ -435,20 +432,20 @@ def testSaveLoad(self): def testSaveLoadStringScoring(self): """ Saving and loading a Phraser object with a string scoring parameter. This should ensure backwards compatibility with the previous version of Phraser""" - bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-scoring-str.pkl")) + bigram_loaded = Phraser.load(datapath("phraser-scoring-str.pkl")) # we do not much with scoring, just verify its the one expected self.assertEqual(bigram_loaded.scoring, original_scorer) def testSaveLoadNoScoring(self): """ Saving and loading a Phraser object with no scoring parameter. This should ensure backwards compatibility with old versions of Phraser""" - bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-no-scoring.pkl")) + bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl")) # we do not much with scoring, just verify its the one expected self.assertEqual(bigram_loaded.scoring, original_scorer) def testSaveLoadNoCommonTerms(self): """ Ensure backwards compatibility with old versions of Phraser, before common_terms""" - bigram_loaded = Phraser.load(os.path.join(TEST_DATA, "phraser-no-common-terms.pkl")) + bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl")) self.assertEqual(bigram_loaded.common_terms, frozenset()) @@ -489,7 +486,7 @@ class CommonTermsPhrasesData: ['data', 'and', 'graph', 'survey'], ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface'] # test bigrams within same sentence ] - unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences] common_terms = ['of', 'and', 'for'] bigram1 = u'lack_of_interest' From 63fe8c3c4b6aaa61632a96623b95043b02798f94 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 6 Dec 2017 18:00:49 +0500 Subject: [PATCH 8/8] remove unused import --- gensim/test/test_phrases.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index f26fd0daa8..f0e9cea864 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -9,7 +9,6 @@ import logging -import os import unittest import six