piskvorky · menshikh-iv · Dec 6, 2017 · Dec 4, 2017 · Dec 4, 2017 · Dec 4, 2017
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
@@ -178,7 +178,55 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer):
                     yield (word, None)
 
 
-class Phrases(SentenceAnalyzer, interfaces.TransformationABC):
+class PhrasesTransformation(interfaces.TransformationABC):
+
+    @classmethod
+    def load(cls, *args, **kwargs):
+        """
+        Load a previously saved Phrases class. Handles backwards compatibility from
+            older Phrases versions which did not support  pluggable scoring functions.
+            Otherwise, relies on utils.load
+        """
+
+        model = super(PhrasesTransformation, cls).load(*args, **kwargs)
+        # update older models
+        # if no scoring parameter, use default scoring
+        if not hasattr(model, 'scoring'):
+            logger.info('older version of %s loaded without scoring function', cls.__name__)
+            logger.info('setting pluggable scoring method to original_scorer for compatibility')
+            model.scoring = original_scorer
+        # if there is a scoring parameter, and it's a text value, load the proper scoring function
+        if hasattr(model, 'scoring'):
+            if isinstance(model.scoring, six.string_types):
+                if model.scoring == 'default':
+                    logger.info(
+                        'older version of %s loaded with "default" scoring parameter',
+                        cls.__name__)
+                    logger.info(
+                        'setting scoring method to original_scorer pluggable scoring method ' +
+                        'for compatibility')
+                    model.scoring = original_scorer
+                elif model.scoring == 'npmi':
+                    logger.info(
+                        'older version of %s loaded with "npmi" scoring parameter',
+                        cls.__name__)
+                    logger.info(
+                        'setting scoring method to npmi_scorer pluggable scoring method ' +
+                        'for compatibility')
+                    model.scoring = npmi_scorer
+                else:
+                    raise ValueError(
+                        'failed to load %s model with unknown scoring setting %s' %
+                        (cls.__name__, model.scoring))
+        # if there is non common_terms attribute, initialize
+        if not hasattr(model, "common_terms"):
+            logger.info('older version of %s loaded without common_terms attribute', cls.__name__)
+            logger.info('setting common_terms to empty set')
+            model.common_terms = frozenset()
+        return model
+
+
+class Phrases(SentenceAnalyzer, PhrasesTransformation):
     """
     Detect phrases, based on collected collocation counts. Adjacent words that appear
     together more frequently than expected are joined together with the `_` character.
@@ -461,41 +509,6 @@ def __getitem__(self, sentence):
 
         return [utils.to_unicode(w) for w in new_s]
 
-    @classmethod
-    def load(cls, *args, **kwargs):
-        """
-        Load a previously saved Phrases class. Handles backwards compatibility from
-            older Phrases versions which did not support  pluggable scoring functions. Otherwise, relies on utils.load
-        """
-
-        # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string
-        try:
-            basestring
-        except NameError:
-            basestring = str
-
-        model = super(Phrases, cls).load(*args, **kwargs)
-        # update older models
-        # if no scoring parameter, use default scoring
-        if not hasattr(model, 'scoring'):
-            logger.info('older version of Phrases loaded without scoring function')
-            logger.info('setting pluggable scoring method to original_scorer for compatibility')
-            model.scoring = original_scorer
-        # if there is a scoring parameter, and it's a text value, load the proper scoring function
-        if hasattr(model, 'scoring'):
-            if isinstance(model.scoring, basestring):
-                if model.scoring == 'default':
-                    logger.info('older version of Phrases loaded with "default" scoring parameter')
-                    logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility')
-                    model.scoring = original_scorer
-                elif model.scoring == 'npmi':
-                    logger.info('older version of Phrases loaded with "npmi" scoring parameter')
-                    logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility')
-                    model.scoring = npmi_scorer
-                else:
-                    raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring))
-        return model
-
 
 # these two built-in scoring methods don't cast everything to float because the casting is done in the call
 # to the scoring method in __getitem__ and export_phrases.
@@ -530,7 +543,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()):
                 yield components
 
 
-class Phraser(SentenceAnalyzer, interfaces.TransformationABC):
+class Phraser(SentenceAnalyzer, PhrasesTransformation):
     """
     Minimal state & functionality to apply results of a Phrases model to tokens.
 

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
@@ -8,17 +8,29 @@
 """
 
 
+import contextlib
 import logging
-import unittest
 import os
-import sys
+import shutil
+import tempfile
+import unittest
+
+import six
 
 from gensim import utils
-from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus
+from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser
+from gensim.models.phrases import pseudocorpus, original_scorer
 from gensim.test.utils import common_texts
 
-if sys.version_info[0] >= 3:
-    unicode = str
+
+@contextlib.contextmanager
+def temporary_file(name):
+    # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
+    tmp = tempfile.mkdtemp()
+    try:
+        yield os.path.join(tmp, name)
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)
 
 
 class TestUtils(unittest.TestCase):
@@ -230,7 +242,7 @@ def testEncoding(self):
         self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
 
         transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, unicode))
+        self.assertTrue(isinstance(transformed, six.text_type))
 
 
 # scorer for testCustomScorer
@@ -335,15 +347,15 @@ def testPruning(self):
 # endclass TestPhrasesModel
 
 
-class TestPhrasesScoringPersistence(PhrasesData, unittest.TestCase):
+class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
 
     def testSaveLoadCustomScorer(self):
         """ saving and loading a Phrases object with a custom scorer """
 
-        try:
+        with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
-            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
-            bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
+            bigram.save(fpath)
+            bigram_loaded = Phrases.load(fpath)
             seen_scores = []
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
             for phrase, score in bigram_loaded.export_phrases(test_sentences):
@@ -352,17 +364,13 @@ def testSaveLoadCustomScorer(self):
             assert all(seen_scores)  # all scores 1
             assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
 
-        finally:
-            if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
-                os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
-
     def testSaveLoad(self):
         """ Saving and loading a Phrases object."""
 
-        try:
+        with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=1)
-            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
-            bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
+            bigram.save(fpath)
+            bigram_loaded = Phrases.load(fpath)
             seen_scores = set()
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
             for phrase, score in bigram_loaded.export_phrases(test_sentences):
@@ -373,19 +381,15 @@ def testSaveLoad(self):
                 3.444  # score for human interface
             ])
 
-        finally:
-            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
-                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
-
     def testSaveLoadStringScoring(self):
         """ Saving and loading a Phrases object with a string scoring parameter.
         This should ensure backwards compatibility with the previous version of Phrases"""
 
-        try:
+        with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=1)
             bigram.scoring = "default"
-            bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
-            bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
+            bigram.save(fpath)
+            bigram_loaded = Phrases.load(fpath)
             seen_scores = set()
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
             for phrase, score in bigram_loaded.export_phrases(test_sentences):
@@ -396,19 +400,15 @@ def testSaveLoadStringScoring(self):
                 3.444  # score for human interface
             ])
 
-        finally:
-            if os.path.exists("test_phrases_testSaveLoadStringScoring_temp_save.pkl"):
-                os.remove("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
-
     def testSaveLoadNoScoring(self):
         """ Saving and loading a Phrases object with no scoring parameter.
         This should ensure backwards compatibility with old versions of Phrases"""
 
-        try:
+        with temporary_file("test.pkl") as fpath:
             bigram = Phrases(self.sentences, min_count=1, threshold=1)
             del(bigram.scoring)
-            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
-            bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
+            bigram.save(fpath)
+            bigram_loaded = Phrases.load(fpath)
             seen_scores = set()
             test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
             for phrase, score in bigram_loaded.export_phrases(test_sentences):
@@ -419,9 +419,78 @@ def testSaveLoadNoScoring(self):
                 3.444  # score for human interface
             ])
 
-        finally:
-            if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
-                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
+    def testSaveLoadNoCommonTerms(self):
+        """ Saving and loading a Phrases objects without common_terms
+        This should ensure backwards compatibility with old versions of Phrases"""
+
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phrases(self.sentences, min_count=1, threshold=1)
+            del(bigram.common_terms)
+            bigram.save(fpath)
+            bigram_loaded = Phrases.load(fpath)
+            self.assertEqual(bigram_loaded.common_terms, frozenset())
+            # can make a phraser, cf #1751
+            phraser = Phraser(bigram_loaded)  # does not raise
+            phraser["some terms"]  # does not raise
+
+
+class TestPhraserPersistence(PhrasesData, unittest.TestCase):
+
+    def testSaveLoadCustomScorer(self):
+        """Saving and loading a Phraser object with a custom scorer """
+
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phraser(
+                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
+            bigram.save(fpath)
+            bigram_loaded = Phraser.load(fpath)
+            # we do not much with scoring, just verify its the one expected
+            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
+
+    def testSaveLoad(self):
+        """ Saving and loading a Phraser object."""
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+            bigram.save(fpath)
+            bigram_loaded = Phraser.load(fpath)
+            self.assertEqual(
+                bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
+                ['graph_minors', 'survey', 'human_interface', 'system'])
+
+    def testSaveLoadStringScoring(self):
+        """ Saving and loading a Phraser object with a string scoring parameter.
+        This should ensure backwards compatibility with the previous version of Phraser"""
+
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+            bigram.scoring = "default"
+            bigram.save(fpath)
+            bigram_loaded = Phraser.load(fpath)
+            # we do not much with scoring, just verify its the one expected
+            self.assertEqual(bigram_loaded.scoring, original_scorer)
+
+    def testSaveLoadNoScoring(self):
+        """ Saving and loading a Phraser object with no scoring parameter.
+        This should ensure backwards compatibility with old versions of Phraser"""
+
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+            del(bigram.scoring)
+            bigram.save(fpath)
+            bigram_loaded = Phraser.load(fpath)
+            # we do not much with scoring, just verify its the one expected
+            self.assertEqual(bigram_loaded.scoring, original_scorer)
+
+    def testSaveLoadNoCommonTerms(self):
+        """ Saving and loading a Phraser objects without common_terms
+        This should ensure backwards compatibility with old versions of Phraser"""
+
+        with temporary_file("test.pkl") as fpath:
+            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
+            del(bigram.common_terms)
+            bigram.save(fpath)
+            bigram_loaded = Phraser.load(fpath)
+            self.assertEqual(bigram_loaded.common_terms, frozenset())
 
 
 class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase):
@@ -486,7 +555,7 @@ def testEncoding(self):
         self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
 
         transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, unicode))
+        self.assertTrue(isinstance(transformed, six.text_type))
 
     def testMultipleBigramsSingleEntry(self):
         """ a single entry should produce multiple bigrams. """
@@ -593,7 +662,7 @@ def testEncoding(self):
         self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)
 
         transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
-        self.assertTrue(isinstance(transformed, unicode))
+        self.assertTrue(isinstance(transformed, six.text_type))
 
 
 if __name__ == '__main__':