Skip to content

Commit

Permalink
Merge pull request #555 from svenkreiss/lazy-init-sims
Browse files Browse the repository at this point in the history
load_word2vec_format(): remove init_sims() call
  • Loading branch information
piskvorky committed Dec 1, 2015
2 parents 602c0c7 + 01b09c7 commit 839513f
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 17 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ Changes

0.12.4, XX/XX/XXXX

* Improved load_word2vec_format() performance (@svenkreiss, #555)
  - Removed the `init_sims()` call, improving load performance when normalized vectors are not needed.
  - Removed the `norm_only` parameter (API change). To get the old `norm_only=True` behavior, call `init_sims(replace=True)` after the `load_word2vec_format()` call.
* Better internal handling of job batching in word2vec (#535)
- up to 300% speed up when training on very short documents (~tweets)
* Word2vec allows non-strict unicode error handling (ignore or replace) (Gordon Mohr, #466)
Expand Down
4 changes: 1 addition & 3 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,7 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True, encoding='utf8', unicode_errors='strict', init_sims=True):
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict'):
"""
Load the input-hidden weight matrix from the original C word2vec-tool format.
Expand Down Expand Up @@ -1096,8 +1096,6 @@ def add_word(word, weights):
assert (len(result.vocab), result.vector_size) == result.syn0.shape

logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
if init_sims:
result.init_sims(norm_only)
return result

def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict'):
Expand Down
22 changes: 8 additions & 14 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,24 +90,16 @@ def testLambdaRule(self):
model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
self.assertTrue("human" not in model.vocab)

def testPersistenceWord2VecFormatInitSims(self):
"""Test storing/loading the entire model in word2vec format skipping
the init_sims() call."""
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=True)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=False, init_sims=False)
self.assertTrue(numpy.allclose(model['human'], binary_model['human']))
self.assertFalse(hasattr(binary_model, 'syn0norm'))

def testPersistenceWord2VecFormat(self):
"""Test storing/loading the entire model in word2vec format."""
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=True)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=False)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
binary_model.init_sims(replace=False)
self.assertTrue(numpy.allclose(model['human'], binary_model['human']))
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=True)
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
norm_only_model.init_sims(replace=True)
self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human']))

Expand All @@ -116,9 +108,11 @@ def testPersistenceWord2VecFormatNonBinary(self):
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=False)
text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False, norm_only=False)
text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
text_model.init_sims(False)
self.assertTrue(numpy.allclose(model['human'], text_model['human'], atol=1e-6))
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False, norm_only=True)
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
norm_only_model.init_sims(True)
self.assertFalse(numpy.allclose(model['human'], norm_only_model['human'], atol=1e-6))

self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human'], atol=1e-4))
Expand Down

0 comments on commit 839513f

Please sign in to comment.