Skip to content

Commit

Permalink
Merge pull request #555 from svenkreiss/lazy-init-sims
Browse files Browse the repository at this point in the history
load_word2vec_format(): remove init_sims() call
  • Loading branch information
piskvorky committed Dec 1, 2015
2 parents 602c0c7 + 01b09c7 commit 839513f
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 17 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ Changes

0.12.4, XX/XX/XXXX

* Improved load_word2vec_format() performance (@svenkreiss, #555)
  - Removed the `init_sims()` call, improving load performance when normalized vectors are not needed.
  - Removed the `norm_only` parameter (API change). To get the old `norm_only=True` behavior, call `init_sims(replace=True)` after the `load_word2vec_format()` call.
* Better internal handling of job batching in word2vec (#535)
- up to 300% speed up when training on very short documents (~tweets)
* Word2vec allows non-strict unicode error handling (ignore or replace) (Gordon Mohr, #466)
Expand Down
4 changes: 1 addition & 3 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,7 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

@classmethod
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True, encoding='utf8', unicode_errors='strict', init_sims=True):
def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict'):
"""
Load the input-hidden weight matrix from the original C word2vec-tool format.
Expand Down Expand Up @@ -1096,8 +1096,6 @@ def add_word(word, weights):
assert (len(result.vocab), result.vector_size) == result.syn0.shape

logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
if init_sims:
result.init_sims(norm_only)
return result

def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict'):
Expand Down
22 changes: 8 additions & 14 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,24 +90,16 @@ def testLambdaRule(self):
model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
self.assertTrue("human" not in model.vocab)

def testPersistenceWord2VecFormatInitSims(self):
"""Test storing/loading the entire model in word2vec format skipping
the init_sims() call."""
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=True)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=False, init_sims=False)
self.assertTrue(numpy.allclose(model['human'], binary_model['human']))
self.assertFalse(hasattr(binary_model, 'syn0norm'))

def testPersistenceWord2VecFormat(self):
"""Test storing/loading the entire model in word2vec format."""
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=True)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=False)
binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
binary_model.init_sims(replace=False)
self.assertTrue(numpy.allclose(model['human'], binary_model['human']))
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, norm_only=True)
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
norm_only_model.init_sims(replace=True)
self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human']))

Expand All @@ -116,9 +108,11 @@ def testPersistenceWord2VecFormatNonBinary(self):
model = word2vec.Word2Vec(sentences, min_count=1)
model.init_sims()
model.save_word2vec_format(testfile(), binary=False)
text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False, norm_only=False)
text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
text_model.init_sims(False)
self.assertTrue(numpy.allclose(model['human'], text_model['human'], atol=1e-6))
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False, norm_only=True)
norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
norm_only_model.init_sims(True)
self.assertFalse(numpy.allclose(model['human'], norm_only_model['human'], atol=1e-6))

self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human'], atol=1e-4))
Expand Down

0 comments on commit 839513f

Please sign in to comment.