Skip to content

Commit

Permalink
Fix doctag unicode problem. Fixes #1543 (#1544)
Browse files Browse the repository at this point in the history
* Fix doctag unicode

* Add test for unicode doctags.

* Fix doc2vec unicode title test.

* Make the unicode tag cast less hidden.
  • Loading branch information
englhardt authored and menshikh-iv committed Sep 19, 2017
1 parent 2e58a1c commit 5a49a79
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
2 changes: 1 addition & 1 deletion gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,7 +868,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
# store as in input order
for i in range(len(self.docvecs)):
doctag = prefix + str(self.docvecs.index_to_doctag(i))
doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i))
row = self.docvecs.doctag_syn0[i]
if binary:
fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
Expand Down
17 changes: 15 additions & 2 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,16 @@


class DocsLeeCorpus(object):
def __init__(self, string_tags=False):
def __init__(self, string_tags=False, unicode_tags=False):
self.string_tags = string_tags
self.unicode_tags = unicode_tags

def _tag(self, i):
return i if not self.string_tags else '_*%d' % i
if self.unicode_tags:
return u'_\xa1_%d' % i
elif self.string_tags:
return '_*%d' % i
return i

def __iter__(self):
with open(datapath('lee_background.cor')) as f:
Expand Down Expand Up @@ -98,6 +103,14 @@ def testPersistenceWord2VecFormat(self):
binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True)
self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab))

def test_unicode_in_doctag(self):
    """Exporting a model whose doctags are unicode strings must not fail.

    Builds a Doc2Vec model from a corpus created with ``unicode_tags=True``
    and writes it in binary word2vec format with both document and word
    vectors included. A ``UnicodeEncodeError`` raised during the export is
    reported as a test failure.
    """
    unicode_corpus = DocsLeeCorpus(unicode_tags=True)
    model = doc2vec.Doc2Vec(unicode_corpus, min_count=1)
    export_options = dict(doctag_vec=True, word_vec=True, binary=True)
    try:
        model.save_word2vec_format(testfile(), **export_options)
    except UnicodeEncodeError:
        # Turn the encoding error into an explicit, readable test failure.
        self.fail('Failed storing unicode title.')

def test_load_mmap(self):
"""Test storing/loading the entire model."""
model = doc2vec.Doc2Vec(sentences, min_count=1)
Expand Down

0 comments on commit 5a49a79

Please sign in to comment.