From b923043fa34c93e01d9d6df4a4cb9fccc5bd9af2 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Wed, 27 Dec 2017 20:22:21 +0530 Subject: [PATCH 01/13] load vector with high precision --- gensim/models/keyedvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 5c3d3f55bc..fb8dde0ae7 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -243,7 +243,7 @@ def add_word(word, weights): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] + word, weights = parts[0], [datatype(x) for x in parts[1:]] add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): logger.info( From aaa7c2ab586f1c698fadd6cd04339cdbceb62bbb Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Mon, 15 Jan 2018 21:02:41 +0530 Subject: [PATCH 02/13] Test changes --- gensim/test/test.kv.txt | 4 ++++ gensim/test/test_datatype.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 gensim/test/test.kv.txt create mode 100644 gensim/test/test_datatype.py diff --git a/gensim/test/test.kv.txt b/gensim/test/test.kv.txt new file mode 100644 index 0000000000..d653bc02f8 --- /dev/null +++ b/gensim/test/test.kv.txt @@ -0,0 +1,4 @@ +2 3 +kangaroo.n.01 -0.0007369244245224787 -8.269973595356034e-05 +horse.n.01 -0.0008546282343595379 0.0007694142576316829 +horsee.n.01 -0.0008546282343595379 0.0007694142576316829 diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py new file mode 100644 index 0000000000..a4038dce53 --- /dev/null +++ b/gensim/test/test_datatype.py @@ -0,0 +1,15 @@ +import logging +import unittest + +import numpy as np +from gensim.models.keyedvectors import KeyedVectors + +class TestDataType(unittest.TestCase): + def 
test_datatype(self): + kv = KeyedVectors.load_word2vec_format('test.kv.txt', datatype=np.float64) + self.assertEqual(kv['horse.n.01'][0], -0.0008546282343595379) + + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() From a8f44c53a5c22486ff28514163ca4a30c673a9d5 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Mon, 15 Jan 2018 21:20:42 +0530 Subject: [PATCH 03/13] Fix flake8 error --- gensim/test/test_datatype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index a4038dce53..e90a647d97 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -4,6 +4,7 @@ import numpy as np from gensim.models.keyedvectors import KeyedVectors + class TestDataType(unittest.TestCase): def test_datatype(self): kv = KeyedVectors.load_word2vec_format('test.kv.txt', datatype=np.float64) From 37b39f43a3a8d14bea821c9281c66219279e0246 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Tue, 16 Jan 2018 12:17:52 +0530 Subject: [PATCH 04/13] Fix path error --- gensim/test/test.kv.txt | 3 +-- gensim/test/test_datatype.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/test/test.kv.txt b/gensim/test/test.kv.txt index d653bc02f8..fb63e7af67 100644 --- a/gensim/test/test.kv.txt +++ b/gensim/test/test.kv.txt @@ -1,4 +1,3 @@ -2 3 +2 2 kangaroo.n.01 -0.0007369244245224787 -8.269973595356034e-05 horse.n.01 -0.0008546282343595379 0.0007694142576316829 -horsee.n.01 -0.0008546282343595379 0.0007694142576316829 diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index e90a647d97..51db874d57 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -3,11 +3,13 @@ import numpy as np from gensim.models.keyedvectors import KeyedVectors +import os class TestDataType(unittest.TestCase): def test_datatype(self): - kv = KeyedVectors.load_word2vec_format('test.kv.txt', datatype=np.float64) + path = 
os.path.join(os.path.dirname(__file__), 'test.kv.txt') + kv = KeyedVectors.load_word2vec_format(path, datatype=np.float64) self.assertEqual(kv['horse.n.01'][0], -0.0008546282343595379) From 310690d1f99fa44ca00cecf6716c465857055c7e Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Tue, 16 Jan 2018 17:39:52 +0530 Subject: [PATCH 05/13] Reformat code --- gensim/test/{ => test_data}/test.kv.txt | 0 gensim/test/test_datatype.py | 22 +++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) rename gensim/test/{ => test_data}/test.kv.txt (100%) diff --git a/gensim/test/test.kv.txt b/gensim/test/test_data/test.kv.txt similarity index 100% rename from gensim/test/test.kv.txt rename to gensim/test/test_data/test.kv.txt diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index 51db874d57..f0c31af7f3 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -1,16 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for checking the datatype (precision) of vectors loaded by KeyedVectors.load_word2vec_format. 
+""" + import logging import unittest import numpy as np + +from gensim.test.utils import datapath from gensim.models.keyedvectors import KeyedVectors -import os class TestDataType(unittest.TestCase): - def test_datatype(self): - path = os.path.join(os.path.dirname(__file__), 'test.kv.txt') - kv = KeyedVectors.load_word2vec_format(path, datatype=np.float64) - self.assertEqual(kv['horse.n.01'][0], -0.0008546282343595379) + def test_text(self): + path = datapath('test.kv.txt') + kv = KeyedVectors.load_word2vec_format(path, binary=False, + datatype=np.float64) + self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) + self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) if __name__ == '__main__': From de98f2e4edf77f4edbf3f9c3be7d281c41878253 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Wed, 17 Jan 2018 19:50:01 +0530 Subject: [PATCH 06/13] Fix precision loss issue for binary word2vec --- gensim/models/keyedvectors.py | 6 +++--- gensim/test/test_data/test.kv.bin | 2 ++ gensim/test/test_datatype.py | 7 +++++++ 3 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 gensim/test/test_data/test.kv.bin diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index fb8dde0ae7..123b9f1006 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -220,7 +220,7 @@ def add_word(word, weights): result.index2word.append(word) if binary: - binary_len = dtype(REAL).itemsize * vector_size + binary_len = dtype(datatype).itemsize * vector_size for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] @@ -233,7 +233,7 @@ def add_word(word, weights): if ch != b'\n': # ignore newlines in front of words (some binary files have) word.append(ch) word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) + weights = fromstring(fin.read(binary_len), dtype=datatype) add_word(word, weights) else: for line_no 
in xrange(vocab_size): @@ -243,7 +243,7 @@ def add_word(word, weights): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [datatype(x) for x in parts[1:]] + word, weights = parts[0], np.array(parts[1:], dtype=datatype) add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): logger.info( diff --git a/gensim/test/test_data/test.kv.bin b/gensim/test/test_data/test.kv.bin new file mode 100644 index 0000000000..889d91b341 --- /dev/null +++ b/gensim/test/test_data/test.kv.bin @@ -0,0 +1,2 @@ +2 2 +kangaroo.n.01 8ıÆ&Å%H¿ğ.¥â­¿horse.n.01 \O($L¿ÀÜk‰P6I? \ No newline at end of file diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index f0c31af7f3..e6b68ac08e 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -17,6 +17,13 @@ class TestDataType(unittest.TestCase): + def test_binary(self): + path = datapath('test.kv.bin') + kv = KeyedVectors.load_word2vec_format(path, binary=True, + datatype=np.float64) + self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) + self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) + def test_text(self): path = datapath('test.kv.txt') kv = KeyedVectors.load_word2vec_format(path, binary=False, From 805daf6300ffe594ba8de33b00db9ac94869c2d1 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Thu, 18 Jan 2018 16:26:22 +0530 Subject: [PATCH 07/13] Fix precision loss during saving model in text format --- gensim/models/keyedvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 123b9f1006..af1b686b37 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -149,7 +149,7 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) if binary: 
fout.write(utils.to_utf8(word) + b" " + row.tostring()) else: - fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) + fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row)))) @classmethod def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', From 466f37f773d713c774d051e28a9cba82827cfab6 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Sat, 20 Jan 2018 14:56:48 +0530 Subject: [PATCH 08/13] Fix binary file loading issue --- gensim/models/keyedvectors.py | 7 ++++--- gensim/test/test_data/test.kv.bin | 2 -- gensim/test/test_datatype.py | 7 ------- 3 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 gensim/test/test_data/test.kv.bin diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index af1b686b37..2fdfb5f5d3 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -147,6 +147,7 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): row = self.syn0[vocab.index] if binary: + row = row.astype(REAL) fout.write(utils.to_utf8(word) + b" " + row.tostring()) else: fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row)))) @@ -220,7 +221,7 @@ def add_word(word, weights): result.index2word.append(word) if binary: - binary_len = dtype(datatype).itemsize * vector_size + binary_len = dtype(REAL).itemsize * vector_size for _ in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] @@ -233,7 +234,7 @@ def add_word(word, weights): if ch != b'\n': # ignore newlines in front of words (some binary files have) word.append(ch) word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=datatype) + weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype) add_word(word, 
weights) else: for line_no in xrange(vocab_size): @@ -243,7 +244,7 @@ def add_word(word, weights): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") if len(parts) != vector_size + 1: raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], np.array(parts[1:], dtype=datatype) + word, weights = parts[0], [datatype(x) for x in parts[1:]] add_word(word, weights) if result.syn0.shape[0] != len(result.vocab): logger.info( diff --git a/gensim/test/test_data/test.kv.bin b/gensim/test/test_data/test.kv.bin deleted file mode 100644 index 889d91b341..0000000000 --- a/gensim/test/test_data/test.kv.bin +++ /dev/null @@ -1,2 +0,0 @@ -2 2 -kangaroo.n.01 8ıÆ&Å%H¿ğ.¥â­¿horse.n.01 \O($L¿ÀÜk‰P6I? \ No newline at end of file diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index e6b68ac08e..f0c31af7f3 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -17,13 +17,6 @@ class TestDataType(unittest.TestCase): - def test_binary(self): - path = datapath('test.kv.bin') - kv = KeyedVectors.load_word2vec_format(path, binary=True, - datatype=np.float64) - self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) - self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) - def test_text(self): path = datapath('test.kv.txt') kv = KeyedVectors.load_word2vec_format(path, binary=False, From c157d79f62ea3377b181253b489823ffe6c0ef51 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Tue, 23 Jan 2018 00:32:06 +0530 Subject: [PATCH 09/13] Test other datatypes as well. 
--- gensim/test/test_datatype.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index f0c31af7f3..f6e78ea036 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -17,13 +17,27 @@ class TestDataType(unittest.TestCase): - def test_text(self): + def load_model(self, datatype): path = datapath('test.kv.txt') kv = KeyedVectors.load_word2vec_format(path, binary=False, - datatype=np.float64) + datatype=datatype) + return kv + + def test_high_precision(self): + kv = self.load_model(np.float64) self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) + def test_medium_precision(self): + kv = self.load_model(np.float32) + self.assertAlmostEqual(kv['horse.n.01'][0], -0.00085462822) + self.assertEqual(kv['horse.n.01'][0].dtype, np.float32) + + def test_low_precision(self): + kv = self.load_model(np.float16) + self.assertAlmostEqual(kv['horse.n.01'][0], -0.00085449) + self.assertEqual(kv['horse.n.01'][0].dtype, np.float16) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) From 991bcb6a23f8bcb1b4399454d16a51c2bb4b4374 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Mon, 5 Feb 2018 20:36:16 +0530 Subject: [PATCH 10/13] Test type conversion --- gensim/test/test_datatype.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index f6e78ea036..42b05c4fb5 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -25,6 +25,8 @@ def load_model(self, datatype): def test_high_precision(self): kv = self.load_model(np.float64) + import pdb + pdb.set_trace() self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) @@ -38,6 +40,16 @@ def test_low_precision(self): self.assertAlmostEqual(kv['horse.n.01'][0], -0.00085449) 
self.assertEqual(kv['horse.n.01'][0].dtype, np.float16) + def test_type_conversion(self): + path = datapath('test.kv.txt') + binary_path = datapath('test.kv.bin') + model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16) + model1.save_word2vec_format(binary_path, binary=True) + model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True) + import pdb + pdb.set_trace() + self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0])) + if __name__ == '__main__': logging.root.setLevel(logging.WARNING) From 0904460087c539066e3547d24b47c7990e988dfb Mon Sep 17 00:00:00 2001 From: Pushpankar Kumar Pushp Date: Tue, 6 Feb 2018 14:36:30 +0530 Subject: [PATCH 11/13] Fix build error --- gensim/test/test_datatype.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index 42b05c4fb5..a169dbb609 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -25,8 +25,6 @@ def load_model(self, datatype): def test_high_precision(self): kv = self.load_model(np.float64) - import pdb - pdb.set_trace() self.assertAlmostEqual(kv['horse.n.01'][0], -0.0008546282343595379) self.assertEqual(kv['horse.n.01'][0].dtype, np.float64) @@ -46,8 +44,6 @@ def test_type_conversion(self): model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16) model1.save_word2vec_format(binary_path, binary=True) model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True) - import pdb - pdb.set_trace() self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0])) From 96d8aa51653ae7e117e7341cac2670a84b8f373d Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Fri, 16 Feb 2018 14:32:16 +0530 Subject: [PATCH 12/13] Use better names --- gensim/test/test_data/high_precision.kv.bin | Bin 0 -> 45 bytes .../{test.kv.txt => high_precision.kv.txt} | 0 gensim/test/test_datatype.py | 6 +++--- 3 files changed, 3 
insertions(+), 3 deletions(-) create mode 100644 gensim/test/test_data/high_precision.kv.bin rename gensim/test/test_data/{test.kv.txt => high_precision.kv.txt} (100%) diff --git a/gensim/test/test_data/high_precision.kv.bin b/gensim/test/test_data/high_precision.kv.bin new file mode 100644 index 0000000000000000000000000000000000000000..3a49c0897c016f6e6839b4a156bbe3365b51528c GIT binary patch literal 45 vcmXp~FyhKi%u7!!%Fox!(=#wsU{G+}#gMReM@D{8aVk`ZAz>H80Z%IcIU^0# literal 0 HcmV?d00001 diff --git a/gensim/test/test_data/test.kv.txt b/gensim/test/test_data/high_precision.kv.txt similarity index 100% rename from gensim/test/test_data/test.kv.txt rename to gensim/test/test_data/high_precision.kv.txt diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index a169dbb609..83892fe884 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -18,7 +18,7 @@ class TestDataType(unittest.TestCase): def load_model(self, datatype): - path = datapath('test.kv.txt') + path = datapath('high_precision.kv.txt') kv = KeyedVectors.load_word2vec_format(path, binary=False, datatype=datatype) return kv @@ -39,8 +39,8 @@ def test_low_precision(self): self.assertEqual(kv['horse.n.01'][0].dtype, np.float16) def test_type_conversion(self): - path = datapath('test.kv.txt') - binary_path = datapath('test.kv.bin') + path = datapath('high_precision.kv.txt') + binary_path = datapath('high_precision.kv.bin') model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16) model1.save_word2vec_format(binary_path, binary=True) model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True) From 6f53175a36fd030bd038d5949e0b6f11a5b17843 Mon Sep 17 00:00:00 2001 From: Pushpankar Date: Fri, 16 Feb 2018 14:46:54 +0530 Subject: [PATCH 13/13] Test type after conversion --- gensim/test/test_datatype.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/test/test_datatype.py b/gensim/test/test_datatype.py index 
83892fe884..22b278f7b5 100644 --- a/gensim/test/test_datatype.py +++ b/gensim/test/test_datatype.py @@ -45,6 +45,8 @@ def test_type_conversion(self): model1.save_word2vec_format(binary_path, binary=True) model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True) self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0])) + self.assertEqual(model1["horse.n.01"][0].dtype, np.float16) + self.assertEqual(model2["horse.n.01"][0].dtype, np.float64) if __name__ == '__main__':