diff --git a/.gitignore b/.gitignore index 93135d8..378b372 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ build dist ImageHash.egg-info/ .eggs -.DS_Store \ No newline at end of file +.DS_Store +.python-version diff --git a/imagehash/__init__.py b/imagehash/__init__.py index 0dc69b8..5da7efd 100644 --- a/imagehash/__init__.py +++ b/imagehash/__init__.py @@ -41,24 +41,14 @@ __version__ = open(os.path.join(os.path.abspath( os.path.dirname(__file__)), 'VERSION')).read().strip() + def _binary_array_to_hex(arr): """ internal function to make a hex string out of a binary array. - - binary array might be created from comparison - for example, in - average hash, each pixel in the image is compared with the average pixel value. - If the pixel's value is less than the average it gets a 0 and if it's more it gets a 1. - Then we treat this like a string of bits and convert it to hexadecimal. """ - h = 0 - s = [] - for i, v in enumerate(arr.flatten()): - if v: - h += 2**(i % 8) - if (i % 8) == 7: - s.append(hex(h)[2:].rjust(2, '0')) - h = 0 - return "".join(s) + bit_string = ''.join(str(b) for b in 1 * arr.flatten()) + width = int(numpy.ceil(len(bit_string)/4)) + return '{:0>{width}x}'.format(int(bit_string, 2), width=width) class ImageHash(object): @@ -96,10 +86,29 @@ def __hash__(self): return sum([2**(i % 8) for i, v in enumerate(self.hash.flatten()) if v]) -def hex_to_hash(hexstr, hash_size=8): +def hex_to_hash(hexstr): """ Convert a stored hash (hex, as retrieved from str(Imagehash)) back to a Imagehash object. + + Notes: + 1. This algorithm assumes all hashes are bidimensional arrays + with dimensions hash_size * hash_size. + 2. This algorithm does not work for hash_size < 2. + """ + hash_size = int(numpy.sqrt(len(hexstr)*4)) + binary_array = '{:0>{width}b}'.format(int(hexstr, 16), width = hash_size * hash_size) + bit_rows = [binary_array[i:i+hash_size] for i in range(0, len(binary_array), hash_size)] + hash_array = numpy.array([[bool(int(d)) for d in row] for row in bit_rows]) + return ImageHash(hash_array) + +def old_hex_to_hash(hexstr, hash_size=8): + """ + Convert a stored hash (hex, as retrieved from str(Imagehash)) + back to a Imagehash object. This method should be used for + hashes generated by ImageHash up to version 3.7. For hashes + generated by newer versions of ImageHash, hex_to_hash should + be used instead. """ l = [] count = hash_size * (hash_size // 4) @@ -123,8 +132,8 @@ def average_hash(image, hash_size=8): @image must be a PIL instance. """ - if hash_size < 0: - raise ValueError("Hash size must be positive") + if hash_size < 2: + raise ValueError("Hash size must be greater than or equal to 2") # reduce size and complexity, then covert to grayscale image = image.convert("L").resize((hash_size, hash_size), Image.ANTIALIAS) @@ -147,8 +156,8 @@ def phash(image, hash_size=8, highfreq_factor=4): @image must be a PIL instance. """ - if hash_size < 0: - raise ValueError("Hash size must be positive") + if hash_size < 2: + raise ValueError("Hash size must be greater than or equal to 2") import scipy.fftpack img_size = hash_size * highfreq_factor @@ -191,8 +200,8 @@ def dhash(image, hash_size=8): @image must be a PIL instance. """ # resize(w, h), but numpy.array((h, w)) - if hash_size < 0: - raise ValueError("Hash size must be positive") + if hash_size < 2: + raise ValueError("Hash size must be greater than or equal to 2") image = image.convert("L").resize((hash_size + 1, hash_size), Image.ANTIALIAS) pixels = numpy.asarray(image) diff --git a/imagehash/tests/__init__.py b/imagehash/tests/__init__.py index 13ab8f0..6f2f0a8 100644 --- a/imagehash/tests/__init__.py +++ b/imagehash/tests/__init__.py @@ -41,24 +41,27 @@ def check_hash_algorithm(self, func, image): distance)) self.assertTrue(distance > 10, emsg) - def check_hash_length(self, func, image, sizes): + def check_hash_length(self, func, image, sizes=range(2,21)): for hash_size in sizes: image_hash = func(image, hash_size=hash_size) emsg = 'hash_size={} is not respected'.format(hash_size) self.assertEqual(image_hash.hash.size, hash_size**2, emsg) - def check_hash_stored(self, func, image): - image_hash = func(image) - other_hash = imagehash.hex_to_hash(str(image_hash)) - emsg = 'stringified hash {} != original hash {}'.format(other_hash, - image_hash) - self.assertEqual(image_hash, other_hash, emsg) - distance = image_hash - other_hash - emsg = ('unexpected hamming distance {}: original hash {} ' - '- stringified hash {}'.format(distance, image_hash, - other_hash)) - self.assertEqual(distance, 0, emsg) + def check_hash_stored(self, func, image, sizes=range(2,21)): + for hash_size in sizes: + image_hash = func(image, hash_size) + other_hash = imagehash.hex_to_hash(str(image_hash)) + emsg = 'stringified hash {} != original hash {}'.format(other_hash, + image_hash) + self.assertEqual(image_hash, other_hash, emsg) + distance = image_hash - other_hash + emsg = ('unexpected hamming distance {}: original hash {} ' + '- stringified hash {}'.format(distance, image_hash, + other_hash)) + self.assertEqual(distance, 0, emsg) + + def check_hash_size(self, func, image, sizes=range(-1,2)): + for hash_size in sizes: + with self.assertRaises(ValueError): + func(image, hash_size) - def check_hash_size(self, func, image, size): - with self.assertRaises(ValueError): - func(image, -1) diff --git a/imagehash/tests/test_average_hash.py b/imagehash/tests/test_average_hash.py index bf26323..5664e28 100644 --- a/imagehash/tests/test_average_hash.py +++ b/imagehash/tests/test_average_hash.py @@ -16,13 +16,13 @@ def test_average_hash(self): self.check_hash_algorithm(self.func, self.image) def test_average_hash_length(self): - self.check_hash_length(self.func, self.image, [8, 20]) + self.check_hash_length(self.func, self.image) def test_average_hash_stored(self): self.check_hash_stored(self.func, self.image) def test_average_hash_size(self): - self.check_hash_size(self.func, self.image, -1) + self.check_hash_size(self.func, self.image) diff --git a/imagehash/tests/test_dhash.py b/imagehash/tests/test_dhash.py index 60179db..60ea960 100644 --- a/imagehash/tests/test_dhash.py +++ b/imagehash/tests/test_dhash.py @@ -16,13 +16,13 @@ def test_dhash(self): self.check_hash_algorithm(self.func, self.image) def test_dhash_length(self): - self.check_hash_length(self.func, self.image, [8, 20]) + self.check_hash_length(self.func, self.image) def test_dhash_stored(self): self.check_hash_stored(self.func, self.image) def test_dhash_size(self): - self.check_hash_size(self.func, self.image, -1) + self.check_hash_size(self.func, self.image) if __name__ == '__main__': unittest.main() diff --git a/imagehash/tests/test_hex_conversions.py b/imagehash/tests/test_hex_conversions.py new file mode 100644 index 0000000..1708f6f --- /dev/null +++ b/imagehash/tests/test_hex_conversions.py @@ -0,0 +1,99 @@ +import unittest +import numpy as np +import imagehash + + +# Each row is a test case where the first value is a bit sequence and +# the second value is the expected hexadecimal representation for it. +binary_to_hexadecimal_values = [ + ['1', '1'], + ['11', '3'], + ['111', '7'], + ['1111', 'f'], + ['10000', '10'], + ['110000', '30'], + ['1110000', '70'], + ['11110000', 'f0'], + ['00001', '01'], + ['000011', '03'], + ['0000111', '07'], + ['00001111', '0f'], + ['10000001', '81'], + ['00000000000000001', '00001'], + ['000000000000000011', '00003'], + ['0000000000000000111', '00007'], + ['00000000000000001111', '0000f'], + ['11110000111100001111', 'f0f0f'], + ['00001111000011110000', '0f0f0'], + ['11110000000100100011010001010110011110001001101010111100110111101111', 'f0123456789abcdef'], + ['1001111000111100110000011111000011110000110000111110011111000000', '9e3cc1f0f0c3e7c0'], + ['1000111100001111000011110000111100001111000010110000101101111010', '8f0f0f0f0f0b0b7a'], +] + +# Each row is a test case where the first value is a hexadecimal sequence +# and the second value is the expected binary representation for it. +hexadecimal_to_binary_values = [ + ['1', '0001'], + ['2', '0010'], + ['3', '0011'], + ['a', '1010'], + ['f', '1111'], + ['101', '100000001'], + ['1b1', '110110001'], + ['0b1', '010110001'], + ['f0f0', '1111000011110000'], + ['0f0f', '0000111100001111'], + ['000c', '0000000000001100'], + ['100000d', '1000000000000000000001101'], + ['000000d', '0000000000000000000001101'], + ['000000001', '000000000000000000000000000000000001'], + ['800000001', '100000000000000000000000000000000001'], + ['0000000000001', '0000000000000000000000000000000000000000000000001'], + ['1000000000001', '1000000000000000000000000000000000000000000000001'], + ['0000000000000001', '0000000000000000000000000000000000000000000000000000000000000001'], + ['8000000000000001', '1000000000000000000000000000000000000000000000000000000000000001'], +] + +# Each row is a test case where the first value is a hexadecimal +# sequence and the second value is the expected bool array for it. +hexadecimal_to_bool_array = [ + ['9e3cc1f0f0c3e7c0', np.array([ [True, False, False, True, True, True, True, False], + [False, False, True, True, True, True, False, False], + [True, True, False, False, False, False, False, True], + [True, True, True, True, False, False, False, False], + [True, True, True, True, False, False, False, False], + [True, True, False, False, False, False, True, True], + [True, True, True, False, False, True, True, True], + [True, True, False, False, False, False, False, False]]) ], +] + +class TestHexConversions(unittest.TestCase): + + def setUp(self): + self.to_hex = imagehash._binary_array_to_hex + self.from_hex = imagehash.hex_to_hash + + def test_binary_array_to_hex_input(self): + for case in hexadecimal_to_bool_array: + self.assertEqual(case[0], self.to_hex(case[1])) + + def test_hex_to_hash_output(self): + for case in hexadecimal_to_bool_array: + self.assertTrue(np.array_equal(case[1], self.from_hex(case[0]).hash)) + + def test_conversion_to_hex(self): + for case in binary_to_hexadecimal_values: + expected = case[1] + bit_array = np.array([int(d) for d in case[0]]) + result = self.to_hex(bit_array) + self.assertEqual(expected, result) + + def test_conversion_from_hex(self): + for case in hexadecimal_to_binary_values: + expected = case[1] + result = ''.join(str(b) for b in 1 * self.from_hex(case[0]).hash.flatten()) + self.assertEqual(expected, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/imagehash/tests/test_old_hex_conversions.py b/imagehash/tests/test_old_hex_conversions.py new file mode 100644 index 0000000..42d82e8 --- /dev/null +++ b/imagehash/tests/test_old_hex_conversions.py @@ -0,0 +1,30 @@ +import unittest +import numpy as np +import imagehash + + +# Each row is a test case where the first value is a hexadecimal +# sequence and the second value is the expected bool array for it. +old_hexadecimal_to_bool_array = [ + ['ffeb89818193ffff', np.array([ [True, True, True, True, True, True, True, True], + [True, True, False, True, False, True, True, True], + [True, False, False, True, False, False, False, True], + [True, False, False, False, False, False, False, True], + [True, False, False, False, False, False, False, True], + [True, True, False, False, True, False, False, True], + [True, True, True, True, True, True, True, True], + [True, True, True, True, True, True, True, True]]) ], +] + +class TestOldHexConversions(unittest.TestCase): + + def setUp(self): + self.from_hex = imagehash.old_hex_to_hash + + def test_hex_to_hash_output(self): + for case in old_hexadecimal_to_bool_array: + self.assertTrue(np.array_equal(case[1], self.from_hex(case[0]).hash)) + + +if __name__ == '__main__': + unittest.main() diff --git a/imagehash/tests/test_phash.py b/imagehash/tests/test_phash.py index 9061753..3fc5f13 100644 --- a/imagehash/tests/test_phash.py +++ b/imagehash/tests/test_phash.py @@ -16,13 +16,13 @@ def test_phash(self): self.check_hash_algorithm(self.func, self.image) def test_phash_length(self): - self.check_hash_length(self.func, self.image, [8, 20]) + self.check_hash_length(self.func, self.image) def test_phash_stored(self): self.check_hash_stored(self.func, self.image) def test_phash_size(self): - self.check_hash_size(self.func, self.image, -1) + self.check_hash_size(self.func, self.image) if __name__ == '__main__': unittest.main() diff --git a/imagehash/tests/test_whash.py b/imagehash/tests/test_whash.py index 3134ef9..0c518f1 100644 --- a/imagehash/tests/test_whash.py +++ b/imagehash/tests/test_whash.py @@ -5,6 +5,23 @@ import unittest import imagehash +import imagehash.tests as tests + + +class TestBasic(tests.TestImageHash): + + def setUp(self): + self.image = self.get_data_image() + self.func = imagehash.whash + + def test_whash(self): + self.check_hash_algorithm(self.func, self.image) + + def test_whash_length(self): + self.check_hash_length(self.func, self.image, sizes=[2,4,8,16,32,64]) + + def test_whash_stored(self): + self.check_hash_stored(self.func, self.image, sizes=[2,4,8,16,32,64]) class Test(unittest.TestCase): @@ -57,5 +74,6 @@ def test_image_scale_not_2power(self): with six.assertRaisesRegex(self, AssertionError, emsg): imagehash.whash(self.image, image_scale=image_scale+1) + if __name__ == '__main__': unittest.main()