Skip to content

Commit

Permalink
* bug fixed: serialize simhash failed. (#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
HYLcool authored Nov 23, 2023
1 parent a26e069 commit 67026a8
Showing 1 changed file with 39 additions and 41 deletions.
80 changes: 39 additions & 41 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,45 +21,45 @@
with AvailabilityChecking(['simhash-py'], OP_NAME):
import simhash

def local_num_differing_bits(hash_a, hash_b):
"""
Local implementation of calculating the number of different bits
between two integers.
def local_num_differing_bits(hash_a, hash_b):
    """
    Local implementation of calculating the number of different bits between
    two integers.

    :param hash_a: integer hash value a
    :param hash_b: integer hash value b
    :return: number of different bits between input hashes.
    :raises ValueError: if the XOR of the two hashes is negative (exactly
        one input is negative), because such a value has no finite number
        of set bits.
    """
    diff = hash_a ^ hash_b
    if diff < 0:
        # A negative Python integer behaves like an infinite run of sign
        # bits, so clearing the lowest set bit one at a time would never
        # reach zero. Fail loudly instead of looping forever.
        raise ValueError(
            'hash_a and hash_b must have the same sign to compare their bits')
    # Every '1' in the binary rendering of the XOR marks one position where
    # the two hashes disagree.
    return bin(diff).count('1')


def num_differing_bits_selector():
    """
    Select a num_differing_bits method according to the Python version
    installed.

    When Python >= 3.9, the original simhash library cannot be compiled
    correctly due to some changes in cython. After fixing this
    incompatibility, RecursionError occurs sometimes when calling
    simhash.num_differing_bits. So we use our implementation when Python
    >= 3.9. Otherwise, we use implementation of simhash.

    :return: an available num_differing_bits function.
    """
    import sys

    # Compare the interpreter's version tuple rather than parsing the
    # version string: this needs no string splitting and keeps any major
    # version above 3 on the local implementation as well.
    if sys.version_info >= (3, 9):
        # for >= 3.9, use local implementation
        return local_num_differing_bits
    # for < 3.9, use simhash version
    return simhash.num_differing_bits
:param hash_a: integer hash value a
:param hash_b: integer hash value b
:return: number of different bits between input hashes.
"""
cnt = 0
n = hash_a ^ hash_b
while n != 0:
cnt += 1
n = n & (n - 1)
return cnt

def num_differing_bits_selector():
    """
    Select a num_differing_bits method according to the Python version
    installed.

    When Python >= 3.9, the original simhash library cannot be compiled
    correctly due to some changes in cython. After fixing this
    incompatibility, RecursionError occurs sometimes when calling
    simhash.num_differing_bits. So we use our implementation when Python
    >= 3.9. Otherwise, we use implementation of simhash.

    :return: an available num_differing_bits function.
    """
    import sys

    # Compare the interpreter's version tuple rather than parsing the
    # version string: this needs no string splitting and keeps any major
    # version above 3 on the local implementation as well.
    if sys.version_info >= (3, 9):
        # for >= 3.9, use local implementation
        return local_num_differing_bits
    # for < 3.9, use simhash version
    return simhash.num_differing_bits

# Choose the bit-difference implementation once, when this module is imported.
# NOTE(review): presumably kept at module level (not stored on the operator
# instance) so that the operator object stays serializable -- confirm.
num_differing_bits = num_differing_bits_selector()


@OPERATORS.register_module(OP_NAME)
Expand Down Expand Up @@ -114,8 +114,6 @@ def __init__(self,
self.num_blocks = num_blocks
self.hamming_distance = hamming_distance

self.num_differing_bits = num_differing_bits_selector()

def compute_hash(self, sample):
"""
Compute simhash values for the sample.
Expand Down Expand Up @@ -189,7 +187,7 @@ def process(self, dataset, show_num=0):
dist = Counter()
for x, y in matches:
graph[x][y] = graph[y][x] = True
num_diff = self.num_differing_bits(x, y)
num_diff = num_differing_bits(x, y)
dist[num_diff] += 1
logger.info(f'Hash diff distribution: {dist}')

Expand Down

0 comments on commit 67026a8

Please sign in to comment.