-
Notifications
You must be signed in to change notification settings - Fork 0
/
_pyloom.py
108 lines (79 loc) · 2.78 KB
/
_pyloom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
__doc__ = """
This module implements a bloom filter which can store arbitrary strings.
Requires:
mmh3 (https://pypi.python.org/pypi/mmh3/)
bitarray (https://pypi.python.org/pypi/bitarray/)
"""
from bitarray import bitarray
import math
import mmh3
LOG2 = math.log(2)
def murmur(key, seed=0):
    """Hash ``key`` with mmh3's murmur3 and return a 32 bit signed int.

    Args:
        key: Value to hash; forwarded unchanged to ``mmh3.hash``.
        seed: Seed forwarded to ``mmh3.hash`` (defaults to 0).
    """
    digest = mmh3.hash(key, seed)
    return digest
class BloomFilter(object):
    """Fixed-size bloom filter for string keys.

    The bit array is divided into ``_num_hashes`` slices of
    ``_bits_per_hash`` bits each, and every key maps to exactly one bit in
    each slice.

    Args:
        capacity: Number of keys the filter is sized for.
        error: Target false-positive probability used for sizing.
    """

    def __init__(self, capacity, error=0.001):
        self.capacity = capacity
        self.error = error
        # Number of keys accepted by add(); keys that already test as
        # present (including false positives) are not counted.
        self._count = 0
        self._setup()

    def _setup(self):
        """Compute the slice layout from capacity/error and allocate bits."""
        num_bits = int(-self.capacity * math.log(self.error) / (LOG2 * LOG2))
        # Clamp both values to at least 1: for loose error rates or tiny
        # capacities the rounded results can reach 0, which would otherwise
        # raise ZeroDivisionError here or in _get_hashes().
        self._num_hashes = max(
            1, int(LOG2 * (num_bits / self.capacity) + 0.5))
        self._bits_per_hash = max(
            1, int(num_bits / self._num_hashes + 0.5))
        # Recompute the total so it is an exact multiple of the slice size.
        self._num_bits = self._bits_per_hash * self._num_hashes
        self._bitarray = bitarray(self._num_bits)
        self._bitarray.setall(False)

    def add(self, key):
        """Insert ``key`` unless the filter already reports it as present."""
        if key not in self:
            for b in self._get_hashes(key):
                self._bitarray[b] = True
            self._count += 1

    def __contains__(self, key):
        """Return True if every bit selected for ``key`` is set."""
        # Use truthiness instead of ``is False``: bitarray may hand items
        # back as int 0/1 rather than the bool singletons, in which case an
        # identity check never matches and every key appears present.
        return all(self._bitarray[b] for b in self._get_hashes(key))

    def __len__(self):
        """Return the number of keys accepted by add()."""
        return self._count

    def _get_hashes(self, key):
        """Yield one bit index per slice for ``key``.

        Two murmur hashes are combined by double hashing: slice ``i`` uses
        offset ``(h1 + i * h2) % bits_per_hash`` within that slice.
        """
        h1 = murmur(key, seed=0)
        # murmur() returns a signed value; mask it into the unsigned 32-bit
        # range so it is never passed on as a negative seed.
        # NOTE(review): newer mmh3 releases appear to reject negative seeds;
        # confirm against the installed version.
        h2 = murmur(key, h1 & 0xFFFFFFFF)
        bph = self._bits_per_hash
        # doing a modulo now instead of earlier so that h2 could be seeded
        # on a larger space
        h1 %= bph
        h2 %= bph
        offset = h1
        base = 0
        for _ in range(self._num_hashes):
            yield base + offset
            base += bph
            offset += h2
            # Both operands are < bph, so a single subtraction wraps.
            if offset >= bph:
                offset -= bph
class ScalableBloomFilter(object):
    """Bloom filter that grows by chaining additional BloomFilter stages.

    When the newest stage has accepted as many keys as its capacity, a new
    stage is appended whose capacity is ``expansion_rate`` times the
    previous one and whose error is ``_error_r`` times the previous one.

    Args:
        capacity: Capacity of the first stage.
        error: Overall error rate; the first stage is created with
            ``error * (1 - _error_r)``.
        expansion_rate: Capacity multiplier applied to each new stage.
    """

    def __init__(self, capacity, error=0.001, expansion_rate=2):
        self._error_r = 0.9
        self.expansion_rate = expansion_rate
        self.error = error
        self.capacity = capacity
        first_error = error * (1 - self._error_r)
        self._bfs = [BloomFilter(capacity, first_error)]

    def __len__(self):
        """Return the combined key count of all stages."""
        total = 0
        for stage in self._bfs:
            total += len(stage)
        return total

    def _memory(self):
        """Return the combined length of every stage's bit array."""
        sizes = [len(stage._bitarray) for stage in self._bfs]
        return sum(sizes)

    def add(self, key):
        """Insert ``key`` into the newest stage, growing first if it is full."""
        if key in self:
            return
        current = self._bfs[-1]
        if len(current) >= current.capacity:
            # Newest stage is full: chain a larger, stricter one.
            current = BloomFilter(
                self.expansion_rate * current.capacity,
                self._error_r * current.error,
            )
            self._bfs.append(current)
        current.add(key)

    def __contains__(self, key):
        """Return True if any stage reports ``key``, checking newest first."""
        return any(key in stage for stage in reversed(self._bfs))