Add throughput benchmarking for large inputs
hajimes committed Aug 22, 2024
1 parent b14abef commit ea50371
Showing 3 changed files with 108 additions and 10 deletions.
118 changes: 108 additions & 10 deletions benchmark/benchmark.py
@@ -1,6 +1,7 @@
import gc
import hashlib
import time
from collections.abc import Callable
from typing import List

import matplotlib.pyplot as plt
import mmh3
@@ -95,27 +96,46 @@ def run_timed_benchmarks(self, params):
return result


-def init_buffer(ba, size):
+def init_buffer(ba: bytearray) -> None:
"""Initializes a buffer with a pattern.
Args:
ba: The buffer to initialize.
"""
K1 = 0b1001111000110111011110011011000110000101111010111100101010000111
K2 = 0b1100001010110010101011100011110100100111110101001110101101001111
MASK = 0xFFFFFFFFFFFFFFFF
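    # K1 and K2 appear to be the 64-bit xxHash prime multipliers
    # (XXH_PRIME64_1 and XXH_PRIME64_2); repeated multiply-and-mask over
    # them yields a cheap, deterministic pseudorandom byte stream.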
acc = K2

-    for i in range(size):
+    for i in range(len(ba)):
acc = (acc * K1) & MASK
ba[i] = acc >> 56
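
# A minimal usage sketch (illustrative, not part of the module): the fill
# sequence is fixed, so equally sized buffers always receive identical
# contents and every benchmark run hashes the same data.
#
#     a = bytearray(16)
#     b = bytearray(16)
#     init_buffer(a)
#     init_buffer(b)
#     assert a == b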


-def benchmark_hash(f, size, total_microseconds, run_microseconds):
+def benchmark_hash(
+    f: Callable, size: int, total_microseconds: int, run_microseconds: int
+) -> int:
"""Benchmarks a hash function by running it on a buffer of a given size.
Args:
f: The hash function to benchmark.
size: The size of the buffer to hash.
total_microseconds: The total time to spend benchmarking.
run_microseconds: The time to spend running the hash function.
Returns:
The time taken to hash the buffer in nanoseconds.
"""

SIZE_TO_HASH_PER_ROUND = 200000
HASH_ROUNDS_MAX = 1000

MARGIN_FOR_LATENCY = 1024

source_buffer = bytearray(size + MARGIN_FOR_LATENCY)
-    init_buffer(source_buffer, size + MARGIN_FOR_LATENCY)
+    init_buffer(source_buffer)

-    number_of_blocks = (SIZE_TO_HASH_PER_ROUND / size) + 1
+    number_of_blocks = (SIZE_TO_HASH_PER_ROUND // size) + 1
number_of_blocks = min(number_of_blocks, HASH_ROUNDS_MAX)
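    # For example, size = 1024 gives (200000 // 1024) + 1 = 196 blocks per
    # round, while very small sizes are capped at HASH_ROUNDS_MAX = 1000.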

source_buffers = []
@@ -138,7 +158,18 @@ def benchmark_hash(f, size, total_microseconds, run_microseconds):
return result["nanoseconds_per_run"]
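
# An illustrative call (a sketch reusing the budget constants the callers
# below pass in, BENCH_SMALL_TOTAL_MS = 490 and BENCH_SMALL_ITERATION_MS =
# 170): benchmark one function on 1 KiB inputs, then convert the per-run
# cost to throughput; 1 byte/ns is 1 GB/s, so GB/s = size / ns.
#
#     ns = benchmark_hash(lambda x: mmh3.hash_bytes(bytes(x)), 1024, 490, 170)
#     print(f"mmh3 @ 1 KiB: {1024 / ns:.2f} GB/s")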


-def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_size_max):
+def benchmark_throughput_small_inputs(
+    hashes: List[dict], small_test_size_min: int, small_test_size_max: int
+):
"""Benchmarks the throughput of a hash function on small inputs.
Args:
hashes: The hash functions to benchmark.
small_test_size_min: The minimum size of the input.
small_test_size_max: The maximum size of the input.
Returns: A dictionary containing the results of the benchmark.
"""
BENCH_SMALL_TOTAL_MS = 490
BENCH_SMALL_ITERATION_MS = 170

@@ -147,6 +178,7 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
for h in hashes:
result = []
for i in range(small_test_size_min, small_test_size_max + 1):
print(h["name"], i)
gc.disable()
result.append(
benchmark_hash(
@@ -158,10 +190,49 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si

return data_result
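
# Shape of the result (sketch, with the defaults below): one list per hash
# name with one entry per input size, e.g.
# {"mmh3": [ns_at_1B, ..., ns_at_127B], "sha1": [...]}; the plotting code
# uses the size range as the DataFrame index.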

def benchmark_large_inputs(
    hashes: List[dict], large_test_log_min: int, large_test_log_max: int
):
"""Benchmarks the throughput of a hash function on large inputs.
Args:
hashes: The hash functions to benchmark.
large_test_log_min: The minimum log2 size of the input.
large_test_log_max: The maximum log2 size of the input.
Returns: A dictionary containing the results of the benchmark.
"""
BENCH_LARGE_TOTAL_MS = 1010
BENCH_LARGE_ITERATION_MS = 490

data_result = {}

for h in hashes:
result = []
for i in range(large_test_log_min, large_test_log_max + 1):
print(h["name"], i)
gc.disable()
result.append(
benchmark_hash(
h["function"],
1 << i,
BENCH_LARGE_TOTAL_MS,
BENCH_LARGE_ITERATION_MS,
)
)
gc.enable()
data_result[h["name"]] = result

return data_result
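
# Sketch of the sweep (with the defaults below): i iterates over log2 sizes,
# so 1 << i grows from 1 << 9 = 512 B to 1 << 27 = 128 MiB, and each result
# entry is the nanoseconds-per-run figure for one size; throughput in GB/s
# is (1 << i) / ns_per_run.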


if __name__ == "__main__":
SMALL_SIZE_MIN_DEFAULT = 1
SMALL_SIZE_MAX_DEFAULT = 127
LARGE_SIZELOG_MIN_DEFAULT = 9
LARGE_SIZELOG_MAX_DEFAULT = 27
# LARGE_SIZELOG_MIN_DEFAULT = 15
# LARGE_SIZELOG_MAX_DEFAULT = 24

HASHES = [
{"name": "mmh3", "function": lambda x: mmh3.hash_bytes(bytes(x))},
@@ -171,13 +242,40 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
{"name": "sha1", "function": lambda x: hashlib.sha1(bytes(x)).digest()},
]
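
    # Each entry pairs a display name with a callable over the buffer, so
    # adding a candidate is one more dictionary, e.g. (hypothetical):
    #
    #     {"name": "blake2b", "function": lambda x: hashlib.blake2b(bytes(x)).digest()},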

-    benchmark_results = benchmark_throughput_small_inputs(
-        HASHES, SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT
# HASHES = [
# {"name": "mmh3", "function": lambda x: mmh3.hash_bytes(bytes(x))},
# {"name": "xxhash", "function": lambda x: xxhash.xxh128(bytes(x)).digest()}
# ]

# print('Benchmarking small inputs')
# benchmark_results = benchmark_throughput_small_inputs(
# HASHES, SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT
# )

# print('Generating plot')
# df = pd.DataFrame(
# benchmark_results,
# index=list(range(SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT + 1)),
# )

# plt.figure()

# df.plot(
# logy=True,
# )

# plt.savefig("docs/images/throughput_small_inputs.png")
# plt.close("all")

print("Benchmarking large inputs")
benchmark_results = benchmark_large_inputs(
HASHES, LARGE_SIZELOG_MIN_DEFAULT, LARGE_SIZELOG_MAX_DEFAULT
)

print("Generating plot")
df = pd.DataFrame(
benchmark_results,
-        index=list(range(SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT + 1)),
+        index=list(range(LARGE_SIZELOG_MIN_DEFAULT, LARGE_SIZELOG_MAX_DEFAULT + 1)),
)

plt.figure()
@@ -186,5 +284,5 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
logy=True,
)

-    plt.savefig("docs/images/throughput_small_inputs.png")
+    plt.savefig("docs/images/throughput_large_inputs.png")
plt.close("all")
Binary file added docs/images/throughput_large_inputs.png
Binary file modified docs/images/throughput_small_inputs.png
