Add throughput benchmarking for large inputs
hajimes committed Aug 22, 2024
1 parent b14abef commit ea50371
Showing 3 changed files with 108 additions and 10 deletions.
118 changes: 108 additions & 10 deletions benchmark/benchmark.py
@@ -1,6 +1,7 @@
import gc
import hashlib
import time
from collections.abc import Callable
from typing import List

import matplotlib.pyplot as plt
import mmh3
@@ -95,27 +96,46 @@ def run_timed_benchmarks(self, params):
return result


-def init_buffer(ba, size):
+def init_buffer(ba: bytearray) -> None:
"""Initializes a buffer with a pattern.
Args:
ba: The buffer to initialize.
"""
K1 = 0b1001111000110111011110011011000110000101111010111100101010000111
K2 = 0b1100001010110010101011100011110100100111110101001110101101001111
MASK = 0xFFFFFFFFFFFFFFFF
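    # K1 and K2 appear to be the 64-bit xxHash prime multipliers
    # (XXH_PRIME64_1 and XXH_PRIME64_2); repeated multiply-and-mask over
    # them yields a cheap, deterministic pseudorandom byte stream.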
acc = K2

-    for i in range(size):
+    for i in range(len(ba)):
acc = (acc * K1) & MASK
ba[i] = acc >> 56
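
# A minimal usage sketch (illustrative, not part of the module): the fill
# sequence is fixed, so equally sized buffers always receive identical
# contents and every benchmark run hashes the same data.
#
#     a = bytearray(16)
#     b = bytearray(16)
#     init_buffer(a)
#     init_buffer(b)
#     assert a == b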


-def benchmark_hash(f, size, total_microseconds, run_microseconds):
+def benchmark_hash(
+    f: Callable, size: int, total_microseconds: int, run_microseconds: int
+) -> int:
"""Benchmarks a hash function by running it on a buffer of a given size.
Args:
f: The hash function to benchmark.
size: The size of the buffer to hash.
total_microseconds: The total time to spend benchmarking.
run_microseconds: The time to spend running the hash function.
Returns:
The time taken to hash the buffer in nanoseconds.
"""

SIZE_TO_HASH_PER_ROUND = 200000
HASH_ROUNDS_MAX = 1000

MARGIN_FOR_LATENCY = 1024

source_buffer = bytearray(size + MARGIN_FOR_LATENCY)
-    init_buffer(source_buffer, size + MARGIN_FOR_LATENCY)
+    init_buffer(source_buffer)

-    number_of_blocks = (SIZE_TO_HASH_PER_ROUND / size) + 1
+    number_of_blocks = (SIZE_TO_HASH_PER_ROUND // size) + 1
number_of_blocks = min(number_of_blocks, HASH_ROUNDS_MAX)
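    # For example, size = 1024 gives (200000 // 1024) + 1 = 196 blocks per
    # round, while very small sizes are capped at HASH_ROUNDS_MAX = 1000.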

source_buffers = []
@@ -138,7 +158,18 @@ def benchmark_hash(f, size, total_microseconds, run_microseconds):
return result["nanoseconds_per_run"]
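
# An illustrative call (a sketch reusing the budget constants the callers
# below pass in, BENCH_SMALL_TOTAL_MS = 490 and BENCH_SMALL_ITERATION_MS =
# 170): benchmark one function on 1 KiB inputs, then convert the per-run
# cost to throughput; 1 byte/ns is 1 GB/s, so GB/s = size / ns.
#
#     ns = benchmark_hash(lambda x: mmh3.hash_bytes(bytes(x)), 1024, 490, 170)
#     print(f"mmh3 @ 1 KiB: {1024 / ns:.2f} GB/s")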


-def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_size_max):
+def benchmark_throughput_small_inputs(
+    hashes: List[dict], small_test_size_min: int, small_test_size_max: int
+):
"""Benchmarks the throughput of a hash function on small inputs.
Args:
hashes: The hash functions to benchmark.
small_test_size_min: The minimum size of the input.
small_test_size_max: The maximum size of the input.
Returns: A dictionary containing the results of the benchmark.
"""
BENCH_SMALL_TOTAL_MS = 490
BENCH_SMALL_ITERATION_MS = 170

@@ -147,6 +178,7 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
for h in hashes:
result = []
for i in range(small_test_size_min, small_test_size_max + 1):
print(h["name"], i)
gc.disable()
result.append(
benchmark_hash(
@@ -158,10 +190,49 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si

return data_result
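
# Shape of the result (sketch, with the defaults below): one list per hash
# name with one entry per input size, e.g.
# {"mmh3": [ns_at_1B, ..., ns_at_127B], "sha1": [...]}; the plotting code
# uses the size range as the DataFrame index.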

def benchmark_large_inputs(
    hashes: List[dict], large_test_log_min: int, large_test_log_max: int
):
"""Benchmarks the throughput of a hash function on large inputs.
Args:
hashes: The hash functions to benchmark.
large_test_log_min: The minimum log2 size of the input.
large_test_log_max: The maximum log2 size of the input.
Returns: A dictionary containing the results of the benchmark.
"""
BENCH_LARGE_TOTAL_MS = 1010
BENCH_LARGE_ITERATION_MS = 490

data_result = {}

for h in hashes:
result = []
for i in range(large_test_log_min, large_test_log_max + 1):
print(h["name"], i)
gc.disable()
result.append(
benchmark_hash(
h["function"],
1 << i,
BENCH_LARGE_TOTAL_MS,
BENCH_LARGE_ITERATION_MS,
)
)
gc.enable()
data_result[h["name"]] = result

return data_result
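
# Sketch of the sweep (with the defaults below): i iterates over log2 sizes,
# so 1 << i grows from 1 << 9 = 512 B to 1 << 27 = 128 MiB, and each result
# entry is the nanoseconds-per-run figure for one size; throughput in GB/s
# is (1 << i) / ns_per_run.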


if __name__ == "__main__":
SMALL_SIZE_MIN_DEFAULT = 1
SMALL_SIZE_MAX_DEFAULT = 127
LARGE_SIZELOG_MIN_DEFAULT = 9
LARGE_SIZELOG_MAX_DEFAULT = 27
# LARGE_SIZELOG_MIN_DEFAULT = 15
# LARGE_SIZELOG_MAX_DEFAULT = 24

HASHES = [
{"name": "mmh3", "function": lambda x: mmh3.hash_bytes(bytes(x))},
@@ -171,13 +242,40 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
{"name": "sha1", "function": lambda x: hashlib.sha1(bytes(x)).digest()},
]
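
    # Each entry pairs a display name with a callable over the buffer, so
    # adding a candidate is one more dictionary, e.g. (hypothetical):
    #
    #     {"name": "blake2b", "function": lambda x: hashlib.blake2b(bytes(x)).digest()},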

-    benchmark_results = benchmark_throughput_small_inputs(
-        HASHES, SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT
# HASHES = [
# {"name": "mmh3", "function": lambda x: mmh3.hash_bytes(bytes(x))},
# {"name": "xxhash", "function": lambda x: xxhash.xxh128(bytes(x)).digest()}
# ]

# print('Benchmarking small inputs')
# benchmark_results = benchmark_throughput_small_inputs(
# HASHES, SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT
# )

# print('Generating plot')
# df = pd.DataFrame(
# benchmark_results,
# index=list(range(SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT + 1)),
# )

# plt.figure()

# df.plot(
# logy=True,
# )

# plt.savefig("docs/images/throughput_small_inputs.png")
# plt.close("all")

print("Benchmarking large inputs")
benchmark_results = benchmark_large_inputs(
HASHES, LARGE_SIZELOG_MIN_DEFAULT, LARGE_SIZELOG_MAX_DEFAULT
)

print("Generating plot")
df = pd.DataFrame(
benchmark_results,
-        index=list(range(SMALL_SIZE_MIN_DEFAULT, SMALL_SIZE_MAX_DEFAULT + 1)),
+        index=list(range(LARGE_SIZELOG_MIN_DEFAULT, LARGE_SIZELOG_MAX_DEFAULT + 1)),
)

plt.figure()
@@ -186,5 +284,5 @@ def benchmark_throughput_small_inputs(hashes, small_test_size_min, small_test_si
logy=True,
)

-    plt.savefig("docs/images/throughput_small_inputs.png")
+    plt.savefig("docs/images/throughput_large_inputs.png")
plt.close("all")
Binary file added docs/images/throughput_large_inputs.png
Binary file modified docs/images/throughput_small_inputs.png
