Add benchmark script, fix regex, remove N806 #9

Merged: 17 commits, Feb 22, 2023
29 changes: 29 additions & 0 deletions .github/workflows/benchmark.yml
@@ -0,0 +1,29 @@
on: pull_request
name: Benchmark
permissions:
  pull-requests: write
  contents: read

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up conda env
        uses: mamba-org/provision-with-micromamba@v15
        with:
          cache-env: true
          extra-specs: |
            python=3.11
      - name: Install repository
        run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e .
      - name: Run benchmark
        shell: bash -el {0}
        run: |
          echo "_(benchmark **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_" >> benchmark.md
          python benchmark.py >> benchmark.md
      - name: Comment PR
        uses: thollander/actions-comment-pull-request@v2
        with:
          filePath: benchmark.md
          comment_tag: benchmark
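The `comment_tag` input lets the action find and update its earlier comment on later runs instead of posting a new one each time (per the action's README). To preview locally what the workflow appends to `benchmark.md`, here is a rough Python equivalent of the two run steps; the run id values are placeholders for the `${{ github.* }}` expressions above:

```python
# Rough local equivalent of the "Run benchmark" step; run_id/run_attempt are
# placeholders for the ${{ github.run_id }} / ${{ github.run_attempt }} expressions.
import subprocess

run_id, run_attempt = "1234567890", "1"
with open("benchmark.md", "w") as f:
    f.write(f"_(benchmark **{run_id}** / attempt **{run_attempt}**)_\n")
    result = subprocess.run(
        ["python", "benchmark.py"], capture_output=True, text=True, check=True
    )
    f.write(result.stdout)
```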
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
@@ -24,9 +24,8 @@ jobs:
     matrix:
       PYTHON_VERSION: ['3.8', '3.9', '3.10', '3.11']
     steps:
-      - name: Checkout branch
-        uses: actions/checkout@v3
-      - name: Set up Conda env
+      - uses: actions/checkout@v3
+      - name: Set up conda env
         uses: mamba-org/provision-with-micromamba@v15
         with:
           cache-env: true
139 changes: 139 additions & 0 deletions benchmark.py
@@ -0,0 +1,139 @@
import io
import pickle
import textwrap
import time
from typing import Callable, List

import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

from examples.utils import generate_dataset
from pickle_compression.lgbm_booster import dump_lgbm
from pickle_compression.sklearn_tree import dump_sklearn


def train_model_sklearn() -> RandomForestRegressor:
    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def train_gbdt_lgbm() -> lgb.LGBMRegressor:
    regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def train_rf_lgbm() -> lgb.LGBMRegressor:
    regressor = lgb.LGBMRegressor(
        boosting_type="rf",
        n_estimators=100,
        num_leaves=1000,
        random_state=42,
        bagging_freq=5,
        bagging_fraction=0.5,
    )
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def benchmark(func: Callable, *args, **kwargs) -> float:
    times = []
    for _ in range(10):
        start = time.perf_counter()
        func(*args, **kwargs)
        times.append(time.perf_counter() - start)
    return min(times)


def benchmark_model(name, train_func, dump_func) -> dict:
    model = train_func()

    naive_dump_time = benchmark(pickle.dumps, model)
    naive_pickled = pickle.dumps(model)
    naive_pickled_size = len(naive_pickled)
    naive_load_time = benchmark(pickle.loads, naive_pickled)

    our_dump_time = benchmark(dump_func, model, io.BytesIO())
    our_pickled_buf = io.BytesIO()
    dump_func(model, our_pickled_buf)
    our_pickled = our_pickled_buf.getvalue()
    our_pickled_size = len(our_pickled)
    our_load_time = benchmark(pickle.loads, our_pickled)
    return {
        "name": name,
        "baseline": {
            "size": naive_pickled_size,
            "dump_time": naive_dump_time,
            "load_time": naive_load_time,
        },
        "ours": {
            "size": our_pickled_size,
            "dump_time": our_dump_time,
            "load_time": our_load_time,
        },
        "change": {
            "size": naive_pickled_size / our_pickled_size,
            "dump_time": our_dump_time / naive_dump_time,
            "load_time": our_load_time / naive_load_time,
        },
    }


def format_size(n_bytes: int) -> str:
    MiB = 1024**2
    return f"{n_bytes / MiB:.1f} MiB"


def format_time(seconds: float) -> str:
    return f"{seconds:.2f} s"


def format_change(multiple: float) -> str:
    return f"{multiple:.2f} x"


def format_benchmarks_results_table(benchmark_results: List[dict]) -> str:
    header = """
    | Model | Size | Dump Time | Load Time |
    |--|--:|--:|--:|
    """

    def format_row(results):
        def format_cell(base, ours, change):
            return f"{base} / {ours} / {change}"

        column_data = [
            results["name"],
            format_cell(
                format_size(results["baseline"]["size"]),
                format_size(results["ours"]["size"]),
                format_change(results["change"]["size"]),
            ),
            format_cell(
                format_time(results["baseline"]["dump_time"]),
                format_time(results["ours"]["dump_time"]),
                format_change(results["change"]["dump_time"]),
            ),
            format_cell(
                format_time(results["baseline"]["load_time"]),
                format_time(results["ours"]["load_time"]),
                format_change(results["change"]["load_time"]),
            ),
        ]
        return " | ".join(column_data)

    formatted_rows = map(format_row, benchmark_results)

    return (textwrap.dedent(header) + "\n".join(formatted_rows)).strip()


if __name__ == "__main__":
    models_to_benchmark = [
        ("`RandomForestRegressor`", train_model_sklearn, dump_sklearn),
        ("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm),
        ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm),
    ]
    benchmark_results = [benchmark_model(*args) for args in models_to_benchmark]
    print("Base results / Our results / Change")
    print(format_benchmarks_results_table(benchmark_results))
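Since the script exposes small composable helpers, additional models can be benchmarked without touching the formatting code. A minimal sketch (hypothetical, appended to benchmark.py; it assumes `dump_sklearn` also handles `ExtraTreesRegressor`, which this PR does not itself exercise):

```python
# Hypothetical extension: benchmark one more sklearn ensemble with the same helpers.
from sklearn.ensemble import ExtraTreesRegressor


def train_extra_trees() -> ExtraTreesRegressor:
    # Assumption: dump_sklearn can compress ExtraTreesRegressor as well,
    # since it is built from the same sklearn tree objects.
    regressor = ExtraTreesRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


results = benchmark_model("`ExtraTreesRegressor`", train_extra_trees, dump_sklearn)
print(format_benchmarks_results_table([results]))
```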
6 changes: 3 additions & 3 deletions examples/pickle_lgbm.py
@@ -17,8 +17,8 @@

 def train_model() -> lgb.LGBMRegressor:
     regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
-    X_train, _, y_train, _ = generate_dataset(n_samples=10000)  # noqa: N806
-    regressor.fit(X_train, y_train)
+    X, y = generate_dataset(n_samples=10000)
+    regressor.fit(X, y)
     return regressor


@@ -44,6 +44,6 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]):
 dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model")

 evaluate_prediction_difference(
-    model, model_compressed, generate_dataset(n_samples=10000)[1]
+    model, model_compressed, generate_dataset(n_samples=10000)[0]
 )
 evaluate_compression_performance(model, dump_lgbm)
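The index flips from `[1]` to `[0]` because of the utils.py change later in this diff: `generate_dataset` no longer returns a train/test split, so the positional meaning of its elements changed. In short:

```python
# Before this PR: X_train, X_test, y_train, y_test = generate_dataset(...)
#                 -> [1] selected X_test for the prediction comparison.
# After this PR:  X, y = generate_dataset(...)
#                 -> [0] selects the full feature matrix X instead.
X = generate_dataset(n_samples=10000)[0]
```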
6 changes: 3 additions & 3 deletions examples/pickle_sklearn.py
@@ -15,8 +15,8 @@

 def train_model() -> RandomForestRegressor:
     regressor = RandomForestRegressor(n_estimators=100, random_state=42)
-    X_train, _, y_train, _ = generate_dataset(n_samples=10000)  # noqa: N806
-    regressor.fit(X_train, y_train)
+    X, y = generate_dataset(n_samples=10000)
+    regressor.fit(X, y)
     return regressor


@@ -29,6 +29,6 @@ def train_model() -> RandomForestRegressor:
 model_compressed = load_compressed(path, "no")

 evaluate_prediction_difference(
-    model, model_compressed, generate_dataset(n_samples=10000)[1]
+    model, model_compressed, generate_dataset(n_samples=10000)[0]
 )
 evaluate_compression_performance(model, dump_sklearn)
30 changes: 20 additions & 10 deletions examples/utils.py
@@ -14,16 +14,14 @@

 def generate_dataset(
     n_samples: int = 50000, n_features: int = 100
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray]:
     """Generate a dataset with 50000 samples and 100 features.

     Returns:
-        X_train (np.array): (0.8 * n_samples) x n_features
-        X_test (np.array): (0.2 * n_samples) x n_features
-        y_train (np.array): 0.8 * n_samples
-        y_test (np.array): 0.2 * n_samples
+        X (np.array): n_samples x n_features
+        y (np.array): n_samples
     """
-    X, y = make_regression(  # noqa: N806
+    X, y = make_regression(
         n_samples=n_samples,
         n_features=n_features,
         n_informative=50,
@@ -36,10 +34,22 @@ def generate_dataset(
     for i in range(0, 100, 10):
         X[:, i] = X[:, i].astype("int")

-    X_train, X_test, y_train, y_test = train_test_split(  # noqa: N806
-        X, y, test_size=0.2, random_state=42
-    )
-    return X_train, X_test, y_train, y_test
+    return X, y
+
+
+def generate_dataset_train_test(
+    n_samples: int = 50000, n_features: int = 100
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Generate a dataset and split it 80/20 into train and test sets.
+
+    Returns:
+        X_train (np.array): (0.8 * n_samples) x n_features
+        X_test (np.array): (0.2 * n_samples) x n_features
+        y_train (np.array): 0.8 * n_samples
+        y_test (np.array): 0.2 * n_samples
+    """
+    X, y = generate_dataset(n_samples, n_features)
+    return train_test_split(X, y, test_size=0.2, random_state=42)


 def evaluate_compression_performance(
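Callers can now pick whichever shape they need; a quick sketch of the two helpers after this change:

```python
from examples.utils import generate_dataset, generate_dataset_train_test

# Full dataset, as the examples and the benchmark script now consume it.
X, y = generate_dataset(n_samples=10000)

# 80/20 split, for callers that still want held-out test data.
X_train, X_test, y_train, y_test = generate_dataset_train_test(n_samples=10000)
```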
16 changes: 11 additions & 5 deletions pickle_compression/lgbm_booster.py
@@ -61,20 +61,26 @@ def _decompress_booster_state(compressed_state: dict):
 def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]:
     if not model_string.startswith("tree\nversion=v3"):
         raise ValueError("Only v3 is supported for the booster string format.")
-    FRONT_STRING_REGEX = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)"  # noqa: N806
-    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"  # noqa: N806
-    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"  # noqa: N806
+    FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
+    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"
+    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"

     def _extract_feature(feature_line):
         feat_name, values_str = feature_line.split("=")
         return feat_name, values_str.split(" ")

-    front_str = re.findall(FRONT_STRING_REGEX, model_string)[0]
+    front_str_match = re.search(FRONT_STRING_REGEX, model_string)
+    if front_str_match is None:
+        raise ValueError("Could not find front string.")
+    front_str = front_str_match.group()
     # delete tree_sizes line since this messes up the tree parsing by LightGBM if not set correctly
     # todo calculate correct tree_sizes
     front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str)

-    back_str = re.findall(BACK_STRING_REGEX, model_string)[0]
+    back_str_match = re.search(BACK_STRING_REGEX, model_string)
+    if back_str_match is None:
+        raise ValueError("Could not find back string.")
+    back_str = back_str_match.group()
     tree_matches = re.findall(TREE_GROUP_REGEX, model_string)
     trees: List[dict] = []
     for i, tree_match in enumerate(tree_matches):
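A quick illustration of both fixes on a stub model string (the stub is invented for this example; real LightGBM v3 headers carry many more key=value lines). The old front-string pattern required every header line after `tree` to contain `=`, so a bare keyword line made the pattern match nothing and `re.findall(...)[0]` die with an opaque `IndexError`; the relaxed pattern matches, and `re.search` with a `None` check fails loudly when it genuinely cannot match:

```python
import re

OLD_FRONT = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)"
NEW_FRONT = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"

# Invented stub: one header line ("feature_infos") has no "=value" part.
model_string = "tree\nversion=v3\nnum_class=1\nfeature_infos\n\nTree=0\nsplit_feature=0\n\n\n"

print(re.findall(OLD_FRONT, model_string))  # [] -> old code's [0] raised IndexError
match = re.search(NEW_FRONT, model_string)
if match is None:  # explicit, descriptive failure instead of an IndexError
    raise ValueError("Could not find front string.")
print(match.group())  # header block up to (but not including) "Tree=0"
```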
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -46,6 +46,10 @@ select = [
     # pyupgrade
     "UP"
 ]
+ignore = [
+    # variable in function should be lowercase
+    "N806",
+]

 [tool.mypy]
 python_version = "3.8"
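For context on the new ignore: ruff's pep8-naming rule N806 flags any non-lowercase variable inside a function, which clashes with the ML convention of an uppercase `X` for a 2-D feature matrix. Ignoring it globally replaces the per-line `# noqa: N806` suppressions removed throughout this PR. An illustrative snippet (names hypothetical):

```python
import numpy as np


def fit(dataset):
    # Conventional ML naming: uppercase X for the feature matrix.
    # N806 ("variable in function should be lowercase") used to require a
    # trailing "# noqa: N806" here; with the global ignore it now passes.
    X, y = dataset
    return X.shape, y.shape


fit((np.zeros((10, 3)), np.zeros(10)))
```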
8 changes: 4 additions & 4 deletions tests/test_lgbm_compression.py
@@ -15,7 +15,7 @@ def lgbm_regressor(rng):


 def test_compressed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -27,7 +27,7 @@ def test_compressed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path):


 def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     model_path_compressed = tmp_path / "model_compressed.pickle.lzma"
Expand All @@ -41,7 +41,7 @@ def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path):

 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
     factor = 7 if compression_method == "no" else 4

@@ -53,7 +53,7 @@ def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):

 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_load_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     load_time_compressed, load_time_uncompressed = get_load_times(
10 changes: 5 additions & 5 deletions tests/test_sklearn_compression.py
@@ -26,7 +26,7 @@ def decision_tree_regressor(rng):


 def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -40,7 +40,7 @@ def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
 def test_compressed_internal_structure(
     diabetes_toy_df, decision_tree_regressor, tmp_path
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     decision_tree_regressor.fit(X, y)

     model_path = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -70,7 +70,7 @@ def test_compressed_internal_structure(


 def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     model_path_dtype_reduction = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -86,7 +86,7 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
 def test_dump_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
     factor = 4 if compression_method == "no" else 1.5

@@ -100,7 +100,7 @@ def test_dump_times(
 def test_load_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     load_time_compressed, load_time_uncompressed = get_load_times(