From 2bfcacf7ded9c40d0f0fce3e7361a20513c11c15 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Tue, 21 Feb 2023 23:15:51 +0200 Subject: [PATCH 01/16] Add benchmark script --- benchmark.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..666e381 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,102 @@ +import io +import pickle +import textwrap +import time +from typing import Callable, List + +import lightgbm as lgb +from sklearn.ensemble import RandomForestRegressor + +from examples.utils import load_data +from pickle_compression.lgbm_booster import dump_lgbm +from pickle_compression.sklearn_tree import dump_sklearn + + +def train_model_sklearn() -> RandomForestRegressor: + regressor = RandomForestRegressor(n_estimators=100, random_state=42) + regressor.fit(*load_data()) + return regressor + + +def train_model_lgbm() -> lgb.LGBMRegressor: + regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42) + regressor.fit(*load_data()) + return regressor + + +def benchmark(func: Callable, *args, **kwargs) -> float: + times = [] + for _ in range(10): + start = time.perf_counter() + func(*args, **kwargs) + times.append(time.perf_counter() - start) + return min(times) + + +def benchmark_model(name, train_func, dump_func) -> dict: + model = train_func() + naive_pickled = pickle.dumps(model) + naive_pickled_size = len(naive_pickled) + naive_load_time = benchmark(pickle.loads, naive_pickled) + our_pickled_buf = io.BytesIO() + dump_func(model, our_pickled_buf) + our_pickled = our_pickled_buf.getvalue() + our_pickled_size = len(our_pickled) + our_load_time = benchmark(pickle.loads, our_pickled) + return { + "name": name, + "baseline": { + "size": naive_pickled_size, + "load_time": naive_load_time, + }, + "ours": {"size": our_pickled_size, "load_time": our_load_time}, + "change": { + "size": naive_pickled_size / our_pickled_size, + "load_time": our_load_time / naive_load_time, + }, + } + + +def format_size(n_bytes: int) -> str: + MiB = 1024**2 + return f"{n_bytes/MiB:.1f} MiB" + + +def format_time(seconds: float) -> str: + return f"{seconds:.1f} s" + + +def format_change(multiple: float) -> str: + return f"{multiple:.1f} x" + + +def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: + header = f""" + | Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | Our Loading Time | Slowdown | + |--|--:|--:|--:|--:|--:|--:| + """ + + def format_row(results): + column_data = [ + results["name"], + format_size(results["baseline"]["size"]), + format_size(results["ours"]["size"]), + format_change(results["change"]["size"]), + format_time(results["baseline"]["load_time"]), + format_time(results["ours"]["load_time"]), + format_change(results["change"]["load_time"]), + ] + return " | ".join(map(str, column_data)) + + formatted_rows = map(format_row, benchmark_results) + + return (textwrap.dedent(header) + "\n".join(formatted_rows)).strip() + + +if __name__ == "__main__": + models_to_benchmark = [ + ("sklearn `RandomForestRegressor`", train_model_sklearn, dump_sklearn), + ("lightgbm `LGBMRegressor`", train_model_lgbm, dump_lgbm), + ] + benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] + print(format_benchmarks_results_table(benchmark_results)) From 84bf258e798f85fbb345fbac06294a2a8eab459e Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 13:33:01 +0100 Subject: [PATCH 02/16] WIP --- benchmark.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/benchmark.py b/benchmark.py index 666e381..9f01771 100644 --- a/benchmark.py +++ b/benchmark.py @@ -5,22 +5,42 @@ from typing import Callable, List import lightgbm as lgb +import numpy as np from sklearn.ensemble import RandomForestRegressor -from examples.utils import load_data +from examples.generate_data import generate_dataset from pickle_compression.lgbm_booster import dump_lgbm from pickle_compression.sklearn_tree import dump_sklearn +def rng(): + return np.random.RandomState(42) + + def train_model_sklearn() -> RandomForestRegressor: - regressor = RandomForestRegressor(n_estimators=100, random_state=42) - regressor.fit(*load_data()) + regressor = RandomForestRegressor( + n_estimators=100, random_state=rng(), max_leaf_nodes=200 + ) + X_train, X_test, y_train, y_test = generate_dataset() + regressor.fit(X_train, y_train) + print("sklearn score", regressor.score(X_test, y_test)) return regressor -def train_model_lgbm() -> lgb.LGBMRegressor: - regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42) - regressor.fit(*load_data()) +# def train_model_lgbm() -> lgb.LGBMRegressor: +def train_model_lgbm(): + # regressor = lgb.LGBMRegressor(n_estimators=100, random_state=rng()) + regressor = lgb.LGBMRegressor( + boosting_type="rf", + n_estimators=100, + num_leaves=6300, + random_state=rng(), + bagging_freq=5, + bagging_fraction=0.5, + ) + X_train, X_test, y_train, y_test = generate_dataset() + regressor.fit(X_train, y_train) + print("lgbm score", regressor.score(X_test, y_test)) return regressor @@ -71,7 +91,7 @@ def format_change(multiple: float) -> str: def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: - header = f""" + header = """ | Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | Our Loading Time | Slowdown | |--|--:|--:|--:|--:|--:|--:| """ From 6f1bfa63fe6c6b3e0db1782a9ce7a107519fb939 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 15:12:08 +0100 Subject: [PATCH 03/16] Fix front string regex --- benchmark.py | 4 ++-- pickle_compression/lgbm_booster.py | 2 +- pyproject.toml | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmark.py b/benchmark.py index 9f01771..48222e5 100644 --- a/benchmark.py +++ b/benchmark.py @@ -8,7 +8,7 @@ import numpy as np from sklearn.ensemble import RandomForestRegressor -from examples.generate_data import generate_dataset +from examples.utils import generate_dataset from pickle_compression.lgbm_booster import dump_lgbm from pickle_compression.sklearn_tree import dump_sklearn @@ -33,7 +33,7 @@ def train_model_lgbm(): regressor = lgb.LGBMRegressor( boosting_type="rf", n_estimators=100, - num_leaves=6300, + num_leaves=1000, random_state=rng(), bagging_freq=5, bagging_fraction=0.5, diff --git a/pickle_compression/lgbm_booster.py b/pickle_compression/lgbm_booster.py index 9fd9797..41f5ffb 100644 --- a/pickle_compression/lgbm_booster.py +++ b/pickle_compression/lgbm_booster.py @@ -61,7 +61,7 @@ def _decompress_booster_state(compressed_state: dict): def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]: if not model_string.startswith("tree\nversion=v3"): raise ValueError("Only v3 is supported for the booster string format.") - FRONT_STRING_REGEX = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)" # noqa: N806 + FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)" # noqa: N806 BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*" # noqa: N806 TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n" # noqa: N806 diff --git a/pyproject.toml b/pyproject.toml index 2ff064f..78158f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,10 @@ select = [ # pyupgrade "UP" ] +ignore = [ + # variable in function should be lowercase + "N806", +] [tool.mypy] python_version = "3.8" From a2944f392e602690bce1b20878882d514a2b4797 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 15:16:25 +0100 Subject: [PATCH 04/16] Remove N806 --- examples/pickle_sklearn.py | 2 +- pickle_compression/lgbm_booster.py | 6 +++--- tests/test_lgbm_compression.py | 8 ++++---- tests/test_sklearn_compression.py | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/pickle_sklearn.py b/examples/pickle_sklearn.py index f7862e4..8975b5a 100644 --- a/examples/pickle_sklearn.py +++ b/examples/pickle_sklearn.py @@ -15,7 +15,7 @@ def train_model() -> RandomForestRegressor: regressor = RandomForestRegressor(n_estimators=100, random_state=42) - X_train, _, y_train, _ = generate_dataset(n_samples=10000) # noqa: N806 + X_train, _, y_train, _ = generate_dataset(n_samples=10000) regressor.fit(X_train, y_train) return regressor diff --git a/pickle_compression/lgbm_booster.py b/pickle_compression/lgbm_booster.py index 41f5ffb..5e3f089 100644 --- a/pickle_compression/lgbm_booster.py +++ b/pickle_compression/lgbm_booster.py @@ -61,9 +61,9 @@ def _decompress_booster_state(compressed_state: dict): def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]: if not model_string.startswith("tree\nversion=v3"): raise ValueError("Only v3 is supported for the booster string format.") - FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)" # noqa: N806 - BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*" # noqa: N806 - TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n" # noqa: N806 + FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)" + BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*" + TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n" def _extract_feature(feature_line): feat_name, values_str = feature_line.split("=") diff --git a/tests/test_lgbm_compression.py b/tests/test_lgbm_compression.py index f69f2af..4db89e1 100644 --- a/tests/test_lgbm_compression.py +++ b/tests/test_lgbm_compression.py @@ -15,7 +15,7 @@ def lgbm_regressor(rng): def test_compresed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df lgbm_regressor.fit(X, y) model_path = tmp_path / "model_compressed.pickle.lzma" @@ -27,7 +27,7 @@ def test_compresed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path): def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df lgbm_regressor.fit(X, y) model_path_compressed = tmp_path / "model_compressed.pickle.lzma" @@ -41,7 +41,7 @@ def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path): @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"]) def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df lgbm_regressor.fit(X, y) factor = 7 if compression_method == "no" else 4 @@ -53,7 +53,7 @@ def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_metho @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"]) def test_load_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df lgbm_regressor.fit(X, y) load_time_compressed, load_time_uncompressed = get_load_times( diff --git a/tests/test_sklearn_compression.py b/tests/test_sklearn_compression.py index 6163943..2157779 100644 --- a/tests/test_sklearn_compression.py +++ b/tests/test_sklearn_compression.py @@ -26,7 +26,7 @@ def decision_tree_regressor(rng): def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df random_forest_regressor.fit(X, y) model_path = tmp_path / "model_compressed.pickle.lzma" @@ -40,7 +40,7 @@ def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_pa def test_compressed_internal_structure( diabetes_toy_df, decision_tree_regressor, tmp_path ): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df decision_tree_regressor.fit(X, y) model_path = tmp_path / "model_dtype_reduction.pickle.lzma" @@ -70,7 +70,7 @@ def test_compressed_internal_structure( def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df random_forest_regressor.fit(X, y) model_path_dtype_reduction = tmp_path / "model_dtype_reduction.pickle.lzma" @@ -86,7 +86,7 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path): def test_dump_times( diabetes_toy_df, random_forest_regressor, tmp_path, compression_method ): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df random_forest_regressor.fit(X, y) factor = 4 if compression_method == "no" else 1.5 @@ -100,7 +100,7 @@ def test_dump_times( def test_load_times( diabetes_toy_df, random_forest_regressor, tmp_path, compression_method ): - X, y = diabetes_toy_df # noqa: N806 + X, y = diabetes_toy_df random_forest_regressor.fit(X, y) load_time_compressed, load_time_uncompressed = get_load_times( From 5186bb5ebfb098abbeb64f36521260f52fe563c8 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 15:18:55 +0100 Subject: [PATCH 05/16] Fix code style --- benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark.py b/benchmark.py index 48222e5..ce31719 100644 --- a/benchmark.py +++ b/benchmark.py @@ -91,10 +91,10 @@ def format_change(multiple: float) -> str: def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: - header = """ - | Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | Our Loading Time | Slowdown | - |--|--:|--:|--:|--:|--:|--:| - """ + header = ( + "| Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | " + "Our Loading Time | Slowdown |\n|--|--:|--:|--:|--:|--:|--:|" + ) def format_row(results): column_data = [ From 2ff73d823b7fdf3cebd39194496288299805f4cf Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 16:18:19 +0100 Subject: [PATCH 06/16] Small changes --- benchmark.py | 29 ++++++++++++++--------------- examples/pickle_lgbm.py | 6 +++--- examples/pickle_sklearn.py | 6 +++--- examples/utils.py | 27 ++++++++++++++++++++------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/benchmark.py b/benchmark.py index ce31719..a3b35bd 100644 --- a/benchmark.py +++ b/benchmark.py @@ -18,18 +18,18 @@ def rng(): def train_model_sklearn() -> RandomForestRegressor: - regressor = RandomForestRegressor( - n_estimators=100, random_state=rng(), max_leaf_nodes=200 - ) - X_train, X_test, y_train, y_test = generate_dataset() - regressor.fit(X_train, y_train) - print("sklearn score", regressor.score(X_test, y_test)) + regressor = RandomForestRegressor(n_estimators=100, random_state=rng()) + regressor.fit(*generate_dataset(n_samples=10000)) + return regressor + + +def train_gbdt_lgbm() -> lgb.LGBMRegressor: + regressor = lgb.LGBMRegressor(n_estimators=100, random_state=rng()) + regressor.fit(*generate_dataset(n_samples=10000)) return regressor -# def train_model_lgbm() -> lgb.LGBMRegressor: -def train_model_lgbm(): - # regressor = lgb.LGBMRegressor(n_estimators=100, random_state=rng()) +def train_rf_lgbm() -> lgb.LGBMRegressor: regressor = lgb.LGBMRegressor( boosting_type="rf", n_estimators=100, @@ -38,9 +38,7 @@ def train_model_lgbm(): bagging_freq=5, bagging_fraction=0.5, ) - X_train, X_test, y_train, y_test = generate_dataset() - regressor.fit(X_train, y_train) - print("lgbm score", regressor.score(X_test, y_test)) + regressor.fit(*generate_dataset(n_samples=10000)) return regressor @@ -79,7 +77,7 @@ def benchmark_model(name, train_func, dump_func) -> dict: def format_size(n_bytes: int) -> str: MiB = 1024**2 - return f"{n_bytes/MiB:.1f} MiB" + return f"{n_bytes / MiB:.1f} MiB" def format_time(seconds: float) -> str: @@ -93,7 +91,7 @@ def format_change(multiple: float) -> str: def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: header = ( "| Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | " - "Our Loading Time | Slowdown |\n|--|--:|--:|--:|--:|--:|--:|" + "Our Loading Time | Slowdown |\n|--|--:|--:|--:|--:|--:|--:|\n" ) def format_row(results): @@ -116,7 +114,8 @@ def format_row(results): if __name__ == "__main__": models_to_benchmark = [ ("sklearn `RandomForestRegressor`", train_model_sklearn, dump_sklearn), - ("lightgbm `LGBMRegressor`", train_model_lgbm, dump_lgbm), + ("lightgbm `LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm), + ("lightgbm `LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), ] benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] print(format_benchmarks_results_table(benchmark_results)) diff --git a/examples/pickle_lgbm.py b/examples/pickle_lgbm.py index 68a7ffc..ac22f0a 100644 --- a/examples/pickle_lgbm.py +++ b/examples/pickle_lgbm.py @@ -17,8 +17,8 @@ def train_model() -> lgb.LGBMRegressor: regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42) - X_train, _, y_train, _ = generate_dataset(n_samples=10000) # noqa: N806 - regressor.fit(X_train, y_train) + X, y = generate_dataset(n_samples=10000) + regressor.fit(X, y) return regressor @@ -44,6 +44,6 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]): dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model") evaluate_prediction_difference( - model, model_compressed, generate_dataset(n_samples=10000)[1] + model, model_compressed, generate_dataset(n_samples=10000)[0] ) evaluate_compression_performance(model, dump_lgbm) diff --git a/examples/pickle_sklearn.py b/examples/pickle_sklearn.py index 8975b5a..a3418cb 100644 --- a/examples/pickle_sklearn.py +++ b/examples/pickle_sklearn.py @@ -15,8 +15,8 @@ def train_model() -> RandomForestRegressor: regressor = RandomForestRegressor(n_estimators=100, random_state=42) - X_train, _, y_train, _ = generate_dataset(n_samples=10000) - regressor.fit(X_train, y_train) + X, y = generate_dataset(n_samples=10000) + regressor.fit(X, y) return regressor @@ -29,6 +29,6 @@ def train_model() -> RandomForestRegressor: model_compressed = load_compressed(path, "no") evaluate_prediction_difference( - model, model_compressed, generate_dataset(n_samples=10000)[1] + model, model_compressed, generate_dataset(n_samples=10000)[0] ) evaluate_compression_performance(model, dump_sklearn) diff --git a/examples/utils.py b/examples/utils.py index b7ae58a..1aa4b60 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -14,16 +14,14 @@ def generate_dataset( n_samples: int = 50000, n_features: int = 100 -) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> Tuple[np.ndarray, np.ndarray]: """Generate a dataset with 50000 samples and 100 features. Returns: - X_train (np.array): (0.8 * n_samples) x n_features - X_test (np.array): (0.2 * n_samples) x n_features - y_train (np.array): 0.8 * n_samples - y_test (np.array): 0.2 * n_samples + X (np.array): n_samples x n_features + y (np.array): n_samples """ - X, y = make_regression( # noqa: N806 + X, y = make_regression( n_samples=n_samples, n_features=n_features, n_informative=50, @@ -36,7 +34,22 @@ def generate_dataset( for i in range(0, 100, 10): X[:, i] = X[:, i].astype("int") - X_train, X_test, y_train, y_test = train_test_split( # noqa: N806 + return X, y + + +def generate_dataset_train_test( + n_samples: int = 50000, n_features: int = 100 +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Generate a dataset with 50000 samples and 100 features. + + Returns: + X_train (np.array): (0.8 * n_samples) x n_features + X_test (np.array): (0.2 * n_samples) x n_features + y_train (np.array): 0.8 * n_samples + y_test (np.array): 0.2 * n_samples + """ + X, y = generate_dataset(n_samples, n_features) + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) return X_train, X_test, y_train, y_test From 687396f1dbf3e7749e409876eb0edde2a7107ae2 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 16:18:37 +0100 Subject: [PATCH 07/16] Add benchmark pr comment --- .github/workflows/benchmark.yml | 26 ++++++++++++++++++++++++++ .github/workflows/ci.yml | 5 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..ea858e2 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,26 @@ +on: pull_request + +jobs: + example_comment_pr: + runs-on: ubuntu-latest + name: Benchmark + steps: + - uses: actions/checkout@v3 + - name: Set up conda env + uses: mamba-org/provision-with-micromamba@v15 + with: + cache-env: true + extra-specs: | + python=3.11 + - name: Install repository + run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e . + - name: Run benchmark + shell: bash -el {0} + run: | + echo "_(benchmark **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_" >> benchmark.md + python benchmark.py >> benchmark.md + - name: Comment PR + uses: thollander/actions-comment-pull-request@v2 + with: + filePath: benchmark.md + comment_tag: benchmark diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9756059..4058b6c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,9 +24,8 @@ jobs: matrix: PYTHON_VERSION: ['3.8', '3.9', '3.10', '3.11'] steps: - - name: Checkout branch - uses: actions/checkout@v3 - - name: Set up Conda env + - uses: actions/checkout@v3 + - name: Set up conda env uses: mamba-org/provision-with-micromamba@v15 with: cache-env: true From 298a5211933045b36c83ee00d9ced6441b6cf8e9 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 16:25:04 +0100 Subject: [PATCH 08/16] add permissions --- .github/workflows/benchmark.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ea858e2..fad8483 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -1,9 +1,11 @@ on: pull_request +name: Benchmark +permissions: + pull-requests: write jobs: - example_comment_pr: + benchmark: runs-on: ubuntu-latest - name: Benchmark steps: - uses: actions/checkout@v3 - name: Set up conda env From 23b765ed202997b7d48fe9368e015639cda412b3 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 16:27:30 +0100 Subject: [PATCH 09/16] add permissions --- .github/workflows/benchmark.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index fad8483..1fa2236 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -2,6 +2,7 @@ on: pull_request name: Benchmark permissions: pull-requests: write + contents: read jobs: benchmark: From 4cc841f5580ccd59faee6556fa6dee528b56bd81 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 17:31:36 +0100 Subject: [PATCH 10/16] Fix dump performance, improve benchmark output --- benchmark.py | 30 ++++++++++++++++++++++-------- pickle_compression/lgbm_booster.py | 10 ++++++++-- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/benchmark.py b/benchmark.py index a3b35bd..cc5fb4a 100644 --- a/benchmark.py +++ b/benchmark.py @@ -53,9 +53,13 @@ def benchmark(func: Callable, *args, **kwargs) -> float: def benchmark_model(name, train_func, dump_func) -> dict: model = train_func() + + naive_dump_time = benchmark(pickle.dumps, model) naive_pickled = pickle.dumps(model) naive_pickled_size = len(naive_pickled) naive_load_time = benchmark(pickle.loads, naive_pickled) + + our_dump_time = benchmark(dump_func, model, io.BytesIO()) our_pickled_buf = io.BytesIO() dump_func(model, our_pickled_buf) our_pickled = our_pickled_buf.getvalue() @@ -65,11 +69,17 @@ def benchmark_model(name, train_func, dump_func) -> dict: "name": name, "baseline": { "size": naive_pickled_size, + "dump_time": naive_dump_time, "load_time": naive_load_time, }, - "ours": {"size": our_pickled_size, "load_time": our_load_time}, + "ours": { + "size": our_pickled_size, + "dump_time": our_dump_time, + "load_time": our_load_time, + }, "change": { "size": naive_pickled_size / our_pickled_size, + "dump_time": our_dump_time / naive_dump_time, "load_time": our_load_time / naive_load_time, }, } @@ -81,17 +91,18 @@ def format_size(n_bytes: int) -> str: def format_time(seconds: float) -> str: - return f"{seconds:.1f} s" + return f"{seconds:.2f} s" def format_change(multiple: float) -> str: - return f"{multiple:.1f} x" + return f"{multiple:.2f} x" def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: header = ( - "| Model | Baseline Size | Our Size | Size Reduction | Baseline Loading Time | " - "Our Loading Time | Slowdown |\n|--|--:|--:|--:|--:|--:|--:|\n" + "| Model | Baseline Size | Our Size | Size Reduction | Baseline Dump Time | Our Dump Time " + "| Dump Slowdown | Baseline Load Time | Our Load Time | Loading Slowdown |\n" + "|--|--:|--:|--:|--:|--:|--:|--:|--:|--:|\n" ) def format_row(results): @@ -100,6 +111,9 @@ def format_row(results): format_size(results["baseline"]["size"]), format_size(results["ours"]["size"]), format_change(results["change"]["size"]), + format_time(results["baseline"]["dump_time"]), + format_time(results["ours"]["dump_time"]), + format_change(results["change"]["dump_time"]), format_time(results["baseline"]["load_time"]), format_time(results["ours"]["load_time"]), format_change(results["change"]["load_time"]), @@ -113,9 +127,9 @@ def format_row(results): if __name__ == "__main__": models_to_benchmark = [ - ("sklearn `RandomForestRegressor`", train_model_sklearn, dump_sklearn), - ("lightgbm `LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm), - ("lightgbm `LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), + ("`RandomForestRegressor`", train_model_sklearn, dump_sklearn), + ("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm), + ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), ] benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] print(format_benchmarks_results_table(benchmark_results)) diff --git a/pickle_compression/lgbm_booster.py b/pickle_compression/lgbm_booster.py index 5e3f089..a9451ce 100644 --- a/pickle_compression/lgbm_booster.py +++ b/pickle_compression/lgbm_booster.py @@ -69,12 +69,18 @@ def _extract_feature(feature_line): feat_name, values_str = feature_line.split("=") return feat_name, values_str.split(" ") - front_str = re.findall(FRONT_STRING_REGEX, model_string)[0] + front_str_match = re.search(FRONT_STRING_REGEX, model_string) + if front_str_match is None: + raise ValueError("Could not find front string.") + front_str = front_str_match.group() # delete tree_sizes line since this messes up the tree parsing by LightGBM if not set correctly # todo calculate correct tree_sizes front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str) - back_str = re.findall(BACK_STRING_REGEX, model_string)[0] + back_str_match = re.search(BACK_STRING_REGEX, model_string) + if back_str_match is None: + raise ValueError("Could not find back string.") + back_str = back_str_match.group() tree_matches = re.findall(TREE_GROUP_REGEX, model_string) trees: List[dict] = [] for i, tree_match in enumerate(tree_matches): From 528e4d75de86d383ea296af5b82b8d33fb23f5bf Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 17:42:06 +0100 Subject: [PATCH 11/16] change formatting --- benchmark.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/benchmark.py b/benchmark.py index cc5fb4a..c2dd4cb 100644 --- a/benchmark.py +++ b/benchmark.py @@ -99,26 +99,33 @@ def format_change(multiple: float) -> str: def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: - header = ( - "| Model | Baseline Size | Our Size | Size Reduction | Baseline Dump Time | Our Dump Time " - "| Dump Slowdown | Baseline Load Time | Our Load Time | Loading Slowdown |\n" - "|--|--:|--:|--:|--:|--:|--:|--:|--:|--:|\n" - ) + header = """| Model | Size | Dump Time | Load Time | + |--|--:|--:|--:| + """ def format_row(results): + def format_cell(base, ours, change): + return f"{base} / {ours} / {change}" + column_data = [ results["name"], - format_size(results["baseline"]["size"]), - format_size(results["ours"]["size"]), - format_change(results["change"]["size"]), - format_time(results["baseline"]["dump_time"]), - format_time(results["ours"]["dump_time"]), - format_change(results["change"]["dump_time"]), - format_time(results["baseline"]["load_time"]), - format_time(results["ours"]["load_time"]), - format_change(results["change"]["load_time"]), + format_cell( + format_size(results["baseline"]["size"]), + format_size(results["ours"]["size"]), + format_change(results["change"]["size"]), + ), + format_cell( + format_size(results["baseline"]["dump_time"]), + format_size(results["ours"]["dump_time"]), + format_change(results["change"]["dump_time"]), + ), + format_cell( + format_size(results["baseline"]["load_time"]), + format_size(results["ours"]["load_time"]), + format_change(results["change"]["load_time"]), + ), ] - return " | ".join(map(str, column_data)) + return " | ".join(column_data) formatted_rows = map(format_row, benchmark_results) From cd246a20ae0e9efcad02f9bb0014190b0a4373a8 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 17:47:29 +0100 Subject: [PATCH 12/16] Fix formatting --- benchmark.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmark.py b/benchmark.py index c2dd4cb..aa09562 100644 --- a/benchmark.py +++ b/benchmark.py @@ -5,7 +5,6 @@ from typing import Callable, List import lightgbm as lgb -import numpy as np from sklearn.ensemble import RandomForestRegressor from examples.utils import generate_dataset @@ -13,18 +12,14 @@ from pickle_compression.sklearn_tree import dump_sklearn -def rng(): - return np.random.RandomState(42) - - def train_model_sklearn() -> RandomForestRegressor: - regressor = RandomForestRegressor(n_estimators=100, random_state=rng()) + regressor = RandomForestRegressor(n_estimators=100, random_state=42) regressor.fit(*generate_dataset(n_samples=10000)) return regressor def train_gbdt_lgbm() -> lgb.LGBMRegressor: - regressor = lgb.LGBMRegressor(n_estimators=100, random_state=rng()) + regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42) regressor.fit(*generate_dataset(n_samples=10000)) return regressor @@ -34,7 +29,7 @@ def train_rf_lgbm() -> lgb.LGBMRegressor: boosting_type="rf", n_estimators=100, num_leaves=1000, - random_state=rng(), + random_state=42, bagging_freq=5, bagging_fraction=0.5, ) @@ -99,8 +94,9 @@ def format_change(multiple: float) -> str: def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: - header = """| Model | Size | Dump Time | Load Time | - |--|--:|--:|--:| + header = """ + | Model | Size | Dump Time | Load Time | + |--|--:|--:|--:| """ def format_row(results): From 2954a7981675f1b4ff92d3d35a7702d69b146344 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 17:52:02 +0100 Subject: [PATCH 13/16] =?UTF-8?q?=F0=9F=A4=A6=F0=9F=8F=BB=E2=80=8D?= =?UTF-8?q?=E2=99=82=EF=B8=8F=F0=9F=A4=A6=F0=9F=8F=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark.py b/benchmark.py index aa09562..dff4070 100644 --- a/benchmark.py +++ b/benchmark.py @@ -111,13 +111,13 @@ def format_cell(base, ours, change): format_change(results["change"]["size"]), ), format_cell( - format_size(results["baseline"]["dump_time"]), - format_size(results["ours"]["dump_time"]), + format_time(results["baseline"]["dump_time"]), + format_time(results["ours"]["dump_time"]), format_change(results["change"]["dump_time"]), ), format_cell( - format_size(results["baseline"]["load_time"]), - format_size(results["ours"]["load_time"]), + format_time(results["baseline"]["load_time"]), + format_time(results["ours"]["load_time"]), format_change(results["change"]["load_time"]), ), ] From 607ceff5b8c15f666f95ba59503dd9fcf20443b0 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 17:55:33 +0100 Subject: [PATCH 14/16] Add explanation string --- benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark.py b/benchmark.py index dff4070..a943dd1 100644 --- a/benchmark.py +++ b/benchmark.py @@ -135,4 +135,5 @@ def format_cell(base, ours, change): ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), ] benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] + print("Base results (pickle) / Our results / Change (ours / base)") print(format_benchmarks_results_table(benchmark_results)) From 27365008a29e3ac5ea38df3d013e7b0e5c2db2f1 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 18:00:56 +0100 Subject: [PATCH 15/16] ... --- benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark.py b/benchmark.py index a943dd1..dfc2e42 100644 --- a/benchmark.py +++ b/benchmark.py @@ -135,5 +135,5 @@ def format_cell(base, ours, change): ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), ] benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] - print("Base results (pickle) / Our results / Change (ours / base)") + print("Base results / Our results / Change") print(format_benchmarks_results_table(benchmark_results)) From f96d76b5320ff993a214e56c03a66abfd49050e5 Mon Sep 17 00:00:00 2001 From: Pavel Zwerschke Date: Wed, 22 Feb 2023 18:03:22 +0100 Subject: [PATCH 16/16] . --- examples/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/utils.py b/examples/utils.py index 1aa4b60..60f56f6 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -49,10 +49,7 @@ def generate_dataset_train_test( y_test (np.array): 0.2 * n_samples """ X, y = generate_dataset(n_samples, n_features) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 - ) - return X_train, X_test, y_train, y_test + return train_test_split(X, y, test_size=0.2, random_state=42) def evaluate_compression_performance(