diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..1fa2236
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,29 @@
+on: pull_request
+name: Benchmark
+permissions:
+  pull-requests: write
+  contents: read
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up conda env
+        uses: mamba-org/provision-with-micromamba@v15
+        with:
+          cache-env: true
+          extra-specs: |
+            python=3.11
+      - name: Install repository
+        run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e .
+      - name: Run benchmark
+        shell: bash -el {0}
+        run: |
+          echo "_(benchmark **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_" >> benchmark.md
+          python benchmark.py >> benchmark.md
+      - name: Comment PR
+        uses: thollander/actions-comment-pull-request@v2
+        with:
+          filePath: benchmark.md
+          comment_tag: benchmark
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9756059..4058b6c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,9 +24,8 @@ jobs:
       matrix:
         PYTHON_VERSION: ['3.8', '3.9', '3.10', '3.11']
     steps:
-      - name: Checkout branch
-        uses: actions/checkout@v3
-      - name: Set up Conda env
+      - uses: actions/checkout@v3
+      - name: Set up conda env
         uses: mamba-org/provision-with-micromamba@v15
         with:
          cache-env: true
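The two benchmark steps above only append a header line and the script output to benchmark.md, which the comment action then posts on the PR. A rough local equivalent of the "Run benchmark" step, as a sketch: the literal "local" and "1" are placeholders for the workflow's github.run_id and github.run_attempt interpolations, and it assumes you run from the repo root in an environment like the one the workflow provisions.

    # Local stand-in for the "Run benchmark" workflow step (sketch only).
    import subprocess

    with open("benchmark.md", "a") as f:
        # placeholder header; CI fills in the real run id and attempt
        f.write("_(benchmark **local** / attempt **1**)_\n")
        result = subprocess.run(
            ["python", "benchmark.py"], capture_output=True, text=True, check=True
        )
        f.write(result.stdout)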
"size": naive_pickled_size / our_pickled_size, + "dump_time": our_dump_time / naive_dump_time, + "load_time": our_load_time / naive_load_time, + }, + } + + +def format_size(n_bytes: int) -> str: + MiB = 1024**2 + return f"{n_bytes / MiB:.1f} MiB" + + +def format_time(seconds: float) -> str: + return f"{seconds:.2f} s" + + +def format_change(multiple: float) -> str: + return f"{multiple:.2f} x" + + +def format_benchmarks_results_table(benchmark_results: List[dict]) -> str: + header = """ + | Model | Size | Dump Time | Load Time | + |--|--:|--:|--:| + """ + + def format_row(results): + def format_cell(base, ours, change): + return f"{base} / {ours} / {change}" + + column_data = [ + results["name"], + format_cell( + format_size(results["baseline"]["size"]), + format_size(results["ours"]["size"]), + format_change(results["change"]["size"]), + ), + format_cell( + format_time(results["baseline"]["dump_time"]), + format_time(results["ours"]["dump_time"]), + format_change(results["change"]["dump_time"]), + ), + format_cell( + format_time(results["baseline"]["load_time"]), + format_time(results["ours"]["load_time"]), + format_change(results["change"]["load_time"]), + ), + ] + return " | ".join(column_data) + + formatted_rows = map(format_row, benchmark_results) + + return (textwrap.dedent(header) + "\n".join(formatted_rows)).strip() + + +if __name__ == "__main__": + models_to_benchmark = [ + ("`RandomForestRegressor`", train_model_sklearn, dump_sklearn), + ("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm), + ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm), + ] + benchmark_results = [benchmark_model(*args) for args in models_to_benchmark] + print("Base results / Our results / Change") + print(format_benchmarks_results_table(benchmark_results)) diff --git a/examples/pickle_lgbm.py b/examples/pickle_lgbm.py index 68a7ffc..ac22f0a 100644 --- a/examples/pickle_lgbm.py +++ b/examples/pickle_lgbm.py @@ -17,8 +17,8 @@ def train_model() -> lgb.LGBMRegressor: regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42) - X_train, _, y_train, _ = generate_dataset(n_samples=10000) # noqa: N806 - regressor.fit(X_train, y_train) + X, y = generate_dataset(n_samples=10000) + regressor.fit(X, y) return regressor @@ -44,6 +44,6 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]): dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model") evaluate_prediction_difference( - model, model_compressed, generate_dataset(n_samples=10000)[1] + model, model_compressed, generate_dataset(n_samples=10000)[0] ) evaluate_compression_performance(model, dump_lgbm) diff --git a/examples/pickle_sklearn.py b/examples/pickle_sklearn.py index f7862e4..a3418cb 100644 --- a/examples/pickle_sklearn.py +++ b/examples/pickle_sklearn.py @@ -15,8 +15,8 @@ def train_model() -> RandomForestRegressor: regressor = RandomForestRegressor(n_estimators=100, random_state=42) - X_train, _, y_train, _ = generate_dataset(n_samples=10000) # noqa: N806 - regressor.fit(X_train, y_train) + X, y = generate_dataset(n_samples=10000) + regressor.fit(X, y) return regressor @@ -29,6 +29,6 @@ def train_model() -> RandomForestRegressor: model_compressed = load_compressed(path, "no") evaluate_prediction_difference( - model, model_compressed, generate_dataset(n_samples=10000)[1] + model, model_compressed, generate_dataset(n_samples=10000)[0] ) evaluate_compression_performance(model, dump_sklearn) diff --git a/examples/utils.py b/examples/utils.py index b7ae58a..60f56f6 100644 --- a/examples/utils.py +++ 
b/examples/utils.py
@@ -14,16 +14,14 @@
 
 def generate_dataset(
     n_samples: int = 50000, n_features: int = 100
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray]:
     """Generate a dataset with 50000 samples and 100 features.
 
     Returns:
-        X_train (np.array): (0.8 * n_samples) x n_features
-        X_test (np.array): (0.2 * n_samples) x n_features
-        y_train (np.array): 0.8 * n_samples
-        y_test (np.array): 0.2 * n_samples
+        X (np.array): n_samples x n_features
+        y (np.array): n_samples
     """
-    X, y = make_regression(  # noqa: N806
+    X, y = make_regression(
         n_samples=n_samples,
         n_features=n_features,
         n_informative=50,
@@ -36,10 +34,22 @@
     for i in range(0, 100, 10):
         X[:, i] = X[:, i].astype("int")
 
-    X_train, X_test, y_train, y_test = train_test_split(  # noqa: N806
-        X, y, test_size=0.2, random_state=42
-    )
-    return X_train, X_test, y_train, y_test
+    return X, y
+
+
+def generate_dataset_train_test(
+    n_samples: int = 50000, n_features: int = 100
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Generate a dataset with 50000 samples and 100 features.
+
+    Returns:
+        X_train (np.array): (0.8 * n_samples) x n_features
+        X_test (np.array): (0.2 * n_samples) x n_features
+        y_train (np.array): 0.8 * n_samples
+        y_test (np.array): 0.2 * n_samples
+    """
+    X, y = generate_dataset(n_samples, n_features)
+    return train_test_split(X, y, test_size=0.2, random_state=42)
 
 
 def evaluate_compression_performance(
diff --git a/pickle_compression/lgbm_booster.py b/pickle_compression/lgbm_booster.py
index 9fd9797..a9451ce 100644
--- a/pickle_compression/lgbm_booster.py
+++ b/pickle_compression/lgbm_booster.py
@@ -61,20 +61,26 @@ def _decompress_booster_state(compressed_state: dict):
 def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]:
     if not model_string.startswith("tree\nversion=v3"):
         raise ValueError("Only v3 is supported for the booster string format.")
 
-    FRONT_STRING_REGEX = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)"  # noqa: N806
-    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"  # noqa: N806
-    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"  # noqa: N806
+    FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
+    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"
+    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"
 
     def _extract_feature(feature_line):
         feat_name, values_str = feature_line.split("=")
         return feat_name, values_str.split(" ")
 
-    front_str = re.findall(FRONT_STRING_REGEX, model_string)[0]
+    front_str_match = re.search(FRONT_STRING_REGEX, model_string)
+    if front_str_match is None:
+        raise ValueError("Could not find front string.")
+    front_str = front_str_match.group()
     # delete tree_sizes line since this messes up the tree parsing by LightGBM if not set correctly
     # todo calculate correct tree_sizes
     front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str)
-    back_str = re.findall(BACK_STRING_REGEX, model_string)[0]
+    back_str_match = re.search(BACK_STRING_REGEX, model_string)
+    if back_str_match is None:
+        raise ValueError("Could not find back string.")
+    back_str = back_str_match.group()
     tree_matches = re.findall(TREE_GROUP_REGEX, model_string)
     trees: List[dict] = []
     for i, tree_match in enumerate(tree_matches):
diff --git a/pyproject.toml b/pyproject.toml
index 2ff064f..78158f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,10 @@ select = [
     # pyupgrade
     "UP"
 ]
+ignore = [
+    # variable in function should be lowercase
+    "N806",
+]
 
 [tool.mypy]
 python_version = "3.8"
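The lgbm_booster.py hunk above swaps re.findall(...)[0] for re.search plus an explicit None check, so a malformed model string raises a descriptive ValueError instead of an IndexError, and it relaxes FRONT_STRING_REGEX so header lines without an "=value" part also match. A toy illustration; the stub model string is an assumption, as real LightGBM v3 model strings carry many more header keys:

    import re

    FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
    model_string = "tree\nversion=v3\nnum_class=1\nobjective=regression\n\nTree=0\n"
    front_str_match = re.search(FRONT_STRING_REGEX, model_string)
    if front_str_match is None:
        raise ValueError("Could not find front string.")
    # prints the header block up to the blank line before the first tree,
    # including the bare "tree" line the old regex had to match literally
    print(front_str_match.group())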
diff --git a/tests/test_lgbm_compression.py b/tests/test_lgbm_compression.py
index f69f2af..4db89e1 100644
--- a/tests/test_lgbm_compression.py
+++ b/tests/test_lgbm_compression.py
@@ -15,7 +15,7 @@ def lgbm_regressor(rng):
 
 
 def test_compresed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
 
     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -27,7 +27,7 @@
 
 
 def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
 
     model_path_compressed = tmp_path / "model_compressed.pickle.lzma"
@@ -41,7 +41,7 @@
 
 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
 
     factor = 7 if compression_method == "no" else 4
@@ -53,7 +53,7 @@
 
 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_load_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
 
     load_time_compressed, load_time_uncompressed = get_load_times(
diff --git a/tests/test_sklearn_compression.py b/tests/test_sklearn_compression.py
index 6163943..2157779 100644
--- a/tests/test_sklearn_compression.py
+++ b/tests/test_sklearn_compression.py
@@ -26,7 +26,7 @@ def decision_tree_regressor(rng):
 
 
 def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
 
     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -40,7 +40,7 @@ def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_pa
 def test_compressed_internal_structure(
     diabetes_toy_df, decision_tree_regressor, tmp_path
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     decision_tree_regressor.fit(X, y)
 
     model_path = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -70,7 +70,7 @@ def test_compressed_internal_structure(
 
 
 def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
 
     model_path_dtype_reduction = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -86,7 +86,7 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
 def test_dump_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
 
     factor = 4 if compression_method == "no" else 1.5
@@ -100,7 +100,7 @@ def test_dump_times(
 def test_load_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
 
     load_time_compressed, load_time_uncompressed = get_load_times(
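Both test modules exercise the same core invariant: a compressed dump must load back into a model with unchanged predictions, within size and dump/load-time budgets. A minimal sketch of that prediction check, assuming sklearn-style .predict models; the helper name and tolerance are illustrative, not the repo's actual test utilities:

    import numpy as np

    def assert_predictions_unchanged(model, model_compressed, X):
        # compression may change dtypes and layout, but never the predictions
        np.testing.assert_allclose(
            model.predict(X), model_compressed.predict(X), rtol=1e-7
        )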