Add benchmark script, fix regex, remove N806 #9

Merged: 17 commits, Feb 22, 2023
29 changes: 29 additions & 0 deletions .github/workflows/benchmark.yml
@@ -0,0 +1,29 @@
on: pull_request
name: Benchmark
permissions:
  pull-requests: write
  contents: read

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up conda env
        uses: mamba-org/provision-with-micromamba@v15
        with:
          cache-env: true
          extra-specs: |
            python=3.11
      - name: Install repository
        run: python -m pip install --no-build-isolation --no-deps --disable-pip-version-check -e .
      - name: Run benchmark
        shell: bash -el {0}
        run: |
          echo "_(benchmark **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_" >> benchmark.md
          python benchmark.py >> benchmark.md
      - name: Comment PR
        uses: thollander/actions-comment-pull-request@v2
        with:
          filePath: benchmark.md
          comment_tag: benchmark
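The `comment_tag` input lets the action find and update its earlier comment on later runs instead of posting a new one each time (per the action's README). To preview locally what the workflow appends to `benchmark.md`, here is a rough Python equivalent of the two run steps; the run id values are placeholders for the `${{ github.* }}` expressions above:

```python
# Rough local equivalent of the "Run benchmark" step; run_id/run_attempt are
# placeholders for the ${{ github.run_id }} / ${{ github.run_attempt }} expressions.
import subprocess

run_id, run_attempt = "1234567890", "1"
with open("benchmark.md", "w") as f:
    f.write(f"_(benchmark **{run_id}** / attempt **{run_attempt}**)_\n")
    result = subprocess.run(
        ["python", "benchmark.py"], capture_output=True, text=True, check=True
    )
    f.write(result.stdout)
```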
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
@@ -24,9 +24,8 @@ jobs:
     matrix:
       PYTHON_VERSION: ['3.8', '3.9', '3.10', '3.11']
     steps:
-      - name: Checkout branch
-        uses: actions/checkout@v3
-      - name: Set up Conda env
+      - uses: actions/checkout@v3
+      - name: Set up conda env
         uses: mamba-org/provision-with-micromamba@v15
         with:
           cache-env: true
139 changes: 139 additions & 0 deletions benchmark.py
@@ -0,0 +1,139 @@
import io
import pickle
import textwrap
import time
from typing import Callable, List

import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

from examples.utils import generate_dataset
from pickle_compression.lgbm_booster import dump_lgbm
from pickle_compression.sklearn_tree import dump_sklearn


def train_model_sklearn() -> RandomForestRegressor:
    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def train_gbdt_lgbm() -> lgb.LGBMRegressor:
    regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def train_rf_lgbm() -> lgb.LGBMRegressor:
    regressor = lgb.LGBMRegressor(
        boosting_type="rf",
        n_estimators=100,
        num_leaves=1000,
        random_state=42,
        bagging_freq=5,
        bagging_fraction=0.5,
    )
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


def benchmark(func: Callable, *args, **kwargs) -> float:
    times = []
    for _ in range(10):
        start = time.perf_counter()
        func(*args, **kwargs)
        times.append(time.perf_counter() - start)
    return min(times)


def benchmark_model(name, train_func, dump_func) -> dict:
    model = train_func()

    naive_dump_time = benchmark(pickle.dumps, model)
    naive_pickled = pickle.dumps(model)
    naive_pickled_size = len(naive_pickled)
    naive_load_time = benchmark(pickle.loads, naive_pickled)

    our_dump_time = benchmark(dump_func, model, io.BytesIO())
    our_pickled_buf = io.BytesIO()
    dump_func(model, our_pickled_buf)
    our_pickled = our_pickled_buf.getvalue()
    our_pickled_size = len(our_pickled)
    our_load_time = benchmark(pickle.loads, our_pickled)
    return {
        "name": name,
        "baseline": {
            "size": naive_pickled_size,
            "dump_time": naive_dump_time,
            "load_time": naive_load_time,
        },
        "ours": {
            "size": our_pickled_size,
            "dump_time": our_dump_time,
            "load_time": our_load_time,
        },
        "change": {
            "size": naive_pickled_size / our_pickled_size,
            "dump_time": our_dump_time / naive_dump_time,
            "load_time": our_load_time / naive_load_time,
        },
    }


def format_size(n_bytes: int) -> str:
    MiB = 1024**2
    return f"{n_bytes / MiB:.1f} MiB"


def format_time(seconds: float) -> str:
    return f"{seconds:.2f} s"


def format_change(multiple: float) -> str:
    return f"{multiple:.2f} x"


def format_benchmarks_results_table(benchmark_results: List[dict]) -> str:
    header = """
    | Model | Size | Dump Time | Load Time |
    |--|--:|--:|--:|
    """

    def format_row(results):
        def format_cell(base, ours, change):
            return f"{base} / {ours} / {change}"

        column_data = [
            results["name"],
            format_cell(
                format_size(results["baseline"]["size"]),
                format_size(results["ours"]["size"]),
                format_change(results["change"]["size"]),
            ),
            format_cell(
                format_time(results["baseline"]["dump_time"]),
                format_time(results["ours"]["dump_time"]),
                format_change(results["change"]["dump_time"]),
            ),
            format_cell(
                format_time(results["baseline"]["load_time"]),
                format_time(results["ours"]["load_time"]),
                format_change(results["change"]["load_time"]),
            ),
        ]
        return " | ".join(column_data)

    formatted_rows = map(format_row, benchmark_results)

    return (textwrap.dedent(header) + "\n".join(formatted_rows)).strip()


if __name__ == "__main__":
    models_to_benchmark = [
        ("`RandomForestRegressor`", train_model_sklearn, dump_sklearn),
        ("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm),
        ("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm),
    ]
    benchmark_results = [benchmark_model(*args) for args in models_to_benchmark]
    print("Base results / Our results / Change")
    print(format_benchmarks_results_table(benchmark_results))
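Since the script exposes small composable helpers, additional models can be benchmarked without touching the formatting code. A minimal sketch (hypothetical, appended to benchmark.py; it assumes `dump_sklearn` also handles `ExtraTreesRegressor`, which this PR does not itself exercise):

```python
# Hypothetical extension: benchmark one more sklearn ensemble with the same helpers.
from sklearn.ensemble import ExtraTreesRegressor


def train_extra_trees() -> ExtraTreesRegressor:
    # Assumption: dump_sklearn can compress ExtraTreesRegressor as well,
    # since it is built from the same sklearn tree objects.
    regressor = ExtraTreesRegressor(n_estimators=100, random_state=42)
    regressor.fit(*generate_dataset(n_samples=10000))
    return regressor


results = benchmark_model("`ExtraTreesRegressor`", train_extra_trees, dump_sklearn)
print(format_benchmarks_results_table([results]))
```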
6 changes: 3 additions & 3 deletions examples/pickle_lgbm.py
@@ -17,8 +17,8 @@

 def train_model() -> lgb.LGBMRegressor:
     regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
-    X_train, _, y_train, _ = generate_dataset(n_samples=10000)  # noqa: N806
-    regressor.fit(X_train, y_train)
+    X, y = generate_dataset(n_samples=10000)
+    regressor.fit(X, y)
     return regressor


@@ -44,6 +44,6 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]):
 dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model")

 evaluate_prediction_difference(
-    model, model_compressed, generate_dataset(n_samples=10000)[1]
+    model, model_compressed, generate_dataset(n_samples=10000)[0]
 )
 evaluate_compression_performance(model, dump_lgbm)
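The index flips from `[1]` to `[0]` because of the utils.py change later in this diff: `generate_dataset` no longer returns a train/test split, so the positional meaning of its elements changed. In short:

```python
# Before this PR: X_train, X_test, y_train, y_test = generate_dataset(...)
#                 -> [1] selected X_test for the prediction comparison.
# After this PR:  X, y = generate_dataset(...)
#                 -> [0] selects the full feature matrix X instead.
X = generate_dataset(n_samples=10000)[0]
```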
6 changes: 3 additions & 3 deletions examples/pickle_sklearn.py
@@ -15,8 +15,8 @@

 def train_model() -> RandomForestRegressor:
     regressor = RandomForestRegressor(n_estimators=100, random_state=42)
-    X_train, _, y_train, _ = generate_dataset(n_samples=10000)  # noqa: N806
-    regressor.fit(X_train, y_train)
+    X, y = generate_dataset(n_samples=10000)
+    regressor.fit(X, y)
     return regressor


@@ -29,6 +29,6 @@ def train_model() -> RandomForestRegressor:
 model_compressed = load_compressed(path, "no")

 evaluate_prediction_difference(
-    model, model_compressed, generate_dataset(n_samples=10000)[1]
+    model, model_compressed, generate_dataset(n_samples=10000)[0]
 )
 evaluate_compression_performance(model, dump_sklearn)
30 changes: 20 additions & 10 deletions examples/utils.py
@@ -14,16 +14,14 @@

 def generate_dataset(
     n_samples: int = 50000, n_features: int = 100
-) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+) -> Tuple[np.ndarray, np.ndarray]:
     """Generate a dataset with 50000 samples and 100 features.

     Returns:
-        X_train (np.array): (0.8 * n_samples) x n_features
-        X_test (np.array): (0.2 * n_samples) x n_features
-        y_train (np.array): 0.8 * n_samples
-        y_test (np.array): 0.2 * n_samples
+        X (np.array): n_samples x n_features
+        y (np.array): n_samples
     """
-    X, y = make_regression(  # noqa: N806
+    X, y = make_regression(
         n_samples=n_samples,
         n_features=n_features,
         n_informative=50,
@@ -36,10 +34,22 @@ def generate_dataset(
     for i in range(0, 100, 10):
         X[:, i] = X[:, i].astype("int")

-    X_train, X_test, y_train, y_test = train_test_split(  # noqa: N806
-        X, y, test_size=0.2, random_state=42
-    )
-    return X_train, X_test, y_train, y_test
+    return X, y
+
+
+def generate_dataset_train_test(
+    n_samples: int = 50000, n_features: int = 100
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Generate a dataset and split it 80/20 into train and test sets.
+
+    Returns:
+        X_train (np.array): (0.8 * n_samples) x n_features
+        X_test (np.array): (0.2 * n_samples) x n_features
+        y_train (np.array): 0.8 * n_samples
+        y_test (np.array): 0.2 * n_samples
+    """
+    X, y = generate_dataset(n_samples, n_features)
+    return train_test_split(X, y, test_size=0.2, random_state=42)


 def evaluate_compression_performance(
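Callers can now pick whichever shape they need; a quick sketch of the two helpers after this change:

```python
from examples.utils import generate_dataset, generate_dataset_train_test

# Full dataset, as the examples and the benchmark script now consume it.
X, y = generate_dataset(n_samples=10000)

# 80/20 split, for callers that still want held-out test data.
X_train, X_test, y_train, y_test = generate_dataset_train_test(n_samples=10000)
```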
16 changes: 11 additions & 5 deletions pickle_compression/lgbm_booster.py
@@ -61,20 +61,26 @@ def _decompress_booster_state(compressed_state: dict):
 def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]:
     if not model_string.startswith("tree\nversion=v3"):
         raise ValueError("Only v3 is supported for the booster string format.")
-    FRONT_STRING_REGEX = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)"  # noqa: N806
-    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"  # noqa: N806
-    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"  # noqa: N806
+    FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
+    BACK_STRING_REGEX = r"end of trees(?:\n)+(?:.|\n)*"
+    TREE_GROUP_REGEX = r"(Tree=\d+\n+)((?:.+\n)*)\n\n"

     def _extract_feature(feature_line):
         feat_name, values_str = feature_line.split("=")
         return feat_name, values_str.split(" ")

-    front_str = re.findall(FRONT_STRING_REGEX, model_string)[0]
+    front_str_match = re.search(FRONT_STRING_REGEX, model_string)
+    if front_str_match is None:
+        raise ValueError("Could not find front string.")
+    front_str = front_str_match.group()
     # delete tree_sizes line since this messes up the tree parsing by LightGBM if not set correctly
     # todo calculate correct tree_sizes
     front_str = re.sub(r"tree_sizes=(?:\d+ )*\d+\n", "", front_str)

-    back_str = re.findall(BACK_STRING_REGEX, model_string)[0]
+    back_str_match = re.search(BACK_STRING_REGEX, model_string)
+    if back_str_match is None:
+        raise ValueError("Could not find back string.")
+    back_str = back_str_match.group()
     tree_matches = re.findall(TREE_GROUP_REGEX, model_string)
     trees: List[dict] = []
     for i, tree_match in enumerate(tree_matches):
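A quick illustration of both fixes on a stub model string (the stub is invented for this example; real LightGBM v3 headers carry many more key=value lines). The old front-string pattern required every header line after `tree` to contain `=`, so a bare keyword line made the pattern match nothing and `re.findall(...)[0]` die with an opaque `IndexError`; the relaxed pattern matches, and `re.search` with a `None` check fails loudly when it genuinely cannot match:

```python
import re

OLD_FRONT = r"(?:tree\n)(?:\w+=.*\n)*\n(?=Tree)"
NEW_FRONT = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"

# Invented stub: one header line ("feature_infos") has no "=value" part.
model_string = "tree\nversion=v3\nnum_class=1\nfeature_infos\n\nTree=0\nsplit_feature=0\n\n\n"

print(re.findall(OLD_FRONT, model_string))  # [] -> old code's [0] raised IndexError
match = re.search(NEW_FRONT, model_string)
if match is None:  # explicit, descriptive failure instead of an IndexError
    raise ValueError("Could not find front string.")
print(match.group())  # header block up to (but not including) "Tree=0"
```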
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -46,6 +46,10 @@ select = [
     # pyupgrade
     "UP"
 ]
+ignore = [
+    # variable in function should be lowercase
+    "N806",
+]

 [tool.mypy]
 python_version = "3.8"
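For context on the new ignore: ruff's pep8-naming rule N806 flags any non-lowercase variable inside a function, which clashes with the ML convention of an uppercase `X` for a 2-D feature matrix. Ignoring it globally replaces the per-line `# noqa: N806` suppressions removed throughout this PR. An illustrative snippet (names hypothetical):

```python
import numpy as np


def fit(dataset):
    # Conventional ML naming: uppercase X for the feature matrix.
    # N806 ("variable in function should be lowercase") used to require a
    # trailing "# noqa: N806" here; with the global ignore it now passes.
    X, y = dataset
    return X.shape, y.shape


fit((np.zeros((10, 3)), np.zeros(10)))
```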
8 changes: 4 additions & 4 deletions tests/test_lgbm_compression.py
@@ -15,7 +15,7 @@ def lgbm_regressor(rng):


 def test_compressed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -27,7 +27,7 @@ def test_compressed_predictions(diabetes_toy_df, lgbm_regressor, tmp_path):


 def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     model_path_compressed = tmp_path / "model_compressed.pickle.lzma"
Expand All @@ -41,7 +41,7 @@ def test_compressed_size(diabetes_toy_df, lgbm_regressor, tmp_path):

 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)
     factor = 7 if compression_method == "no" else 4

@@ -53,7 +53,7 @@ def test_dump_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):

 @pytest.mark.parametrize("compression_method", ["no", "lzma", "gzip", "bz2"])
 def test_load_times(diabetes_toy_df, lgbm_regressor, tmp_path, compression_method):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     lgbm_regressor.fit(X, y)

     load_time_compressed, load_time_uncompressed = get_load_times(
10 changes: 5 additions & 5 deletions tests/test_sklearn_compression.py
@@ -26,7 +26,7 @@ def decision_tree_regressor(rng):


 def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     model_path = tmp_path / "model_compressed.pickle.lzma"
@@ -40,7 +40,7 @@ def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
 def test_compressed_internal_structure(
     diabetes_toy_df, decision_tree_regressor, tmp_path
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     decision_tree_regressor.fit(X, y)

     model_path = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -70,7 +70,7 @@ def test_compressed_internal_structure(


 def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     model_path_dtype_reduction = tmp_path / "model_dtype_reduction.pickle.lzma"
@@ -86,7 +86,7 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):
 def test_dump_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)
     factor = 4 if compression_method == "no" else 1.5

@@ -100,7 +100,7 @@ def test_dump_times(
 def test_load_times(
     diabetes_toy_df, random_forest_regressor, tmp_path, compression_method
 ):
-    X, y = diabetes_toy_df  # noqa: N806
+    X, y = diabetes_toy_df
     random_forest_regressor.fit(X, y)

     load_time_compressed, load_time_uncompressed = get_load_times(