
Commit 98ec163
Add lgbm compression, refactoring (#5)
Co-authored-by: Yasin Tatar <[email protected]>
pavelzw and YYYasin19 committed Mar 4, 2023
1 parent 13228b8 commit 98ec163
Showing 12 changed files with 424 additions and 163 deletions.
examples/.gitignore (2 changes: 1 addition & 1 deletion)

```diff
@@ -1,2 +1,2 @@
 great_lakes_1*
-lgb1.model
+*.model
```
examples/pickle_lgbm.py (32 changes: 20 additions & 12 deletions)

```diff
@@ -1,15 +1,19 @@
 import pathlib
+import tempfile
 from typing import Union
 
 import lightgbm as lgb
-from lgbm_booster import dump_lgbm
+import numpy as np
 from lightgbm import Booster
+from utils import evaluate_compression_performance, load_data
 
-from examples.utils import load_data, print_model_size
+from pickle_compression import dump_lgbm_compressed
+from pickle_compression.lgbm_booster import dump_lgbm
+from pickle_compression.pickling import load_compressed
 
 
 def train_model() -> lgb.LGBMRegressor:
-    regressor = lgb.LGBMRegressor(n_estimators=1, random_state=42)
+    regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
     regressor.fit(*load_data())
     return regressor
 
@@ -23,16 +27,20 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]):
         f.write(booster.model_to_string())
 
 
-# model = load_model("examples/lgb1-base.model")
 model = train_model()
-# dump_model_string(model.booster_, "great_lakes_1.model")
-
-# x, y = load_data()
-# model = load_model("great_lakes_1.model")
-# model_new = load_model("great_lakes_1_omit_values.model")
+with tempfile.TemporaryDirectory() as tmpdir:
+    path = pathlib.Path(tmpdir) / "model.pkl"
+    dump_lgbm_compressed(model, path, "no")
+    model_compressed = load_compressed(path, "no")
 
-# y_pred = model.predict(x)
-# y_pred_new = model_new.predict(x)
-# diff = y_pred - y_pred_new
-# print(diff.max(), diff.min(), diff.mean(), diff.std())
-
-print_model_size(model, dump_lgbm)
+pathlib.Path("examples/out").mkdir(exist_ok=True)
+dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model")
+
+x, y = load_data()
+y_pred = model.predict(x)
+y_pred_new = model_compressed.predict(x)
+print(f"Maximum prediction difference: {np.abs(y_pred - y_pred_new).max()}")
+
+evaluate_compression_performance(model, dump_lgbm)
```
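Note that `dump_lgbm_compressed`, imported above from `pickle_compression`, is not part of this diff. A plausible reading of the example is that it simply binds the LightGBM-specific `dump_lgbm` into the generic compressed-pickling helper; the following is a hypothetical sketch of that glue, not the package's actual code:

```python
# Hypothetical sketch: assumes dump_lgbm_compressed forwards to the generic
# dump_compressed helper with dump_lgbm as the pickling routine. The default
# compression value is an assumption as well.
from pickle_compression.lgbm_booster import dump_lgbm
from pickle_compression.pickling import dump_compressed


def dump_lgbm_compressed(model, path, compression="no"):
    dump_compressed(model, path, compression, dump_lgbm)
```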
examples/pickle_sklearn.py (4 changes: 2 additions & 2 deletions)

```diff
@@ -1,7 +1,7 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn_tree import dump_sklearn
 
-from examples.utils import load_data, print_model_size
+from examples.utils import evaluate_compression_performance, load_data
 
 
 def train_model() -> RandomForestRegressor:
@@ -12,4 +12,4 @@ def train_model() -> RandomForestRegressor:
 
 model = train_model()
 
-print_model_size(model, dump_sklearn)
+evaluate_compression_performance(model, dump_sklearn)
```
examples/utils.py (48 changes: 32 additions & 16 deletions)

```diff
@@ -1,14 +1,17 @@
+import os
+import tempfile
 import time
 from itertools import product
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Tuple
 
 import pandas as pd
-from pickling import get_pickled_size
 from sklearn.preprocessing import LabelEncoder
 
+from pickle_compression.pickling import dump_compressed, load_compressed
+
 
 def load_data() -> Tuple[pd.DataFrame, pd.Series]:
-    df = pd.read_csv("great_lakes_1.csv")
+    df = pd.read_csv("examples/great_lakes_1.csv")
     df.drop(["lat", "long"], axis=1, inplace=True)
     cols = ["region", "type", "laundry_options", "parking_options"]
     label_encoder = LabelEncoder()
@@ -24,18 +27,31 @@ def load_data() -> Tuple[pd.DataFrame, pd.Series]:
     return X, y
 
 
-def print_model_size(
-    model: Any, dump: Callable, compressions: Optional[List[str]] = None
+def evaluate_compression_performance(
+    model: Any, dump: Callable, print_performance: bool = True
 ):
-    if not compressions:
-        compressions = ["no", "lzma", "bz2", "gzip"]
+    compressions = ["no", "lzma", "bz2", "gzip"]
+    performance = []
     for compression, dump_function in product(compressions, [None, dump]):
-        start = time.time()
-        size = get_pickled_size(
-            model, compression=compression, dump_function=dump_function
-        )
-        print(
-            f"Compression {compression}, "
-            f"dump_function {None if not dump_function else dump_function.__name__}: "
-            f"{size / 2 ** 20:.2f} MB / {time.time() - start:.2f} s"
-        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = f"{tmpdir}/model"
+            start = time.time()
+            dump_compressed(model, path, compression, dump_function)
+            dump_time = time.time() - start
+            start = time.time()
+            load_compressed(path, compression)
+            load_time = time.time() - start
+            size = os.path.getsize(path)
+        performance += [
+            {
+                "compression": compression,
+                "dump_function": dump_function.__name__ if dump_function else None,
+                "size": f"{size / 2 ** 20:.2f} MB",
+                "dump_time": f"{dump_time:.3f} s",
+                "load_time": f"{load_time:.3f} s",
+            }
+        ]
+    df = pd.DataFrame(performance)
+    if print_performance:
+        print(df.to_string(index=False))
+    return df
```
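`dump_compressed` and `load_compressed` themselves live in `pickle_compression.pickling`, which this commit does not show. A minimal sketch of what the benchmarking code above assumes about them, mapping each compression name onto the matching stdlib opener (the package's real implementation may differ):

```python
import bz2
import gzip
import lzma
import pickle
from typing import Any, Callable, Optional

# Assumed mapping from the compression names used above to file openers.
_OPENERS = {
    "no": open,
    "lzma": lzma.open,
    "bz2": bz2.open,
    "gzip": gzip.open,
}


def dump_compressed(
    model: Any, path: str, compression: str, dump_function: Optional[Callable] = None
):
    """Pickle `model` to `path`; `dump_function(model, file)` is assumed to be
    the signature of custom dumpers such as dump_lgbm / dump_sklearn."""
    with _OPENERS[compression](path, "wb") as f:
        if dump_function is not None:
            dump_function(model, f)
        else:
            pickle.dump(model, f)


def load_compressed(path: str, compression: str) -> Any:
    """Load a model written by dump_compressed."""
    with _OPENERS[compression](path, "rb") as f:
        return pickle.load(f)
```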
pickle_compression/compression_utils.py (51 changes: 51 additions & 0 deletions, new file)

```diff
@@ -0,0 +1,51 @@
+import numpy as np
+
+
+def _is_in_neighborhood_of_int(arr, iinfo, eps=1e-12):
+    """
+    Check element-wise whether the values of `arr` lie within `eps` of an
+    integer representable in the given integer type.
+    `np.abs(arr % 1 - 1) < eps` catches values just below an integer and
+    `arr % 1 < eps` catches values just above one; the minimum of the two
+    covers both sides.
+    """
+    return (
+        (np.minimum(np.abs(arr % 1 - 1), arr % 1) < eps)
+        & (arr >= iinfo.min)
+        & (arr <= iinfo.max)
+    )
+
+
+def compress_half_int_float_array(a, compression_dtype="int8"):
+    """Compress small integer and half-integer floats in a lossless fashion.
+    Idea:
+        If most values in array <a> are small integers or half-integers, we can
+        store 2 * a for those entries in a small integer dtype, while keeping
+        the rest as float64.
+    Technical details:
+        - The boolean array (2 * a) % 1 == 0 marks the integers and half-integers in <a>.
+        - int8 can represent integers between np.iinfo('int8').min and np.iinfo('int8').max.
+    """
+    info = np.iinfo(compression_dtype)
+    a2 = 2.0 * a
+    is_compressible = _is_in_neighborhood_of_int(a2, info)
+    not_compressible = np.logical_not(is_compressible)
+
+    a2_compressible = a2[is_compressible].astype(compression_dtype)
+    a_incompressible = a[not_compressible]
+
+    state = {
+        "is_compressible": is_compressible,
+        "a2_compressible": a2_compressible,
+        "a_incompressible": a_incompressible,
+    }
+
+    return state
+
+
+def decompress_half_int_float_array(state):
+    is_compressible = state["is_compressible"]
+    a = np.zeros(len(is_compressible), dtype="float64")
+    a[is_compressible] = state["a2_compressible"] / 2.0
+    a[np.logical_not(is_compressible)] = state["a_incompressible"]
+    return a
```
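A quick round-trip check (not part of the commit) shows the intent: values whose doubles fit in int8 are stored as one byte each, everything else stays float64, and decompression reproduces the input exactly:

```python
import numpy as np

from pickle_compression.compression_utils import (
    compress_half_int_float_array,
    decompress_half_int_float_array,
)

# Mostly small (half-)integers, plus one value that is not a half-integer.
a = np.array([0.0, 0.5, -1.5, 2.0, 3.14159, 50.5])

state = compress_half_int_float_array(a)
print(state["a2_compressible"])   # [0 1 -3 4 101] as int8 (each value is 2 * a)
print(state["a_incompressible"])  # [3.14159] kept as float64

restored = decompress_half_int_float_array(state)
assert np.array_equal(a, restored)  # lossless round trip
```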
(The remaining 7 changed files are not shown.)
