Add examples, rename methods #3

Merged · 1 commit · Feb 20, 2023
2 changes: 2 additions & 0 deletions examples/.gitignore
@@ -0,0 +1,2 @@
great_lakes_1.csv
lgb1.model
46 changes: 46 additions & 0 deletions examples/pickle_lgbm.py
@@ -0,0 +1,46 @@
import time
from itertools import product

import lightgbm as lgb
import pandas as pd
from lgbm_booster import dump_lgbm
from lightgbm import Booster
from pickling import get_pickled_size
from sklearn.preprocessing import LabelEncoder


def train_model():
    df = pd.read_csv("great_lakes_1.csv")
    df.drop(["lat", "long"], axis=1, inplace=True)
    cols = ["region", "type", "laundry_options", "parking_options"]
    label_encoder = LabelEncoder()
    mapping_dict = {}
    for col in cols:
        df[col] = label_encoder.fit_transform(df[col])
        le_name_mapping = dict(
            zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
        )
        mapping_dict[col] = le_name_mapping
    regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
    X = df.drop("price", axis=1)  # noqa: N806
    y = df["price"]
    regressor.fit(X, y)
    return regressor


def load_model():
    return Booster(model_file="lgb1.model")


model = load_model()

for compression, dump_function in product(
    ["no", "lzma", "bz2", "gzip"], [None, dump_lgbm]
):
    start = time.time()
    size = get_pickled_size(model, compression=compression, dump_function=dump_function)
    print(
        f"Compression {compression}, "
        f"dump_function {None if not dump_function else dump_function.__name__}: "
        f"{size / 2**20:.2f} MB / {time.time() - start:.2f} s"
    )
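Note: both example scripts import `get_pickled_size` from a `pickling` helper module that this diff does not include. A minimal sketch of what such a helper could look like, assuming it pickles the model into an in-memory buffer (optionally through a custom dump function such as `dump_lgbm` or `dump_sklearn`) and measures the payload after compression; the real implementation in `pickling.py` may differ:

import bz2
import gzip
import io
import lzma
import pickle


def get_pickled_size(model, compression="no", dump_function=None):
    # pickle into memory, optionally via a custom dump function
    buffer = io.BytesIO()
    if dump_function is not None:
        dump_function(model, buffer)
    else:
        pickle.dump(model, buffer)
    data = buffer.getvalue()
    # "no" leaves the payload uncompressed for a baseline measurement
    compressors = {
        "no": lambda raw: raw,
        "lzma": lzma.compress,
        "bz2": bz2.compress,
        "gzip": gzip.compress,
    }
    return len(compressors[compression](data))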
48 changes: 48 additions & 0 deletions examples/pickle_sklearn.py
@@ -0,0 +1,48 @@
import time
from itertools import product

import pandas as pd
from pickling import get_pickled_size
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn_tree import dump_sklearn


def train_model():
    df = pd.read_csv("great_lakes_1.csv")
    df.drop(["lat", "long"], axis=1, inplace=True)
    cols = ["region", "type", "laundry_options", "parking_options"]
    label_encoder = LabelEncoder()
    mapping_dict = {}
    for col in cols:
        df[col] = label_encoder.fit_transform(df[col])
        le_name_mapping = dict(
            zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
        )
        mapping_dict[col] = le_name_mapping
    X = df.drop("price", axis=1)  # noqa: N806
    y = df["price"]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # noqa: N806
    X_train, X_test, y_train, y_test = train_test_split(  # noqa: N806
        X_scaled, y, test_size=0.3, random_state=42
    )

    regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    regressor.fit(X_train, y_train)
    return regressor


model = train_model()

for compression, dump_function in product(
    ["no", "lzma", "bz2", "gzip"], [None, dump_sklearn]
):
    start = time.time()
    size = get_pickled_size(model, compression=compression, dump_function=dump_function)
    print(
        f"Compression {compression}, "
        f"dump_function {None if not dump_function else dump_function.__name__}: "
        f"{size / 2**20:.2f} MB / {time.time() - start:.2f} s"
    )
12 changes: 6 additions & 6 deletions pickle_compression/__init__.py
@@ -23,7 +23,7 @@
 __version__ = "unknown"


-def pickle_sklearn_compressed(
+def dump_sklearn_compressed(
     model: Any, path: Union[str, Path], compression: Union[str, dict] = "lzma"
 ):
     """
@@ -37,14 +37,14 @@ def pickle_sklearn_compressed(
     of the compression library.
     Inspired by the pandas.to_csv interface.
     """
-    from pickle_compression.sklearn_tree import pickle_sklearn_compressed
+    from pickle_compression.sklearn_tree import dump_sklearn

-    dump_compressed(model, path, compression, pickle_sklearn_compressed)
+    dump_compressed(model, path, compression, dump_sklearn)


-def pickle_booster_compressed(
+def dump_lgbm_compressed(
     model: Any, path: Union[str, Path], compression: Union[str, dict] = "lzma"
 ):
-    from pickle_compression.lgbm_booster import pickle_lgbm_compressed
+    from pickle_compression.lgbm_booster import dump_lgbm

-    dump_compressed(model, path, compression, pickle_lgbm_compressed)
+    dump_compressed(model, path, compression, dump_lgbm)
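With this rename, the package-level entry points are `dump_sklearn_compressed` and `dump_lgbm_compressed`. A small usage sketch (the synthetic model and file path are illustrative, not part of this PR; `load_compressed` is the counterpart used in the tests below):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

from pickle_compression import dump_sklearn_compressed
from pickle_compression.pickling import load_compressed

# train a throwaway model on synthetic data
X, y = make_regression(n_samples=100, n_features=4, random_state=42)
model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X, y)

# dump with the compressed tree representation, then load it back
dump_sklearn_compressed(model, "model.pickle.lzma", compression="lzma")
restored = load_compressed("model.pickle.lzma", "lzma")
print(restored.predict(X[:3]))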
27 changes: 14 additions & 13 deletions pickle_compression/lgbm_booster.py
@@ -2,7 +2,7 @@
 import os
 import pickle
 import sys
-from typing import BinaryIO
+from typing import Any, BinaryIO

 try:
     from lightgbm.basic import Booster
@@ -11,18 +11,18 @@
     sys.exit(os.EX_CONFIG)


-def pickle_lgbm_compressed(model: Booster, file: BinaryIO):
+def dump_lgbm(model: Any, file: BinaryIO):
     p = pickle.Pickler(file)
     p.dispatch_table = copyreg.dispatch_table.copy()
-    p.dispatch_table[Booster] = _compressed_lgbm_pickle
+    p.dispatch_table[Booster] = _compressed_booster_pickle
     p.dump(model)


-def _compressed_lgbm_pickle(lgbm_booster: Booster):
-    assert isinstance(lgbm_booster, Booster)
+def _compressed_booster_pickle(booster: Booster):
+    assert isinstance(booster, Booster)

     # retrieve
-    cls, init_args, _ = lgbm_booster.__reduce__()
+    cls, init_args, state = booster.__reduce__()

     # extract state information
     """
@@ -43,33 +43,34 @@ def _compressed_lgbm_pickle(lgbm_booster: Booster):
     # keys: (['name', 'version', 'num_class', 'num_tree_per_iteration', 'label_index',
     # 'max_feature_idx', 'objective', 'average_output', 'feature_names', 'monotone_constraints',
     # 'feature_infos', 'tree_info', 'feature_importances', 'pandas_categorical']
-    dump_dict = lgbm_booster.dump_model()
+    booster.dump_model()

     # transform and compress state
-    compressed_state = _compress_lgbm_state(dump_dict)
+    compressed_state = _compress_lgbm_state(state)

     # return function to unpickle again
-    return _compressed_lgbm_unpickle, (cls, init_args, compressed_state)
+    return _compressed_booster_unpickle, (cls, init_args, compressed_state)


-def _compressed_lgbm_unpickle(cls, init_args, compressed_state):
+def _compressed_booster_unpickle(cls, init_args, compressed_state):
     _class, base, state = init_args  # unpack
     state = {"model_str": compressed_state}
     cls(_class, base, state)
-    model_string = _decompress_lgbm_state(compressed_state)
+    state = _decompress_lgbm_state(compressed_state)
+    model_string = state["handle"]
     # https://github.com/microsoft/LightGBM/issues/5370
     # currently it's not possible to de-serialize out of the JSON/dict again
     # tree.__setstate__(decompressed_state)
     # TODO: find a way to create a Booster back again from it's state representation
     return Booster(model_str=model_string)


-def _compress_lgbm_state(booster: Booster):
+def _compress_lgbm_state(state: dict):
     """
     For a given state dictionary, store data in a structured format that can then
     be saved to disk in a way that can be compressed.
     """
-    return booster  # booster.model_to_string() # TODO: actually _do_ something
+    return state  # TODO: actually _do_ something


 def _decompress_lgbm_state(compressed_state):
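All of the dump functions in this PR use the same `copyreg` dispatch-table mechanism: a per-`Pickler` table maps a class to a custom reduce function, which returns an unpickle callable together with the (compressed) arguments needed to rebuild the object, so a plain `pickle.load` can restore it. A self-contained sketch of the pattern on a toy class, independent of LightGBM:

import copyreg
import io
import pickle


class Matrix:
    def __init__(self, rows):
        self.rows = rows


def _unpickle_matrix(n_rows, flat):
    # rebuild the object from the reduced representation
    width = len(flat) // n_rows
    return Matrix([flat[i * width:(i + 1) * width] for i in range(n_rows)])


def _pickle_matrix(matrix):
    # flatten the state; a real reducer would compress it here
    flat = [value for row in matrix.rows for value in row]
    return _unpickle_matrix, (len(matrix.rows), flat)


buffer = io.BytesIO()
pickler = pickle.Pickler(buffer)
pickler.dispatch_table = copyreg.dispatch_table.copy()
pickler.dispatch_table[Matrix] = _pickle_matrix  # affects only this Pickler
pickler.dump(Matrix([[1, 2], [3, 4]]))

restored = pickle.loads(buffer.getvalue())  # plain unpickling works
assert restored.rows == [[1, 2], [3, 4]]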
26 changes: 13 additions & 13 deletions pickle_compression/sklearn_tree.py
@@ -14,28 +14,28 @@
 import numpy as np


-def pickle_sklearn_compressed(model: Any, file: BinaryIO):
+def dump_sklearn(model: Any, file: BinaryIO):
     p = pickle.Pickler(file)
     p.dispatch_table = copyreg.dispatch_table.copy()
-    p.dispatch_table[Tree] = compressed_tree_pickle
+    p.dispatch_table[Tree] = _compressed_tree_pickle
     p.dump(model)


-def compressed_tree_pickle(tree):
+def _compressed_tree_pickle(tree):
     assert isinstance(tree, Tree)
     cls, init_args, state = tree.__reduce__()
-    compressed_state = compress_tree_state(state)
-    return compressed_tree_unpickle, (cls, init_args, compressed_state)
+    compressed_state = _compress_tree_state(state)
+    return _compressed_tree_unpickle, (cls, init_args, compressed_state)


-def compressed_tree_unpickle(cls, init_args, state):
+def _compressed_tree_unpickle(cls, init_args, state):
     tree = cls(*init_args)
-    decompressed_state = decompress_tree_state(state)
+    decompressed_state = _decompress_tree_state(state)
     tree.__setstate__(decompressed_state)
     return tree


-def compress_tree_state(state: dict):
+def _compress_tree_state(state: dict):
     """
     Compresses a Tree state.
     :param state: dictionary with 'max_depth', 'node_count', 'nodes', 'values' as keys.
@@ -68,7 +68,7 @@ def compress_tree_state(state: dict):
     values = state["values"][is_leaf].astype(dtype_value)
     # do lossless compression for thresholds by downcasting half ints (e.g. 5.5, 10.5, ...) to int8
     thresholds = nodes["threshold"][is_not_leaf].astype(dtype_threshold)
-    thresholds = compress_half_int_float_array(thresholds)
+    thresholds = _compress_half_int_float_array(thresholds)

     return {
         "max_depth": state["max_depth"],
@@ -81,7 +81,7 @@
     }


-def decompress_tree_state(state: dict):
+def _decompress_tree_state(state: dict):
     """
     Decompresses a Tree state.
     :param state: 'children_left', 'children_right', 'features', 'thresholds', 'values' as keys.
@@ -113,7 +113,7 @@
     children_right[is_leaf] = -1
     features[is_not_leaf] = state["features"]
     features[is_leaf] = -2  # feature of leaves is -2
-    thresholds[is_not_leaf] = decompress_half_int_float_array(state["thresholds"])
+    thresholds[is_not_leaf] = _decompress_half_int_float_array(state["thresholds"])
     thresholds[is_leaf] = -2  # threshold of leaves is -2
     values[is_leaf] = state["values"]

@@ -156,7 +156,7 @@ def _is_in_neighborhood_of_int(arr, iinfo, eps=1e-12):
     )


-def compress_half_int_float_array(a, compression_dtype="int8"):
+def _compress_half_int_float_array(a, compression_dtype="int8"):
     """Compress small integer and half-integer floats in a lossless fashion

     Idea:
@@ -184,7 +184,7 @@ def compress_half_int_float_array(a, compression_dtype="int8"):
     return state


-def decompress_half_int_float_array(state):
+def _decompress_half_int_float_array(state):
     is_compressible = state["is_compressible"]
     a = np.zeros(len(is_compressible), dtype="float64")
     a[is_compressible] = state["a2_compressible"] / 2.0
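The threshold trick above exploits the fact that split thresholds on integer-valued features are typically half-integers (2.5, 10.5, ...). A minimal sketch of the idea behind `_compress_half_int_float_array`, whose full body is not shown in this diff: double the array, store values whose double is a small integer as int8 behind a boolean mask, and keep the rest as float64 (`a_incompressible` is an assumed key name; only `is_compressible` and `a2_compressible` appear in the excerpt).

import numpy as np


def compress_half_ints(a, eps=1e-12):
    a2 = a * 2.0
    info = np.iinfo("int8")
    # doubles that land (within eps) on a small integer fit losslessly in int8
    is_compressible = (
        (np.abs(a2 - np.round(a2)) < eps) & (a2 >= info.min) & (a2 <= info.max)
    )
    return {
        "is_compressible": is_compressible,
        "a2_compressible": np.round(a2[is_compressible]).astype("int8"),
        "a_incompressible": a[~is_compressible],  # assumed key name
    }


def decompress_half_ints(state):
    mask = state["is_compressible"]
    a = np.zeros(len(mask), dtype="float64")
    a[mask] = state["a2_compressible"] / 2.0  # halve to recover the floats
    a[~mask] = state["a_incompressible"]
    return a


values = np.array([0.5, 2.5, np.pi, 10.5, 1e5])
assert np.array_equal(values, decompress_half_ints(compress_half_ints(values)))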
16 changes: 8 additions & 8 deletions tests/test_sklearn_compression.py
@@ -7,12 +7,12 @@
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.utils import check_random_state

-from pickle_compression import pickle_sklearn_compressed
+from pickle_compression import dump_sklearn_compressed
 from pickle_compression.pickling import dump_compressed, load_compressed
 from pickle_compression.sklearn_tree import (
+    _compress_half_int_float_array,
+    _decompress_half_int_float_array,
     _is_in_neighborhood_of_int,
-    compress_half_int_float_array,
-    decompress_half_int_float_array,
 )


@@ -41,7 +41,7 @@ def test_compressed_predictions(diabetes_toy_df, random_forest_regressor, tmp_path):
     random_forest_regressor.fit(X, y)

     model_path = tmp_path / "model_dtype_reduction.pickle.lzma"
-    pickle_sklearn_compressed(random_forest_regressor, model_path)
+    dump_sklearn_compressed(random_forest_regressor, model_path)
     model_dtype_reduction = load_compressed(model_path, "lzma")
     prediction_no_reduction = random_forest_regressor.predict(X)
     prediction_reduction = model_dtype_reduction.predict(X)
@@ -55,7 +55,7 @@ def test_compressed_internal_structure(
     decision_tree_regressor.fit(X, y)

     model_path = tmp_path / "model_dtype_reduction.pickle.lzma"
-    pickle_sklearn_compressed(decision_tree_regressor, model_path)
+    dump_sklearn_compressed(decision_tree_regressor, model_path)
     model_dtype_reduction = load_compressed(model_path, "lzma")

     tree_no_reduction = decision_tree_regressor.tree_
@@ -86,7 +86,7 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):

     model_path_dtype_reduction = tmp_path / "model_dtype_reduction.pickle.lzma"
     model_path_no_reduction = tmp_path / "model_no_reduction.pickle.lzma"
-    pickle_sklearn_compressed(random_forest_regressor, model_path_dtype_reduction)
+    dump_sklearn_compressed(random_forest_regressor, model_path_dtype_reduction)
     dump_compressed(random_forest_regressor, model_path_no_reduction)
     size_no_reduction = os.path.getsize(model_path_no_reduction)
     size_dtype_reduction = os.path.getsize(model_path_dtype_reduction)
@@ -95,8 +95,8 @@ def test_compression_size(diabetes_toy_df, random_forest_regressor, tmp_path):

 def test_compress_half_int_float_array():
     a1 = np.array([0, 1, 2.5, np.pi, -np.pi, 1e5, 35.5, 2.50000000001])
-    state = compress_half_int_float_array(a1)
-    np.testing.assert_array_equal(a1, decompress_half_int_float_array(state))
+    state = _compress_half_int_float_array(a1)
+    np.testing.assert_array_equal(a1, _decompress_half_int_float_array(state))


 def test_compress_is_compressible_edge_cases():