
Commit 98ec163
Add lgbm compression, refactoring (#5)
Co-authored-by: Yasin Tatar <[email protected]>
pavelzw and YYYasin19 committed Mar 4, 2023
1 parent 13228b8 commit 98ec163
Showing 12 changed files with 424 additions and 163 deletions.
examples/.gitignore (2 changes: 1 addition & 1 deletion)

```diff
@@ -1,2 +1,2 @@
 great_lakes_1*
-lgb1.model
+*.model
```
examples/pickle_lgbm.py (32 changes: 20 additions & 12 deletions)

```diff
@@ -1,15 +1,19 @@
 import pathlib
+import tempfile
 from typing import Union
 
 import lightgbm as lgb
-from lgbm_booster import dump_lgbm
+import numpy as np
 from lightgbm import Booster
+from utils import evaluate_compression_performance, load_data
 
-from examples.utils import load_data, print_model_size
+from pickle_compression import dump_lgbm_compressed
+from pickle_compression.lgbm_booster import dump_lgbm
+from pickle_compression.pickling import load_compressed
 
 
 def train_model() -> lgb.LGBMRegressor:
-    regressor = lgb.LGBMRegressor(n_estimators=1, random_state=42)
+    regressor = lgb.LGBMRegressor(n_estimators=100, random_state=42)
     regressor.fit(*load_data())
     return regressor
 
@@ -23,16 +27,20 @@ def dump_model_string(booster: Booster, path: Union[str, pathlib.Path]):
         f.write(booster.model_to_string())
 
 
-# model = load_model("examples/lgb1-base.model")
 model = train_model()
-# dump_model_string(model.booster_, "great_lakes_1.model")
-
-# x, y = load_data()
-# model = load_model("great_lakes_1.model")
-# model_new = load_model("great_lakes_1_omit_values.model")
+with tempfile.TemporaryDirectory() as tmpdir:
+    path = pathlib.Path(tmpdir) / "model.pkl"
+    dump_lgbm_compressed(model, path, "no")
+    model_compressed = load_compressed(path, "no")
 
-# y_pred = model.predict(x)
-# y_pred_new = model_new.predict(x)
-# diff = y_pred - y_pred_new
-# print(diff.max(), diff.min(), diff.mean(), diff.std())
-
-print_model_size(model, dump_lgbm)
+pathlib.Path("examples/out").mkdir(exist_ok=True)
+dump_model_string(model_compressed.booster_, "examples/out/model_compressed.model")
+
+x, y = load_data()
+y_pred = model.predict(x)
+y_pred_new = model_compressed.predict(x)
+print(f"Maximum prediction difference: {np.abs(y_pred - y_pred_new).max()}")
+
+evaluate_compression_performance(model, dump_lgbm)
```
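Note that `dump_lgbm_compressed`, imported above from `pickle_compression`, is not part of this diff. A plausible reading of the example is that it simply binds the LightGBM-specific `dump_lgbm` into the generic compressed-pickling helper; the following is a hypothetical sketch of that glue, not the package's actual code:

```python
# Hypothetical sketch: assumes dump_lgbm_compressed forwards to the generic
# dump_compressed helper with dump_lgbm as the pickling routine. The default
# compression value is an assumption as well.
from pickle_compression.lgbm_booster import dump_lgbm
from pickle_compression.pickling import dump_compressed


def dump_lgbm_compressed(model, path, compression="no"):
    dump_compressed(model, path, compression, dump_lgbm)
```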
examples/pickle_sklearn.py (4 changes: 2 additions & 2 deletions)

```diff
@@ -1,7 +1,7 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn_tree import dump_sklearn
 
-from examples.utils import load_data, print_model_size
+from examples.utils import evaluate_compression_performance, load_data
 
 
 def train_model() -> RandomForestRegressor:
@@ -12,4 +12,4 @@ def train_model() -> RandomForestRegressor:
 
 model = train_model()
 
-print_model_size(model, dump_sklearn)
+evaluate_compression_performance(model, dump_sklearn)
```
examples/utils.py (48 changes: 32 additions & 16 deletions)

```diff
@@ -1,14 +1,17 @@
+import os
+import tempfile
 import time
 from itertools import product
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Tuple
 
 import pandas as pd
-from pickling import get_pickled_size
 from sklearn.preprocessing import LabelEncoder
 
+from pickle_compression.pickling import dump_compressed, load_compressed
+
 
 def load_data() -> Tuple[pd.DataFrame, pd.Series]:
-    df = pd.read_csv("great_lakes_1.csv")
+    df = pd.read_csv("examples/great_lakes_1.csv")
     df.drop(["lat", "long"], axis=1, inplace=True)
     cols = ["region", "type", "laundry_options", "parking_options"]
     label_encoder = LabelEncoder()
@@ -24,18 +27,31 @@ def load_data() -> Tuple[pd.DataFrame, pd.Series]:
     return X, y
 
 
-def print_model_size(
-    model: Any, dump: Callable, compressions: Optional[List[str]] = None
+def evaluate_compression_performance(
+    model: Any, dump: Callable, print_performance: bool = True
 ):
-    if not compressions:
-        compressions = ["no", "lzma", "bz2", "gzip"]
+    compressions = ["no", "lzma", "bz2", "gzip"]
+    performance = []
     for compression, dump_function in product(compressions, [None, dump]):
-        start = time.time()
-        size = get_pickled_size(
-            model, compression=compression, dump_function=dump_function
-        )
-        print(
-            f"Compression {compression}, "
-            f"dump_function {None if not dump_function else dump_function.__name__}: "
-            f"{size / 2 ** 20:.2f} MB / {time.time() - start:.2f} s"
-        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = f"{tmpdir}/model"
+            start = time.time()
+            dump_compressed(model, path, compression, dump_function)
+            dump_time = time.time() - start
+            start = time.time()
+            load_compressed(path, compression)
+            load_time = time.time() - start
+            size = os.path.getsize(path)
+        performance += [
+            {
+                "compression": compression,
+                "dump_function": dump_function.__name__ if dump_function else None,
+                "size": f"{size / 2 ** 20:.2f} MB",
+                "dump_time": f"{dump_time:.3f} s",
+                "load_time": f"{load_time:.3f} s",
+            }
+        ]
+    df = pd.DataFrame(performance)
+    if print_performance:
+        print(df.to_string(index=False))
+    return df
```
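`dump_compressed` and `load_compressed` themselves live in `pickle_compression.pickling`, which this commit does not show. A minimal sketch of what the benchmarking code above assumes about them, mapping each compression name onto the matching stdlib opener (the package's real implementation may differ):

```python
import bz2
import gzip
import lzma
import pickle
from typing import Any, Callable, Optional

# Assumed mapping from the compression names used above to file openers.
_OPENERS = {
    "no": open,
    "lzma": lzma.open,
    "bz2": bz2.open,
    "gzip": gzip.open,
}


def dump_compressed(
    model: Any, path: str, compression: str, dump_function: Optional[Callable] = None
):
    """Pickle `model` to `path`; `dump_function(model, file)` is assumed to be
    the signature of custom dumpers such as dump_lgbm / dump_sklearn."""
    with _OPENERS[compression](path, "wb") as f:
        if dump_function is not None:
            dump_function(model, f)
        else:
            pickle.dump(model, f)


def load_compressed(path: str, compression: str) -> Any:
    """Load a model written by dump_compressed."""
    with _OPENERS[compression](path, "rb") as f:
        return pickle.load(f)
```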
pickle_compression/compression_utils.py (51 changes: 51 additions & 0 deletions, new file)

```diff
@@ -0,0 +1,51 @@
+import numpy as np
+
+
+def _is_in_neighborhood_of_int(arr, iinfo, eps=1e-12):
+    """
+    Check element-wise whether the values of `arr` lie within `eps` of an
+    integer representable in the given integer type.
+    `np.abs(arr % 1 - 1) < eps` catches values just below an integer and
+    `arr % 1 < eps` catches values just above one; the minimum of the two
+    covers both sides.
+    """
+    return (
+        (np.minimum(np.abs(arr % 1 - 1), arr % 1) < eps)
+        & (arr >= iinfo.min)
+        & (arr <= iinfo.max)
+    )
+
+
+def compress_half_int_float_array(a, compression_dtype="int8"):
+    """Compress small integer and half-integer floats in a lossless fashion.
+    Idea:
+        If most values in array <a> are small integers or half-integers, we can
+        store 2 * a for those entries in a small integer dtype, while keeping
+        the rest as float64.
+    Technical details:
+        - The boolean array (2 * a) % 1 == 0 marks the integers and half-integers in <a>.
+        - int8 can represent integers between np.iinfo('int8').min and np.iinfo('int8').max.
+    """
+    info = np.iinfo(compression_dtype)
+    a2 = 2.0 * a
+    is_compressible = _is_in_neighborhood_of_int(a2, info)
+    not_compressible = np.logical_not(is_compressible)
+
+    a2_compressible = a2[is_compressible].astype(compression_dtype)
+    a_incompressible = a[not_compressible]
+
+    state = {
+        "is_compressible": is_compressible,
+        "a2_compressible": a2_compressible,
+        "a_incompressible": a_incompressible,
+    }
+
+    return state
+
+
+def decompress_half_int_float_array(state):
+    is_compressible = state["is_compressible"]
+    a = np.zeros(len(is_compressible), dtype="float64")
+    a[is_compressible] = state["a2_compressible"] / 2.0
+    a[np.logical_not(is_compressible)] = state["a_incompressible"]
+    return a
```
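A quick round-trip check (not part of the commit) shows the intent: values whose doubles fit in int8 are stored as one byte each, everything else stays float64, and decompression reproduces the input exactly:

```python
import numpy as np

from pickle_compression.compression_utils import (
    compress_half_int_float_array,
    decompress_half_int_float_array,
)

# Mostly small (half-)integers, plus one value that is not a half-integer.
a = np.array([0.0, 0.5, -1.5, 2.0, 3.14159, 50.5])

state = compress_half_int_float_array(a)
print(state["a2_compressible"])   # [0 1 -3 4 101] as int8 (each value is 2 * a)
print(state["a_incompressible"])  # [3.14159] kept as float64

restored = decompress_half_int_float_array(state)
assert np.array_equal(a, restored)  # lossless round trip
```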
(The remaining 7 changed files are not shown.)
