
Commit

Add linear models
RAMitchell committed Sep 7, 2023
1 parent 5f6f6d3 commit f8f9248
Showing 12 changed files with 242 additions and 57 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -49,6 +49,16 @@ for i in range(total_estimators // estimators_per_batch):

The above example can be found here: [examples/batch_training](examples/batch_training/README.md).

### Different model types
Legateboost supports tree models, linear models, kernel ridge regression models, custom user models, and any combination of these.

The following example shows a model combining linear and decision tree base learners.

```python
model = lb.LBRegressor(base_models=(lb.models.Linear(),)*5 + (lb.models.Tree(max_depth=1),)*15, **params).fit(X, y)
```

<img src="examples/linear_model/linear_model.png" alt="drawing" width="800"/>

## Installation

4 changes: 4 additions & 0 deletions examples/linear_model/README.md
@@ -0,0 +1,4 @@
# Linear model
This example shows how to train a mixed model with linear and tree components. The dataset is a linear function with added noise and a step in the middle. The step is difficult for a linear model to capture, while the sloped sections are difficult for a tree model (note the characteristic axis-aligned step function it produces). We create a combined model by boosting 5 iterations of a linear base learner followed by 15 iterations of a tree base learner, as sketched below. The result is a model that fits both the linear trend and the step better than either model alone.
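
The combined model is constructed roughly as follows (a condensed sketch; the full, runnable script is `linear_model.py` in this directory):

```python
# 5 rounds with a linear base learner, then 15 rounds with depth-1 trees.
model = lb.LBRegressor(
    n_estimators=20,
    base_models=(lb.models.Linear(),) * 5 + (lb.models.Tree(max_depth=1),) * 15,
).fit(X, y)
```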

<img src="linear_model.png" alt="drawing" width="800"/>
3 changes: 3 additions & 0 deletions examples/linear_model/linear_model.png
Binary file not shown.
60 changes: 60 additions & 0 deletions examples/linear_model/linear_model.py
@@ -0,0 +1,60 @@
from pathlib import Path

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter

import cunumeric as cn
import legateboost as lb

sns.set()
plt.rcParams["font.family"] = "serif"

rs = cn.random.RandomState(42)
X = cn.linspace(0, 10, 200)[:, cn.newaxis]
y_true = X[:, 0].copy()
y_true[X.shape[0] // 2 :] += 3.0
y = y_true + rs.normal(0, 0.25, X.shape[0])
params = {"n_estimators": 20, "learning_rate": 0.5, "verbose": True, "random_state": 20}
eval_result = {}
linear_model = lb.LBRegressor(base_models=(lb.models.Linear(),), **params).fit(
X, y, eval_set=[(X, y_true)], eval_result=eval_result
)
linear_test_error = cn.sqrt(eval_result["eval-0"]["mse"])
tree_model = lb.LBRegressor(base_models=(lb.models.Tree(max_depth=1),), **params).fit(
X, y, eval_set=[(X, y_true)], eval_result=eval_result
)
tree_test_error = cn.sqrt(eval_result["eval-0"]["mse"])
model = lb.LBRegressor(
base_models=(lb.models.Linear(),) * 5 + (lb.models.Tree(max_depth=1),) * 15,
**params
).fit(X, y, eval_set=[(X, y_true)], eval_result=eval_result)
mixed_test_error = cn.sqrt(eval_result["eval-0"]["mse"])

# plot
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))
sns.scatterplot(x=X[:, 0], y=y, color=".2", alpha=0.5, label="f(x)+noise", ax=ax[0])
sns.lineplot(x=X[:, 0], y=linear_model.predict(X), label="linear model", ax=ax[0])
sns.lineplot(x=X[:, 0], y=tree_model.predict(X), label="tree model", ax=ax[0])
sns.lineplot(x=X[:, 0], y=model.predict(X), label="linear + tree model", ax=ax[0])
ax[0].set_xlabel("X")

sns.lineplot(
x=range(params["n_estimators"]), y=linear_test_error, label="linear model", ax=ax[1]
)
sns.lineplot(
x=range(params["n_estimators"]), y=tree_test_error, label="tree model", ax=ax[1]
)
sns.lineplot(
x=range(params["n_estimators"]),
y=mixed_test_error,
label="linear + tree model",
ax=ax[1],
)
ax[1].set_xlabel("n_estimators")
ax[1].set_ylabel("test error")
plt.suptitle("Linear Models + Tree Models")
plt.tight_layout()
image_dir = Path(__file__).parent
plt.savefig(image_dir / "linear_model.png")
52 changes: 25 additions & 27 deletions legateboost/legateboost.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import warnings
from copy import deepcopy
from typing import Any, List, Optional, Tuple, Union

import numpy as np
@@ -12,7 +13,7 @@

from .input_validation import check_sample_weight, check_X_y
from .metrics import BaseMetric, metrics
from .models import Tree
from .models import BaseModel, Tree
from .objectives import BaseObjective, objectives
from .utils import PickleCunumericMixin, preround

@@ -25,9 +26,9 @@ def __init__(
metric: Union[str, BaseMetric, list[Union[str, BaseMetric]]] = "default",
learning_rate: float = 0.1,
init: Union[str, None] = "average",
base_models: Tuple[BaseModel, ...] = (Tree(max_depth=3),),
verbose: int = 0,
random_state: Optional[np.random.RandomState] = None,
max_depth: int = 3,
version: str = "native",
) -> None:
self.n_estimators = n_estimators
Expand All @@ -37,9 +38,9 @@ def __init__(
self.init = init
self.verbose = verbose
self.random_state = random_state
self.max_depth = max_depth
self.version = version
self.model_init_: cn.ndarray
self.base_models = base_models

def _more_tags(self) -> Any:
return {
@@ -215,21 +216,19 @@ def _partial_fit(
# current model prediction
train_pred = self._predict(X)
eval_preds = [self._predict(X_eval) for X_eval, _, _ in _eval_set]
for _ in range(self.n_estimators):
for i in range(self.n_estimators):
# obtain gradients
g, h = self._get_weighted_gradient(
y, train_pred, sample_weight, self.learning_rate
)
# build new tree

# build new model
self.models_.append(
Tree(
X,
g,
h,
self.max_depth,
self.random_state_,
deepcopy(self.base_models[i % len(self.base_models)]).set_random_state(
self.random_state_
)
)
self.models_[-1].fit(X, g, h)

# update current predictions
train_pred += self.models_[-1].predict(X)
@@ -368,7 +367,7 @@ def fit(
"""
sample_weight = check_sample_weight(sample_weight, len(y))
self.n_features_in_ = X.shape[1]
self.models_: List[Tree] = []
self.models_: List[BaseModel] = []
# initialise random state if an integer was passed
self.random_state_ = check_random_state(self.random_state)

@@ -387,8 +386,6 @@
self.model_init_ = self._objective_instance.initialise_prediction(
y, sample_weight, self.init == "average"
)
self.sum_model_weights_ = sample_weight.sum()

self.is_fitted_ = True

return self._partial_fit(X, y, sample_weight, eval_set, eval_result)
@@ -407,7 +404,7 @@ def _predict(self, X: cn.ndarray) -> cn.ndarray:
pred += m.predict(X)
return pred

def dump_trees(self) -> str:
def dump_models(self) -> str:
check_is_fitted(self, "is_fitted_")
text = "init={}\n".format(self.model_init_)
for m in self.models_:
@@ -417,8 +414,7 @@ def dump_trees(self) -> str:

class LBRegressor(LBBase, RegressorMixin):
"""Implementation of a gradient boosting algorithm for regression problems.
Uses decision trees as weak learners and iteratively improves the model by
minimizing a loss function.
Learns component models that iteratively minimise a loss function.
Parameters
----------
Expand All @@ -431,19 +427,20 @@ class LBRegressor(LBBase, RegressorMixin):
the accompanying metric. Possible values: ['mse'] or instance of BaseMetric. Can
be a list of multiple metrics.
learning_rate :
The learning rate shrinks the contribution of each tree.
The learning rate shrinks the contribution of each model.
init :
The initial prediction of the model. If `None`, the initial prediction
is zero. If 'average', the initial prediction minimises a second order
approximation of the loss-function (simply the mean label in the case of
regression).
base_models :
The base models to use at each boosting iteration. The model used at
iteration i is base_models[i % len(base_models)].
verbose :
Controls the verbosity when fitting and predicting.
random_state :
Controls the randomness of the estimator. Pass an int for reproducible
results across multiple function calls.
max_depth :
The maximum depth of the decision trees.
Attributes
----------
@@ -477,19 +474,19 @@ def __init__(
metric: Union[str, BaseMetric, list[Union[str, BaseMetric]]] = "default",
learning_rate: float = 0.1,
init: Union[str, None] = "average",
base_models: Tuple[BaseModel, ...] = (Tree(max_depth=3),),
verbose: int = 0,
random_state: Optional[np.random.RandomState] = None,
max_depth: int = 3,
) -> None:
super().__init__(
n_estimators=n_estimators,
objective=objective,
metric=metric,
learning_rate=learning_rate,
init=init,
base_models=base_models,
verbose=verbose,
random_state=random_state,
max_depth=max_depth,
)

def _more_tags(self) -> Any:
@@ -581,18 +578,19 @@ class LBClassifier(LBBase, ClassifierMixin):
choose the accompanying metric. Possible values: ['log_loss', 'exp'] or
instance of BaseMetric. Can be a list of multiple metrics.
learning_rate :
The learning rate shrinks the contribution of each tree by `learning_rate`.
The learning rate shrinks the contribution of each model.
init :
The initial prediction of the model. If `None`, the initial prediction
is zero. If 'average', the initial prediction minimises a second order
approximation of the loss-function.
base_models :
The base models to use at each boosting iteration. The model used at
iteration i is base_models[i % len(base_models)].
verbose :
Controls the verbosity of the boosting process.
random_state :
Controls the randomness of the estimator. Pass an int for reproducible output
across multiple function calls.
max_depth :
The maximum depth of the individual trees.
Attributes
----------
@@ -627,19 +625,19 @@ def __init__(
metric: Union[str, BaseMetric, list[Union[str, BaseMetric]]] = "default",
learning_rate: float = 0.1,
init: Union[str, None] = "average",
base_models: Tuple[BaseModel, ...] = (Tree(max_depth=3),),
verbose: int = 0,
random_state: Optional[np.random.RandomState] = None,
max_depth: int = 3,
) -> None:
super().__init__(
n_estimators=n_estimators,
objective=objective,
metric=metric,
learning_rate=learning_rate,
init=init,
base_models=base_models,
verbose=verbose,
random_state=random_state,
max_depth=max_depth,
)

def partial_fit(
2 changes: 2 additions & 0 deletions legateboost/models/__init__.py
@@ -1 +1,3 @@
from .tree import Tree
from .linear import Linear
from .base_model import BaseModel
43 changes: 43 additions & 0 deletions legateboost/models/base_model.py
@@ -0,0 +1,43 @@
from abc import ABC, abstractmethod

import numpy as np

import cunumeric as cn

from ..utils import PickleCunumericMixin


class BaseModel(PickleCunumericMixin, ABC):
def set_random_state(self, random_state: np.random.RandomState) -> "BaseModel":
self.random_state = random_state
return self

@abstractmethod
def fit(
self,
X: cn.ndarray,
g: cn.ndarray,
h: cn.ndarray,
) -> "BaseModel":
pass

@abstractmethod
def update(
self,
X: cn.ndarray,
g: cn.ndarray,
h: cn.ndarray,
) -> "BaseModel":
pass

@abstractmethod
def predict(self, X: cn.ndarray) -> cn.ndarray:
pass

@abstractmethod
def __str__(self) -> str:
pass

@abstractmethod
def __eq__(self, other: object) -> bool:
pass
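
For illustration, a user-defined model only needs to implement these abstract methods. Below is a hedged sketch of a hypothetical constant-prediction model (not part of this commit); the fitted value is a single Newton step, analogous to the bias term in the `Linear` model introduced alongside this class.

```python
# Hypothetical example, shown only to illustrate the BaseModel interface.
import cunumeric as cn

from legateboost.models import BaseModel


class Constant(BaseModel):
    def fit(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Constant":
        # Single Newton step per output: minimises sum_i (g_i * c + 0.5 * h_i * c^2).
        self.value = -g.sum(axis=0) / h.sum(axis=0)
        return self

    def update(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Constant":
        return self.fit(X, g, h)

    def predict(self, X: cn.ndarray) -> cn.ndarray:
        # Broadcast the fitted constant to one prediction per row of X.
        return cn.tile(self.value, (X.shape[0], 1))

    def __str__(self) -> str:
        return "Constant: " + str(self.value) + "\n"

    def __eq__(self, other: object) -> bool:
        return isinstance(other, Constant) and bool((other.value == self.value).all())
```

Such a model could then be passed to an estimator via `base_models=(Constant(),)` like any other base learner.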
44 changes: 44 additions & 0 deletions legateboost/models/linear.py
@@ -0,0 +1,44 @@
import cunumeric as cn

from .base_model import BaseModel


class Linear(BaseModel):
def fit(
self,
X: cn.ndarray,
g: cn.ndarray,
h: cn.ndarray,
) -> "Linear":

num_outputs = g.shape[1]
self.bias = -g.sum(axis=0) / h.sum(axis=0)
g = g + self.bias[cn.newaxis, :] * h
self.betas = cn.zeros((X.shape[1], num_outputs))
for k in range(num_outputs):
W = cn.sqrt(h[:, k])
Xw = X * W[:, cn.newaxis]
yw = W * (-g[:, k] / h[:, k])
self.betas[:, k] = cn.linalg.lstsq(Xw, yw)[0]
return self

def clear(self) -> None:
self.bias.fill(0)
self.betas.fill(0)

def update(
self,
X: cn.ndarray,
g: cn.ndarray,
h: cn.ndarray,
) -> "Linear":
return self.fit(X, g, h)

def predict(self, X: cn.ndarray) -> cn.ndarray:
return self.bias + X.dot(self.betas)

def __str__(self) -> str:
return "Bias: " + str(self.bias) + "\nCoefficients: " + str(self.betas) + "\n"

def __eq__(self, other: object) -> bool:
return (other.betas == self.betas).all()
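
As a rough usage sketch, the model can be fitted directly on gradient/hessian pairs. The data and gradients below are illustrative, hand-constructed for a squared-error objective (gradient proportional to prediction − y, constant hessian); in normal use the estimators build these gradients internally, so this only demonstrates the `fit`/`predict` contract.

```python
# Minimal sketch: fit Linear on hand-constructed gradients and hessians.
import cunumeric as cn
import legateboost as lb

X = cn.linspace(0, 1, 100)[:, cn.newaxis]
y = 3.0 * X[:, 0] + 0.5

pred = cn.zeros(X.shape[0])          # current model prediction (all zeros)
g = (pred - y)[:, cn.newaxis]        # first-order gradients, shape (n, 1)
h = cn.ones_like(g)                  # second-order gradients, shape (n, 1)

linear = lb.models.Linear().fit(X, g, h)
print(linear)                        # prints the fitted bias and coefficients
```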
