[c++] Fix dump_model() information for root node #6569

Open
wants to merge 42 commits into master from neNasko1:fix-root-values
Changes from 16 commits

Commits (42)
12102cc - Fix value calculation in root node (Jul 24, 2024)
c933399 - Fix dask tests (Jul 26, 2024)
c240016 - Merge branch 'master' into fix-root-values (neNasko1, Jul 26, 2024)
2f1de57 - Create proper tests (Jul 29, 2024)
273a1df - Merge branch 'master' into fix-root-values (neNasko1, Jul 29, 2024)
208df85 - Test only on cpu (Jul 29, 2024)
130879b - Merge branch 'fix-root-values' of github.com:neNasko1/LightGBM into f… (Jul 29, 2024)
48e6b96 - Disable new tests for CUDA (Jul 30, 2024)
26b9859 - Merge with #5964 (Aug 3, 2024)
88e3dec - Finish merging with dump_model unification (Aug 3, 2024)
e1274dc - Improve tests (Aug 3, 2024)
38ee92c - Add linear test for stump (Aug 4, 2024)
3b423de - Fix CUDA compilation (Aug 5, 2024)
c89e257 - Merge branch 'master' into fix-root-values (neNasko1, Aug 5, 2024)
3de14d9 - Merge branch 'master' into fix-root-values (neNasko1, Aug 6, 2024)
fc42c1c - Merge branch 'master' into fix-root-values (neNasko1, Aug 14, 2024)
3ffcac6 - Comments after code review (Aug 14, 2024)
d5a82c4 - Fix test (Aug 15, 2024)
be7675d - Reenable cuda testing (Aug 15, 2024)
f616e03 - Tests (Aug 15, 2024)
6c6bc33 - Merge branch 'microsoft:master' into fix-root-values (neNasko1, Aug 15, 2024)
c28a2cf - test cuda (Aug 15, 2024)
6113f90 - . (Aug 15, 2024)
94cf7f0 - Fix warning (Aug 15, 2024)
01aa952 - reenable tests (Aug 15, 2024)
fadaa83 - . (Aug 15, 2024)
b9c681b - Merge branch 'fix-cuda' into fix-root-values (Aug 15, 2024)
a323acb - fix cuda (Aug 15, 2024)
0fd0c59 - Fix compilation error (Aug 15, 2024)
4cc5dd4 - Fix weight (Aug 15, 2024)
a743a87 - Fix numerical (Aug 15, 2024)
031c945 - Make tests more robust (Aug 16, 2024)
91993a9 - Merge branch 'master' into fix-root-values (neNasko1, Sep 2, 2024)
f744f64 - Merge branch 'master' into fix-root-values (neNasko1, Sep 5, 2024)
634b0fc - Fix test failing because of accuracy reasons (Sep 17, 2024)
3fe4577 - Fix test_dask::test_init_scores (Sep 21, 2024)
9e3e8ed - Decrease size of trees in test (Sep 21, 2024)
a01e737 - Merge branch 'master' of github.com:microsoft/LightGBM into fix-root-… (jameslamb, Oct 8, 2024)
e76d5bc - add a test on predictions from a model of all stumps (jameslamb, Oct 8, 2024)
0af4631 - Comments after code review (neNasko1, Oct 8, 2024)
04886c0 - Small text QOL (neNasko1, Oct 9, 2024)
15fc3bf - Add test_predict_stump on dask (neNasko1, Oct 9, 2024)
Files changed
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_tree.hpp
@@ -77,7 +77,7 @@ class CUDATree : public Tree {
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const override;

inline void AsConstantTree(double val) override;
inline void AsConstantTree(double val, int count) override;

const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }

5 changes: 3 additions & 2 deletions include/LightGBM/tree.h
@@ -228,13 +228,14 @@ class Tree {
shrinkage_ = 1.0f;
}

virtual inline void AsConstantTree(double val) {
virtual inline void AsConstantTree(double val, int count = 0) {
num_leaves_ = 1;
shrinkage_ = 1.0f;
leaf_value_[0] = val;
if (is_linear_) {
leaf_const_[0] = val;
}
leaf_count_[0] = count;
}

/*! \brief Serialize this object to string*/
@@ -563,7 +564,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
leaf_parent_[leaf] = new_node_idx;
leaf_parent_[num_leaves_] = new_node_idx;
// save current leaf value to internal node before change
internal_weight_[new_node_idx] = leaf_weight_[leaf];
internal_weight_[new_node_idx] = left_weight + right_weight;
internal_value_[new_node_idx] = leaf_value_[leaf];
internal_count_[new_node_idx] = left_cnt + right_cnt;
leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
@@ -3913,7 +3913,7 @@ def _get_split_feature(
return feature_name

def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
return set(tree.keys()) == {"leaf_value"}
return set(tree.keys()) == {"leaf_value", "leaf_count"}

# Create the node record, and populate universal data members
node: Dict[str, Union[int, str, None]] = OrderedDict()
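With this change, a single-node tree in the dump_model() output carries both keys, matching the stump example shown later in the review thread. A minimal illustration (values made up, not from the PR):

stump = {"leaf_value": 3.76, "leaf_count": 100}
# the helper now treats a node as a single-node tree only if it has exactly these keys
assert set(stump.keys()) == {"leaf_value", "leaf_count"}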
5 changes: 4 additions & 1 deletion src/boosting/gbdt.cpp
@@ -419,7 +419,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id);
}
}
new_tree->AsConstantTree(init_scores[cur_tree_id]);
new_tree->AsConstantTree(init_scores[cur_tree_id], num_data_);
} else {
// extend init_scores with zeros
new_tree->AsConstantTree(0, num_data_);
}
}
// add model
2 changes: 1 addition & 1 deletion src/boosting/rf.hpp
@@ -168,7 +168,7 @@ class RF : public GBDT {
output = init_scores_[cur_tree_id];
}
}
new_tree->AsConstantTree(output);
new_tree->AsConstantTree(output, num_data_);
MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
UpdateScore(new_tree.get(), cur_tree_id);
MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
5 changes: 3 additions & 2 deletions src/io/cuda/cuda_tree.cpp
@@ -330,9 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() {
CopyFromCUDADeviceToHost<double>(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__);
}

void CUDATree::AsConstantTree(double val) {
Tree::AsConstantTree(val);
void CUDATree::AsConstantTree(double val, int count) {
Tree::AsConstantTree(val, count);
CopyFromHostToCUDADevice<double>(cuda_leaf_value_, &val, 1, __FILE__, __LINE__);
CopyFromHostToCUDADevice<int>(cuda_leaf_count_, &count, 1, __FILE__, __LINE__);
}

} // namespace LightGBM
22 changes: 13 additions & 9 deletions src/io/tree.cpp
@@ -416,12 +416,16 @@ std::string Tree::ToJSON() const {
str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
if (num_leaves_ == 1) {
str_buf << "\"tree_structure\":{";
if (is_linear_) {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n";
str_buf << LinearModelToJSON(0) << "}" << "\n";
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n';
str_buf << LinearModelToJSON(0);
} else {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
str_buf << "\"leaf_count\":" << leaf_count_[0];
}
str_buf << "}" << '\n';
} else {
str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
}
@@ -731,6 +735,12 @@ Tree::Tree(const char* str, size_t* used_len) {
is_linear_ = false;
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

#ifdef USE_CUDA
is_cuda_tree_ = false;
#endif // USE_CUDA
@@ -793,12 +803,6 @@ Tree::Tree(const char* str, size_t* used_len) {
leaf_weight_.resize(num_leaves_);
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

if (key_vals.count("decision_type")) {
decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
} else {
6 changes: 6 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
@@ -201,6 +201,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
auto tree_ptr = tree.get();
constraints_->ShareTreePointer(tree_ptr);

// set the root value by hand, as it is not handled by splits
tree->SetLeafOutput(0, FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
config_->lambda_l1, config_->lambda_l2, config_->max_delta_step,
BasicConstraint(), config_->path_smooth, static_cast<data_size_t>(num_data_), 0));

// root leaf
int left_leaf = 0;
int cur_depth = 1;
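The SetLeafOutput call shown above computes the standard regularized leaf output for the root from the summed gradients and hessians. A simplified Python sketch of that computation, ignoring the max_delta_step clipping, monotone constraints, and path smoothing that the real CalculateSplittedLeafOutput also handles:

def root_leaf_output(sum_gradients, sum_hessians, lambda_l1, lambda_l2):
    # soft-threshold the gradient sum by the L1 penalty
    if sum_gradients > lambda_l1:
        numerator = sum_gradients - lambda_l1
    elif sum_gradients < -lambda_l1:
        numerator = sum_gradients + lambda_l1
    else:
        return 0.0
    # Newton step: -G / (H + lambda_l2)
    return -numerator / (sum_hessians + lambda_l2)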
3 changes: 1 addition & 2 deletions tests/python_package_test/test_dask.py
@@ -1464,8 +1464,7 @@ def test_init_score(task, output, cluster):
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
assert model.booster_.trees_to_dataframe()["value"][0] == 0
assert model.fitted_
@jameslamb (Collaborator), Aug 14, 2024:
Can you help me understand why this test had to be changed, when in gbdt.cpp I see the following?

// extend init_scores with zeros
        new_tree->AsConstantTree(0, num_data_);

And if this is no longer the case... can you please help us come up with a better way to change this test? This particular test is called test_init_score; its specific goal is to check that when you provide init_score to the .fit() method of the Dask estimators, that init_score actually makes it all the way down to the Booster and affects the resulting model.

Changing this condition to just "model fitting did not raise an error" is not sufficiently strict... that test would pass if DaskLGBMRegressor.fit() simply ignored init_score, for example.

@neNasko1 (Contributor Author):
Reverted!

Nice catch, seems like it got fixed in the meantime.

@neNasko1 (Contributor Author), Aug 14, 2024:
I may have mis-tested. I made the test more comprehensive; however, it no longer executes on LGBMRanker. That, however, is out of scope for this PR.

Collaborator:
it now does not execute on LGBMRanker. However this is out-of-scope for this PR

If the changes in your PR require tests that were previously passing to now be skipped, the need to skip those tests is not "out-of-scope".

Can you please explain here what issues you saw with LGBMRanker and this PR's changes?

@neNasko1 (Contributor Author):
The way init_score was tested seems flawed, because, as we can see from this PR, the root value was always 0.

Rewriting the test as "check that the two models are different" seems to have more merit, since with an init_score you would expect the models to differ even with a small number of trees.

However, that does not seem to be the case for LGBMRanker: with or without an init_score, the resulting models are exactly the same. Could you tell me whether I am missing something, or whether this is the expected behaviour?

Collaborator:
Based on all of this... I feel pretty good about these changes! I think this can be merged once the other suggestions I've left have been addressed.

One other note relevant to this thread... I just pushed a new unit test to this PR: e76d5bc

That test checks that the predictions from a model of all stumps take on the correct value depending on whether or not init_score is provided.

Right now if you train for 5 iterations but LightGBM cannot find any splits, you won't get 5 trees... you'll get 1 stump.

import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2)
bst = lgb.train(
    train_set=lgb.Dataset(X, label=y),
    params={
        "objective": "regression",
        "min_data_in_leaf": X.shape[0]
    },
    num_boost_round=5
)
bst.dump_model()["tree_info"]
[{'tree_index': 0, 'num_leaves': 1, 'num_cat': 0, 'shrinkage': 1, 'tree_structure': {'leaf_value': 3.7640261858701707, 'leaf_count': 100}}]

That unit test will protect us against predictions being wrong if that ever changes and if, for example, that situation produces 5 stumps. I'm thinking about this in-progress PR, for example: #5699.

cc @jmoralez @StrikerRUS (just for awareness)

Collaborator:
@neNasko1 I've copied your comment from #6569 (comment) here ... let's please keep this in a thread, so there's only one place to link to and so it takes up less visual space scrolling through the already-very-long activity history for this PR.


I said:

When there is an init_score, then boost_from_average does not occur.

And you asked:

Is this related to boost_from_average?
* When there is an init_score, then boost_from_average does not occur.
* After training the first trees (iteration=1), we then add the average to them.
...however after this fix there is a need for a change in this test.

Yes, boost_from_average is a part of this. That parameter is set to true by default

https://github.com/microsoft/LightGBM/blob/06432300c0c01268c8a80c3537eef81dd5ede30d/include/LightGBM/config.h#L946-947

And yes it's correct that when you provide an init_score, then boosting from the average doesn't occur, and the first tree's root node has an internal value of 0.0.

Which test do you mean by "this test", and what specific changes do you think are needed?
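A minimal sketch illustrating the behaviour described above (not from the PR discussion; it mirrors the offset idea used in the updated test_dump_model() and assumes this PR's fix is applied):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1_000, n_features=2, random_state=42)
y = y + 100  # shift the target so the boosted-from average is clearly non-zero

def root_value(init_score=None):
    ds = lgb.Dataset(X, label=y, init_score=init_score)
    bst = lgb.train({"objective": "regression", "verbose": -1}, ds, num_boost_round=1)
    # internal value of the first tree's root node
    return bst.dump_model()["tree_info"][0]["tree_structure"]["internal_value"]

print(root_value())                           # roughly 100: the average is folded into the first tree
print(root_value(np.full_like(y, y.mean())))  # roughly 0: boost_from_average is skipped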

@neNasko1 (Contributor Author):
...however after this fix there is a need for a change in this test.

I am referring to the changes to test_dask::test_init_scores. As we saw previously, the test was properly checking whether an init_score was supplied and propagated. After this fix, however, that is no longer the case, so a different way to test test_dask::test_init_scores is needed.

Based on all of this... I feel pretty good about these changes! I think this can be merged once the other suggestions I've left have been addressed.

I have addressed the changes in one commit, since there were a lot of them and I wanted to make sure they work properly together.

Collaborator:
Ohhhh I see. Yes I think you're right, that's a great point. Here's my proposal for making that test stricter:

  1. Add the following to params in test_dask.py::test_init_scores to eliminate other sources of non-determinism, so we can be more sure the difference is only due to init_score:
     `{"seed": 708, "num_thread": 1, "deterministic": True, "force_row_wise": True}`
  2. Add a Dask equivalent of the new test_predict_stump() test I'd pushed: https://github.com/neNasko1/LightGBM/blob/04886c0d6d2da2e3d145916556ef34d012b43ec9/tests/python_package_test/test_engine.py#L3815-L3840

What do you think about that? And please let me know if you're not very familiar with Dask and want me to push a test like that.
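A rough sketch of what such a Dask test could look like, assuming the usual cluster fixture from test_dask.py (illustrative only; this is not the test_predict_stump test eventually added in 15fc3bf):

import dask.array as da
import numpy as np
import lightgbm as lgb
from distributed import Client


def test_predict_stump_dask(cluster):
    with Client(cluster) as client:
        X = da.random.random((1_000, 2), chunks=(500, 2))
        y = da.random.random(1_000, chunks=500)
        # force a stump: no split can satisfy min_data_in_leaf
        model = lgb.DaskLGBMRegressor(client=client, n_estimators=5, min_child_samples=1_000)
        model.fit(X, y)
        # with a single stump, every prediction should equal the root leaf value
        leaf_value = model.booster_.dump_model()["tree_info"][0]["tree_structure"]["leaf_value"]
        preds = model.predict(X).compute()
        np.testing.assert_allclose(preds, leaf_value)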

@neNasko1 (Contributor Author):
I am not that familiar with Dask, but is this what you have in mind?

P.S. I am running the Dask tests on my M3 Mac by removing these lines. Maybe it will be possible to enable test_dask for Apple silicon in a future PR?



62 changes: 56 additions & 6 deletions tests/python_package_test/test_engine.py
@@ -24,6 +24,7 @@

from .utils import (
SERIALIZERS,
assert_all_trees_valid,
dummy_obj,
load_breast_cancer,
load_digits,
@@ -3853,21 +3854,70 @@ def test_reset_params_works_with_metric_num_class_and_boosting():
assert new_bst.params == expected_params


def test_dump_model():
@pytest.mark.parametrize("linear_tree", [False, True])
def test_dump_model_stump(linear_tree):
X, y = load_breast_cancer(return_X_y=True)
train_data = lgb.Dataset(X, label=y)
params = {"objective": "binary", "verbose": -1}
# intentionally create a stump (tree with only a root-node)
# using restricted # samples
subidx = random.sample(range(len(y)), 30)

train_data = lgb.Dataset(X[subidx], label=y[subidx])
params = {
"objective": "binary",
"verbose": -1,
"n_jobs": 1,
"linear_tree": linear_tree,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
tree_structure = dumped_model["tree_info"][0]["tree_structure"]
assert len(dumped_model["tree_info"]) == 1
assert "leaf_value" in tree_structure
assert tree_structure["leaf_count"] == 30


def test_dump_model():
offset = 100
X, y = make_synthetic_regression()
train_data = lgb.Dataset(X, label=y + offset)

params = {
"objective": "regression",
"verbose": -1,
"boost_from_average": True,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model = bst.dump_model(5, 0)
dumped_model_str = str(dumped_model)
assert "leaf_features" not in dumped_model_str
assert "leaf_coeff" not in dumped_model_str
assert "leaf_const" not in dumped_model_str
assert "leaf_value" in dumped_model_str
assert "leaf_count" in dumped_model_str
params["linear_tree"] = True

# CUDA does not return correct values for the root
if getenv("TASK", "") == "cuda":
return

for tree in dumped_model["tree_info"]:
assert not np.all(tree["tree_structure"]["internal_value"] == 0)

np.testing.assert_allclose(dumped_model["tree_info"][0]["tree_structure"]["internal_value"], offset, atol=1)
assert_all_trees_valid(dumped_model)


def test_dump_model_linear():
X, y = load_breast_cancer(return_X_y=True)
params = {
"objective": "binary",
"verbose": -1,
"linear_tree": True,
}
train_data = lgb.Dataset(X, label=y)
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
assert_all_trees_valid(dumped_model)
dumped_model_str = str(dumped_model)
assert "leaf_features" in dumped_model_str
assert "leaf_coeff" in dumped_model_str
assert "leaf_const" in dumped_model_str
35 changes: 35 additions & 0 deletions tests/python_package_test/utils.py
@@ -206,3 +206,38 @@ def np_assert_array_equal(*args, **kwargs):
if not _numpy_testing_supports_strict_kwarg:
kwargs.pop("strict")
np.testing.assert_array_equal(*args, **kwargs)


def assert_subtree_valid(root):
"""Recursively checks the validity of a subtree rooted at `root`.

Currently it only checks whether weights and counts are consistent between
all parent nodes and their children.

Parameters
----------
root : dict
A dictionary representing the root of the subtree.
It should be produced by dump_model()

Returns
-------
tuple
A tuple containing the weight and count of the subtree rooted at `root`.
"""
if "leaf_count" in root:
return (root["leaf_weight"], root["leaf_count"])

left_child = root["left_child"]
right_child = root["right_child"]
(l_w, l_c) = assert_subtree_valid(left_child)
(r_w, r_c) = assert_subtree_valid(right_child)
assert np.allclose(root["internal_weight"], l_w + r_w)
assert np.allclose(root["internal_count"], l_c + r_c)
return (root["internal_weight"], root["internal_count"])


def assert_all_trees_valid(model_dump):
for idx, tree in enumerate(model_dump["tree_info"]):
assert tree["tree_index"] == idx
assert_subtree_valid(tree["tree_structure"])
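As a usage sketch (not part of the PR), the helper is intended to be called on the dictionary returned by Booster.dump_model(), with the functions defined above in scope:

import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=0)
bst = lgb.train({"objective": "regression", "verbose": -1}, lgb.Dataset(X, label=y), num_boost_round=3)
# every internal node's weight/count must match the sum over its children
assert_all_trees_valid(bst.dump_model())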