From 12102cc22096f44fff79f472b2051cc4a74f8f9b Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Wed, 24 Jul 2024 17:46:07 +0300 Subject: [PATCH 01/31] Fix value calculation in root node --- include/LightGBM/tree.h | 2 +- src/boosting/gbdt.cpp | 3 +++ src/treelearner/serial_tree_learner.cpp | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 0c4a41f46a87..8718a0577e19 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -563,7 +563,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature, leaf_parent_[leaf] = new_node_idx; leaf_parent_[num_leaves_] = new_node_idx; // save current leaf value to internal node before change - internal_weight_[new_node_idx] = leaf_weight_[leaf]; + internal_weight_[new_node_idx] = left_weight + right_weight; internal_value_[new_node_idx] = leaf_value_[leaf]; internal_count_[new_node_idx] = left_cnt + right_cnt; leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 937b44fcc8aa..3a83323b372d 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -420,6 +420,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } } new_tree->AsConstantTree(init_scores[cur_tree_id]); + } else { + // extend init_scores with zeros + new_tree->AsConstantTree(0); } } // add model diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index f3a88bd18679..14ede072dc9e 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -201,6 +201,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians auto tree_ptr = tree.get(); constraints_->ShareTreePointer(tree_ptr); + // set the root value by hand, as it is not handled by splits + tree->SetLeafOutput(0, FeatureHistogram::CalculateSplittedLeafOutput( + smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(), + config_->lambda_l1, config_->lambda_l2, config_->max_delta_step, + BasicConstraint(), config_->path_smooth, static_cast(num_data_), 0)); + // root leaf int left_leaf = 0; int cur_depth = 1; From c933399c07537c2d3b1721dc155a332192f2a2c3 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Fri, 26 Jul 2024 17:53:32 +0300 Subject: [PATCH 02/31] Fix dask tests --- tests/python_package_test/test_dask.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 37d6db2541f5..7dcc21ac4d6a 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1464,8 +1464,7 @@ def test_init_score(task, output, cluster): init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) - # value of the root node is 0 when init_score is set - assert model.booster_.trees_to_dataframe()["value"][0] == 0 + assert model.fitted_ def sklearn_checks_to_run(): From 2f1de57e341d601ee690871993a810e35fdcc259 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Mon, 29 Jul 2024 14:47:35 +0300 Subject: [PATCH 03/31] Create proper tests --- tests/python_package_test/test_engine.py | 17 ++++++++++-- tests/python_package_test/utils.py | 35 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9ff56206ca70..73793e48e6ac 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -24,6 +24,7 @@ from .utils import ( SERIALIZERS, + assert_all_trees_valid, dummy_obj, load_breast_cancer, load_digits, @@ -3857,16 +3858,26 @@ def test_dump_model(): train_data = lgb.Dataset(X, label=y) params = {"objective": "binary", "verbose": -1} bst = lgb.train(params, train_data, num_boost_round=5) - dumped_model_str = str(bst.dump_model(5, 0)) + dumped_model = bst.dump_model(5, 0) + assert_all_trees_valid(dumped_model) + dumped_model_str = str(dumped_model) assert "leaf_features" not in dumped_model_str assert "leaf_coeff" not in dumped_model_str assert "leaf_const" not in dumped_model_str assert "leaf_value" in dumped_model_str assert "leaf_count" in dumped_model_str - params["linear_tree"] = True + for tree in dumped_model["tree_info"]: + assert not np.allclose(tree["tree_structure"]["internal_value"], 0) + + +def test_dump_model_linear(): + X, y = load_breast_cancer(return_X_y=True) + params = {"objective": "binary", "verbose": -1, "linear_tree": True} train_data = lgb.Dataset(X, label=y) bst = lgb.train(params, train_data, num_boost_round=5) - dumped_model_str = str(bst.dump_model(5, 0)) + dumped_model = bst.dump_model(5, 0) + assert_all_trees_valid(dumped_model) + dumped_model_str = str(dumped_model) assert "leaf_features" in dumped_model_str assert "leaf_coeff" in dumped_model_str assert "leaf_const" in dumped_model_str diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index 66298b8190fa..60274dea4322 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -206,3 +206,38 @@ def np_assert_array_equal(*args, **kwargs): if not _numpy_testing_supports_strict_kwarg: kwargs.pop("strict") np.testing.assert_array_equal(*args, **kwargs) + + +def assert_subtree_valid(root): + """Recursively checks the validity of a subtree rooted at `root`. + + Currently it only checks whether weights and counts are consistent between + all parent nodes and their children. + + Parameters + ---------- + root : dict + A dictionary representing the root of the subtree. + It should be produced by dump_model() + + Returns + ------- + tuple + A tuple containing the weight and count of the subtree rooted at `root`. + """ + if "leaf_count" in root: + return (root["leaf_weight"], root["leaf_count"]) + + left_child = root["left_child"] + right_child = root["right_child"] + (l_w, l_c) = assert_subtree_valid(left_child) + (r_w, r_c) = assert_subtree_valid(right_child) + assert np.allclose(root["internal_weight"], l_w + r_w) + assert np.allclose(root["internal_count"], l_c + r_c) + return (root["internal_weight"], root["internal_count"]) + + +def assert_all_trees_valid(model_dump): + for idx, tree in enumerate(model_dump["tree_info"]): + assert tree["tree_index"] == idx + assert_subtree_valid(tree["tree_structure"]) From 208df85064e3f4625ebf3a0e660728301ec2d10a Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Tue, 30 Jul 2024 02:10:11 +0300 Subject: [PATCH 04/31] Test only on cpu --- tests/python_package_test/test_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 73793e48e6ac..056bd7cdc78c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3859,7 +3859,6 @@ def test_dump_model(): params = {"objective": "binary", "verbose": -1} bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) - assert_all_trees_valid(dumped_model) dumped_model_str = str(dumped_model) assert "leaf_features" not in dumped_model_str assert "leaf_coeff" not in dumped_model_str @@ -3869,6 +3868,10 @@ def test_dump_model(): for tree in dumped_model["tree_info"]: assert not np.allclose(tree["tree_structure"]["internal_value"], 0) + # Cuda seems to report innacurately + if getenv("TASK", "") != "cuda": + assert_all_trees_valid(dumped_model) + def test_dump_model_linear(): X, y = load_breast_cancer(return_X_y=True) From 48e6b968dfdc16c5c899e2feef533a642fd55f33 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Tue, 30 Jul 2024 12:21:35 +0300 Subject: [PATCH 05/31] Disable new tests for CUDA --- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 056bd7cdc78c..6c117a4ccfaa 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3865,11 +3865,11 @@ def test_dump_model(): assert "leaf_const" not in dumped_model_str assert "leaf_value" in dumped_model_str assert "leaf_count" in dumped_model_str - for tree in dumped_model["tree_info"]: - assert not np.allclose(tree["tree_structure"]["internal_value"], 0) - # Cuda seems to report innacurately + # CUDA does not return correct values for the root if getenv("TASK", "") != "cuda": + for tree in dumped_model["tree_info"]: + assert not np.allclose(tree["tree_structure"]["internal_value"], 0) assert_all_trees_valid(dumped_model) From 26b9859de75cabf6105d6da7f10d13a9360cf1cd Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sat, 3 Aug 2024 19:10:29 +0300 Subject: [PATCH 06/31] Merge with #5964 --- src/io/tree.cpp | 22 ++++++++++-------- tests/python_package_test/test_engine.py | 29 +++++++++++++++++++++++- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 4312b4f65002..c051c8060a33 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -416,12 +416,16 @@ std::string Tree::ToJSON() const { str_buf << "\"num_cat\":" << num_cat_ << "," << '\n'; str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n'; if (num_leaves_ == 1) { + str_buf << "\"tree_structure\":{"; if (is_linear_) { - str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n"; - str_buf << LinearModelToJSON(0) << "}" << "\n"; + str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n'; + str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n'; + str_buf << LinearModelToJSON(0); } else { - str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n'; + str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n'; + str_buf << "\"leaf_count\":" << leaf_count_[0]; } + str_buf << "}" << '\n'; } else { str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n'; } @@ -731,6 +735,12 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } + if (key_vals.count("leaf_count")) { + leaf_count_ = CommonC::StringToArrayFast(key_vals["leaf_count"], num_leaves_); + } else { + leaf_count_.resize(num_leaves_); + } + #ifdef USE_CUDA is_cuda_tree_ = false; #endif // USE_CUDA @@ -793,12 +803,6 @@ Tree::Tree(const char* str, size_t* used_len) { leaf_weight_.resize(num_leaves_); } - if (key_vals.count("leaf_count")) { - leaf_count_ = CommonC::StringToArrayFast(key_vals["leaf_count"], num_leaves_); - } else { - leaf_count_.resize(num_leaves_); - } - if (key_vals.count("decision_type")) { decision_type_ = CommonC::StringToArrayFast(key_vals["decision_type"], num_leaves_ - 1); } else { diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 6c117a4ccfaa..e2338a739e73 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3853,10 +3853,37 @@ def test_reset_params_works_with_metric_num_class_and_boosting(): assert new_bst.params == expected_params +def test_dump_model_stump(): + X, y = load_breast_cancer(return_X_y=True) + # intentionally create a stump (tree with only a root-node) + # using restricted # samples + subidx = random.sample(range(len(y)), 30) + X = X[subidx] + y = y[subidx] + + train_data = lgb.Dataset(X, label=y) + params = { + "objective": "binary", + "verbose": -1, + "n_jobs": 1, + } + bst = lgb.train(params, train_data, num_boost_round=5) + dumped_model = bst.dump_model(5, 0) + print(dumped_model) + assert len(dumped_model["tree_info"]) == 1 + tree_structure = dumped_model["tree_info"][0]["tree_structure"] + assert "leaf_value" in tree_structure + assert "leaf_count" in tree_structure + assert tree_structure["leaf_count"] == 30 + + def test_dump_model(): X, y = load_breast_cancer(return_X_y=True) train_data = lgb.Dataset(X, label=y) - params = {"objective": "binary", "verbose": -1} + params = { + "objective": "binary", + "verbose": -1, + } bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) dumped_model_str = str(dumped_model) From 88e3dec15a63c698458ce1a0e9c84a34f6dd784d Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sat, 3 Aug 2024 19:43:21 +0300 Subject: [PATCH 07/31] Finish merging with dump_model unification --- include/LightGBM/cuda/cuda_tree.hpp | 2 +- include/LightGBM/tree.h | 3 ++- python-package/lightgbm/basic.py | 2 +- src/boosting/gbdt.cpp | 4 ++-- src/boosting/rf.hpp | 2 +- src/io/cuda/cuda_tree.cpp | 5 +++-- tests/python_package_test/test_engine.py | 14 +++++++------- 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index e2836baa2be5..7ab06190481b 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -77,7 +77,7 @@ class CUDATree : public Tree { const data_size_t* used_data_indices, data_size_t num_data, double* score) const override; - inline void AsConstantTree(double val) override; + inline void AsConstantTree(double val, int count) override; const int* cuda_leaf_parent() const { return cuda_leaf_parent_; } diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 8718a0577e19..c28ddd140c48 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -228,13 +228,14 @@ class Tree { shrinkage_ = 1.0f; } - virtual inline void AsConstantTree(double val) { + virtual inline void AsConstantTree(double val, int count = 0) { num_leaves_ = 1; shrinkage_ = 1.0f; leaf_value_[0] = val; if (is_linear_) { leaf_const_[0] = val; } + leaf_count_[0] = count; } /*! \brief Serialize this object to string*/ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index af4d757f480b..90b01f25fa80 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -3913,7 +3913,7 @@ def _get_split_feature( return feature_name def _is_single_node_tree(tree: Dict[str, Any]) -> bool: - return set(tree.keys()) == {"leaf_value"} + return set(tree.keys()) == {"leaf_value", "leaf_count"} # Create the node record, and populate universal data members node: Dict[str, Union[int, str, None]] = OrderedDict() diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 3a83323b372d..f966a275ae8b 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -419,10 +419,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id); } } - new_tree->AsConstantTree(init_scores[cur_tree_id]); + new_tree->AsConstantTree(init_scores[cur_tree_id], num_data_); } else { // extend init_scores with zeros - new_tree->AsConstantTree(0); + new_tree->AsConstantTree(0, num_data_); } } // add model diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index e6101dc30a39..3eb065b30f2d 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -168,7 +168,7 @@ class RF : public GBDT { output = init_scores_[cur_tree_id]; } } - new_tree->AsConstantTree(output); + new_tree->AsConstantTree(output, num_data_); MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_)); UpdateScore(new_tree.get(), cur_tree_id); MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1)); diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 923e51961e0b..d7da93daeed1 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -330,9 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() { CopyFromCUDADeviceToHost(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__); } -void CUDATree::AsConstantTree(double val) { - Tree::AsConstantTree(val); +void CUDATree::AsConstantTree(double val, int count) { + Tree::AsConstantTree(val, int count); CopyFromHostToCUDADevice(cuda_leaf_value_, &val, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_leaf_count_, &count, 1, __FILE__, __LINE__); } } // namespace LightGBM diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index e2338a739e73..8b781705bab3 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3858,10 +3858,8 @@ def test_dump_model_stump(): # intentionally create a stump (tree with only a root-node) # using restricted # samples subidx = random.sample(range(len(y)), 30) - X = X[subidx] - y = y[subidx] - train_data = lgb.Dataset(X, label=y) + train_data = lgb.Dataset(X[subidx], label=y[subidx]) params = { "objective": "binary", "verbose": -1, @@ -3869,11 +3867,9 @@ def test_dump_model_stump(): } bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) - print(dumped_model) - assert len(dumped_model["tree_info"]) == 1 tree_structure = dumped_model["tree_info"][0]["tree_structure"] + assert len(dumped_model["tree_info"]) == 1 assert "leaf_value" in tree_structure - assert "leaf_count" in tree_structure assert tree_structure["leaf_count"] == 30 @@ -3902,7 +3898,11 @@ def test_dump_model(): def test_dump_model_linear(): X, y = load_breast_cancer(return_X_y=True) - params = {"objective": "binary", "verbose": -1, "linear_tree": True} + params = { + "objective": "binary", + "verbose": -1, + "linear_tree": True, + } train_data = lgb.Dataset(X, label=y) bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) From e1274dc167c2064c6ae888dd711b8e6a9707bc2f Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sat, 3 Aug 2024 20:28:45 +0300 Subject: [PATCH 08/31] Improve tests --- tests/python_package_test/test_engine.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 8b781705bab3..5fbf2e6386cb 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3874,11 +3874,14 @@ def test_dump_model_stump(): def test_dump_model(): - X, y = load_breast_cancer(return_X_y=True) - train_data = lgb.Dataset(X, label=y) + offset = 100 + X, y = make_synthetic_regression() + train_data = lgb.Dataset(X, label=y+offset) + params = { - "objective": "binary", + "objective": "regression", "verbose": -1, + "boost_from_average": True, } bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) @@ -3892,7 +3895,12 @@ def test_dump_model(): # CUDA does not return correct values for the root if getenv("TASK", "") != "cuda": for tree in dumped_model["tree_info"]: - assert not np.allclose(tree["tree_structure"]["internal_value"], 0) + assert not np.all(tree["tree_structure"]["internal_value"] == 0) + np.testing.assert_allclose( + dumped_model["tree_info"][0]["tree_structure"]["internal_value"], + offset, + atol=1 + ) assert_all_trees_valid(dumped_model) From 38ee92c2d9c3aafa5a8afb4943585ab0a26e136b Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sun, 4 Aug 2024 20:44:04 +0300 Subject: [PATCH 09/31] Add linear test for stump --- tests/python_package_test/test_engine.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 5fbf2e6386cb..8e41d29c1456 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3853,7 +3853,8 @@ def test_reset_params_works_with_metric_num_class_and_boosting(): assert new_bst.params == expected_params -def test_dump_model_stump(): +@pytest.mark.parametrize("linear_tree", [False, True]) +def test_dump_model_stump(linear_tree): X, y = load_breast_cancer(return_X_y=True) # intentionally create a stump (tree with only a root-node) # using restricted # samples @@ -3864,6 +3865,7 @@ def test_dump_model_stump(): "objective": "binary", "verbose": -1, "n_jobs": 1, + "linear_tree": linear_tree, } bst = lgb.train(params, train_data, num_boost_round=5) dumped_model = bst.dump_model(5, 0) @@ -3876,7 +3878,7 @@ def test_dump_model_stump(): def test_dump_model(): offset = 100 X, y = make_synthetic_regression() - train_data = lgb.Dataset(X, label=y+offset) + train_data = lgb.Dataset(X, label=y + offset) params = { "objective": "regression", @@ -3893,15 +3895,14 @@ def test_dump_model(): assert "leaf_count" in dumped_model_str # CUDA does not return correct values for the root - if getenv("TASK", "") != "cuda": - for tree in dumped_model["tree_info"]: - assert not np.all(tree["tree_structure"]["internal_value"] == 0) - np.testing.assert_allclose( - dumped_model["tree_info"][0]["tree_structure"]["internal_value"], - offset, - atol=1 - ) - assert_all_trees_valid(dumped_model) + if getenv("TASK", "") == "cuda": + return + + for tree in dumped_model["tree_info"]: + assert not np.all(tree["tree_structure"]["internal_value"] == 0) + + np.testing.assert_allclose(dumped_model["tree_info"][0]["tree_structure"]["internal_value"], offset, atol=1) + assert_all_trees_valid(dumped_model) def test_dump_model_linear(): From 3b423de1f6a559590fbe7363b8d49326de8c07ad Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Mon, 5 Aug 2024 18:32:51 +0300 Subject: [PATCH 10/31] Fix CUDA compilation --- src/io/cuda/cuda_tree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index d7da93daeed1..c5dee89ca3af 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -331,9 +331,9 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() { } void CUDATree::AsConstantTree(double val, int count) { - Tree::AsConstantTree(val, int count); + Tree::AsConstantTree(val, count); CopyFromHostToCUDADevice(cuda_leaf_value_, &val, 1, __FILE__, __LINE__); - CopyFromHostToCUDADevice(cuda_leaf_count_, &count, 1, __FILE__, __LINE__); + CopyFromHostToCUDADevice(cuda_leaf_count_, &count, 1, __FILE__, __LINE__); } } // namespace LightGBM From 3ffcac65bf148343306fe7a50312dff1c77e8bbc Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 01:27:06 +0300 Subject: [PATCH 11/31] Comments after code review --- src/io/tree.cpp | 3 +-- tests/python_package_test/test_dask.py | 3 ++- tests/python_package_test/test_engine.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/io/tree.cpp b/src/io/tree.cpp index c051c8060a33..975f09d209df 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -417,12 +417,11 @@ std::string Tree::ToJSON() const { str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n'; if (num_leaves_ == 1) { str_buf << "\"tree_structure\":{"; + str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n'; if (is_linear_) { - str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n'; str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n'; str_buf << LinearModelToJSON(0); } else { - str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n'; str_buf << "\"leaf_count\":" << leaf_count_[0]; } str_buf << "}" << '\n'; diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 7dcc21ac4d6a..37d6db2541f5 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1464,7 +1464,8 @@ def test_init_score(task, output, cluster): init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) - assert model.fitted_ + # value of the root node is 0 when init_score is set + assert model.booster_.trees_to_dataframe()["value"][0] == 0 def sklearn_checks_to_run(): diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c490583b6032..afc2ddc535e1 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3865,7 +3865,6 @@ def test_dump_model_stump(linear_tree): params = { "objective": "binary", "verbose": -1, - "n_jobs": 1, "linear_tree": linear_tree, } bst = lgb.train(params, train_data, num_boost_round=5) From d5a82c49182a490283376d9abaa3e654e5eaacd8 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 03:27:14 +0300 Subject: [PATCH 12/31] Fix test --- tests/python_package_test/test_dask.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 37d6db2541f5..a0c3ba850b2b 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1444,8 +1444,8 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task @pytest.mark.parametrize("task", tasks) @pytest.mark.parametrize("output", data_output) def test_init_score(task, output, cluster): - if task == "ranking" and output == "scipy_csr_matrix": - pytest.skip("LGBMRanker is not currently tested on sparse matrices") + if task == "ranking": + pytest.skip("LGBMRanker is not currently tested") with Client(cluster) as client: _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None) @@ -1462,10 +1462,18 @@ def test_init_score(task, output, cluster): init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size)) else: init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) + model = model_factory(client=client, **params) - model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) - # value of the root node is 0 when init_score is set - assert model.booster_.trees_to_dataframe()["value"][0] == 0 + model.fit(dX, dy, sample_weight=dw, group=dg) + + model_init_score = model_factory(client=client, **params) + model_init_score.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) + + # check if init score changes root value + assert ( + model.booster_.trees_to_dataframe()["value"][0] + != model_init_score.booster_.trees_to_dataframe()["value"][0] + ) def sklearn_checks_to_run(): From be7675d7b63b65438e1dd2fcd9d362b560e73ede Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 03:40:25 +0300 Subject: [PATCH 13/31] Reenable cuda testing --- tests/python_package_test/test_engine.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index afc2ddc535e1..b29f9abfa961 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3894,10 +3894,6 @@ def test_dump_model(): assert "leaf_value" in dumped_model_str assert "leaf_count" in dumped_model_str - # CUDA does not return correct values for the root - if getenv("TASK", "") == "cuda": - return - for tree in dumped_model["tree_info"]: assert not np.all(tree["tree_structure"]["internal_value"] == 0) From f616e0384f52c099a273b9831f2668da0bf2b732 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 03:41:05 +0300 Subject: [PATCH 14/31] Tests --- tests/python_package_test/test_dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index a0c3ba850b2b..3af65fc945f1 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1445,7 +1445,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task @pytest.mark.parametrize("output", data_output) def test_init_score(task, output, cluster): if task == "ranking": - pytest.skip("LGBMRanker is not currently tested") + pytest.skip("LGBMRanker is not currently tested for init_score") with Client(cluster) as client: _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None) From c28a2cfde81c7654ea9f99dd709beac68c726708 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 13:12:05 +0300 Subject: [PATCH 15/31] test cuda --- src/treelearner/cuda/cuda_leaf_splits.cpp | 7 +++++-- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 ++-- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 8 ++++++++ src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp | 1 + 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 57b5b777c142..803d4674ee48 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -38,12 +38,14 @@ void CUDALeafSplits::InitValues( const double lambda_l1, const double lambda_l2, const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, - const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { + const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, + double* root_sum_gradients, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; cuda_sum_of_gradients_buffer_.SetValue(0); cuda_sum_of_hessians_buffer_.SetValue(0); LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); + CopyFromCUDADeviceToHost(root_sum_gradients, cuda_sum_of_gradients_buffer_.RawData(), 1, __FILE__, __LINE__); CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__); } @@ -53,11 +55,12 @@ void CUDALeafSplits::InitValues( const int16_t* cuda_gradients_and_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, - hist_t* cuda_hist_in_leaf, double* root_sum_hessians, + hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians, const score_t* grad_scale, const score_t* hess_scale) { cuda_gradients_ = reinterpret_cast(cuda_gradients_and_hessians); cuda_hessians_ = nullptr; LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf, grad_scale, hess_scale); + CopyFromCUDADeviceToHost(root_sum_gradients, cuda_sum_of_gradients_buffer_.RawData(), 1, __FILE__, __LINE__); CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 33a9ea578a1f..c2635346098b 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -44,14 +44,14 @@ class CUDALeafSplits { const score_t* cuda_gradients, const score_t* cuda_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, - hist_t* cuda_hist_in_leaf, double* root_sum_hessians); + hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians); void InitValues( const double lambda_l1, const double lambda_l2, const int16_t* cuda_gradients_and_hessians, const data_size_t* cuda_bagging_data_indices, const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, - hist_t* cuda_hist_in_leaf, double* root_sum_hessians, + hist_t* cuda_hist_in_leaf, double* root_sum_gradients, double* root_sum_hessians, const score_t* grad_scale, const score_t* hess_scale); void InitValues(); diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 8f8ff15f0715..755fb1486245 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -66,6 +66,7 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ leaf_best_split_default_left_.resize(config_->num_leaves, 0); leaf_num_data_.resize(config_->num_leaves, 0); leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_gradients_.resize(config_->num_leaves, 0.0f); leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); if (!boosting_on_cuda_) { @@ -122,6 +123,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() { cuda_data_partition_->cuda_data_indices(), root_num_data, cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_gradients_[0], &leaf_sum_hessians_[0], cuda_gradient_discretizer_->grad_scale_ptr(), cuda_gradient_discretizer_->hess_scale_ptr()); @@ -137,6 +139,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() { cuda_data_partition_->cuda_data_indices(), root_num_data, cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_gradients_[0], &leaf_sum_hessians_[0]); } leaf_num_data_[0] = root_num_data; @@ -162,6 +165,11 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, const bool track_branch_features = !(config_->interaction_constraints_vector.empty()); std::unique_ptr tree(new CUDATree(config_->num_leaves, track_branch_features, config_->linear_tree, config_->gpu_device_id, has_categorical_feature_)); + // set the root value by hand, as it is not handled by splits + tree->SetLeafOutput(0, CUDALeafSplits::CalculateSplittedLeafOutput( + leaf_sum_gradients_[smaller_leaf_index_], leaf_sum_hessians_[smaller_leaf_index_], + config_->lambda_l1, config_->lambda_l2, config_->path_smooth, + static_cast(num_data_), 0)); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index a1ea79efa1a1..6121ae5d22a5 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -103,6 +103,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { std::vector leaf_best_split_default_left_; std::vector leaf_num_data_; std::vector leaf_data_start_; + std::vector leaf_sum_gradients_; std::vector leaf_sum_hessians_; int smaller_leaf_index_; int larger_leaf_index_; From 6113f9095120f7697d9be55a91156063927c52b6 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 13:21:52 +0300 Subject: [PATCH 16/31] . --- .../cuda/cuda_single_gpu_tree_learner.cpp | 15 +++++++-------- .../cuda/cuda_single_gpu_tree_learner.hpp | 3 +-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 755fb1486245..3a62d7ec26e5 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -321,10 +321,10 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, &leaf_data_start_[right_leaf_index], &leaf_sum_hessians_[best_leaf_index_], &leaf_sum_hessians_[right_leaf_index], - &sum_left_gradients, - &sum_right_gradients); + &leaf_sum_gradients_[best_leaf_index_], + &leaf_sum_gradients_[right_leaf_index]); #ifdef DEBUG - CheckSplitValid(best_leaf_index_, right_leaf_index, sum_left_gradients, sum_right_gradients); + CheckSplitValid(best_leaf_index_, right_leaf_index); #endif // DEBUG smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? right_leaf_index : best_leaf_index_); @@ -382,6 +382,7 @@ void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) { leaf_best_split_default_left_.resize(config_->num_leaves, 0); leaf_num_data_.resize(config_->num_leaves, 0); leaf_data_start_.resize(config_->num_leaves, 0); + leaf_sum_gradients_.resize(config_->num_leaves, 0.0f); leaf_sum_hessians_.resize(config_->num_leaves, 0.0f); } cuda_histogram_constructor_->ResetConfig(config); @@ -570,9 +571,7 @@ void CUDASingleGPUTreeLearner::SelectFeatureByNode(const Tree* tree) { #ifdef DEBUG void CUDASingleGPUTreeLearner::CheckSplitValid( const int left_leaf, - const int right_leaf, - const double split_sum_left_gradients, - const double split_sum_right_gradients) { + const int right_leaf) { std::vector left_data_indices(leaf_num_data_[left_leaf]); std::vector right_data_indices(leaf_num_data_[right_leaf]); CopyFromCUDADeviceToHost(left_data_indices.data(), @@ -593,9 +592,9 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( sum_right_gradients += host_gradients_[index]; sum_right_hessians += host_hessians_[index]; } - CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_left_gradients - leaf_sum_gradients_[left_leaf]), 1e-6f); CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f); - CHECK_LE(std::fabs(sum_right_gradients - split_sum_right_gradients), 1e-6f); + CHECK_LE(std::fabs(sum_right_gradients - leaf_sum_gradients_[right_leaf]), 1e-6f); CHECK_LE(std::fabs(sum_right_hessians - leaf_sum_hessians_[right_leaf]), 1e-6f); } #endif // DEBUG diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 6121ae5d22a5..b50d3a8884ca 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -71,8 +71,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { #ifdef DEBUG void CheckSplitValid( - const int left_leaf, const int right_leaf, - const double sum_left_gradients, const double sum_right_gradients); + const int left_leaf, const int right_leaf); #endif // DEBUG void RenewDiscretizedTreeLeaves(CUDATree* cuda_tree); From 94cf7f040cabbdd51284b4b176d41407cd296834 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 13:22:26 +0300 Subject: [PATCH 17/31] Fix warning --- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 3a62d7ec26e5..55040caf454d 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -301,8 +301,6 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, best_split_info); } - double sum_left_gradients = 0.0f; - double sum_right_gradients = 0.0f; cuda_data_partition_->Split(best_split_info, best_leaf_index_, right_leaf_index, From 01aa952391c1ed4a8bbd9a907506646027abc682 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 13:32:17 +0300 Subject: [PATCH 18/31] reenable tests --- tests/python_package_test/test_engine.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c490583b6032..92541d3a3f47 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3895,10 +3895,6 @@ def test_dump_model(): assert "leaf_value" in dumped_model_str assert "leaf_count" in dumped_model_str - # CUDA does not return correct values for the root - if getenv("TASK", "") == "cuda": - return - for tree in dumped_model["tree_info"]: assert not np.all(tree["tree_structure"]["internal_value"] == 0) From fadaa83bf20f20e59e33b254952845810f9e47e7 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 13:35:02 +0300 Subject: [PATCH 19/31] . --- tests/python_package_test/test_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 92541d3a3f47..b29f9abfa961 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3865,7 +3865,6 @@ def test_dump_model_stump(linear_tree): params = { "objective": "binary", "verbose": -1, - "n_jobs": 1, "linear_tree": linear_tree, } bst = lgb.train(params, train_data, num_boost_round=5) From a323acb76d1fe2c3ea4853f67340d614ef0c3bae Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 14:00:46 +0300 Subject: [PATCH 20/31] fix cuda --- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 55040caf454d..59131befef32 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -170,6 +170,7 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, leaf_sum_gradients_[smaller_leaf_index_], leaf_sum_hessians_[smaller_leaf_index_], config_->lambda_l1, config_->lambda_l2, config_->path_smooth, static_cast(num_data_), 0)); + cuda_tree->SyncLeafOutputFromHostToCUDA(); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; From 0fd0c59308a2f83736a308553d80df4c002a0c7b Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 14:06:39 +0300 Subject: [PATCH 21/31] Fix compilation error --- src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 59131befef32..952ef52f8023 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -170,7 +170,7 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, leaf_sum_gradients_[smaller_leaf_index_], leaf_sum_hessians_[smaller_leaf_index_], config_->lambda_l1, config_->lambda_l2, config_->path_smooth, static_cast(num_data_), 0)); - cuda_tree->SyncLeafOutputFromHostToCUDA(); + tree->SyncLeafOutputFromHostToCUDA(); for (int i = 0; i < config_->num_leaves - 1; ++i) { global_timer.Start("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); const data_size_t num_data_in_smaller_leaf = leaf_num_data_[smaller_leaf_index_]; From 4cc5dd45a31d74ad832a285bb8a60611a340b41a Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 14:27:30 +0300 Subject: [PATCH 22/31] Fix weight --- src/io/cuda/cuda_tree.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 62020c3a09ae..09a3164dbf52 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -210,7 +210,7 @@ __global__ void SplitCategoricalKernel( // split information split_gain[new_node_index] = static_cast(cuda_split_info->gain); } else if (thread_index == 4) { // save current leaf value to internal node before change - internal_weight[new_node_index] = leaf_weight[leaf_index]; + internal_weight[new_node_index] = cuda_split_info->left_sum_hessians + cuda_split_info->right_sum_hessians; leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; } else if (thread_index == 5) { internal_value[new_node_index] = leaf_value[leaf_index]; From a743a87739b2cf04040f36344483d71e32e02b50 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 15 Aug 2024 15:18:06 +0300 Subject: [PATCH 23/31] Fix numerical --- src/io/cuda/cuda_tree.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 09a3164dbf52..87abfc1353b4 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -94,7 +94,7 @@ __global__ void SplitKernel( // split information split_gain[new_node_index] = static_cast(cuda_split_info->gain); } else if (thread_index == 4) { // save current leaf value to internal node before change - internal_weight[new_node_index] = leaf_weight[leaf_index]; + internal_weight[new_node_index] = cuda_split_info->left_sum_hessians + cuda_split_info->right_sum_hessians; leaf_weight[leaf_index] = cuda_split_info->left_sum_hessians; } else if (thread_index == 5) { internal_value[new_node_index] = leaf_value[leaf_index]; From 031c945ba3e065357488effb4d7454d99f7bad46 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Fri, 16 Aug 2024 13:20:28 +0300 Subject: [PATCH 24/31] Make tests more robust --- tests/python_package_test/test_dask.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 3af65fc945f1..6e1389b0ecd5 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1465,15 +1465,14 @@ def test_init_score(task, output, cluster): model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, group=dg) + pred = model.predict(dX, raw_score=True) model_init_score = model_factory(client=client, **params) model_init_score.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) + pred_init_score = model_init_score.predict(dX, raw_score=True) - # check if init score changes root value - assert ( - model.booster_.trees_to_dataframe()["value"][0] - != model_init_score.booster_.trees_to_dataframe()["value"][0] - ) + # check if init score changes predictions + assert not np.allclose(pred, pred_init_score) def sklearn_checks_to_run(): From 634b0fcffba2d828391c0d54c5f15c788b7243a5 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Tue, 17 Sep 2024 16:42:57 +0300 Subject: [PATCH 25/31] Fix test failing because of accuracy reasons --- tests/python_package_test/test_dask.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index dd7b1817448b..b47d31f16507 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -2,7 +2,6 @@ """Tests for lightgbm.dask module""" import inspect -import random import socket from itertools import groupby from os import getenv @@ -1444,16 +1443,16 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task @pytest.mark.parametrize("task", tasks) @pytest.mark.parametrize("output", data_output) def test_init_score(task, output, cluster): - if task == "ranking": - pytest.skip("LGBMRanker is not currently tested for init_score") + if task == "ranking" and output == "scipy_csr_matrix": + pytest.skip("LGBMRanker is not currently tested on sparse matrices") with Client(cluster) as client: _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None) model_factory = task_to_dask_factory[task] - params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5} - init_score = random.random() + params = {"n_estimators": 30, "num_leaves": 15, "time_out": 5} + init_score = 1 size_factor = 1 if task == "multiclass-classification": size_factor = 3 # number of classes @@ -1472,7 +1471,8 @@ def test_init_score(task, output, cluster): pred_init_score = model_init_score.predict(dX, raw_score=True) # check if init score changes predictions - assert not np.allclose(pred, pred_init_score) + with pytest.raises(AssertionError): + assert_eq(pred, pred_init_score, atol=0.0001) def sklearn_checks_to_run(): From 3fe4577da970fdd6a055721a724ee9aa22dafee1 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sat, 21 Sep 2024 19:01:07 +0300 Subject: [PATCH 26/31] Fix test_dask::test_init_scores --- tests/python_package_test/test_dask.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index b47d31f16507..15f9bed859b7 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1452,15 +1452,14 @@ def test_init_score(task, output, cluster): model_factory = task_to_dask_factory[task] params = {"n_estimators": 30, "num_leaves": 15, "time_out": 5} - init_score = 1 - size_factor = 1 - if task == "multiclass-classification": - size_factor = 3 # number of classes + num_classes = 3 if task == "multiclass-classification" else 1 + + rnd = np.random.RandomState(42) if output.startswith("dataframe"): - init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size)) + init_scores = dy.map_partitions(lambda x: pd.DataFrame(rnd.uniform(size=(x.size, num_classes)))) else: - init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) + init_scores = dy.map_blocks(lambda x: rnd.uniform(size=(x.size, num_classes))) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, group=dg) @@ -1472,7 +1471,7 @@ def test_init_score(task, output, cluster): # check if init score changes predictions with pytest.raises(AssertionError): - assert_eq(pred, pred_init_score, atol=0.0001) + assert_eq(pred, pred_init_score) def sklearn_checks_to_run(): From 9e3e8eda11cc900f73c8ad29563ebb49d806b737 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Sat, 21 Sep 2024 19:22:40 +0300 Subject: [PATCH 27/31] Decrease size of trees in test --- tests/python_package_test/test_dask.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 15f9bed859b7..1929d4ba53c5 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1450,12 +1450,11 @@ def test_init_score(task, output, cluster): _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None) model_factory = task_to_dask_factory[task] + rnd = np.random.RandomState(42) - params = {"n_estimators": 30, "num_leaves": 15, "time_out": 5} + params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5} num_classes = 3 if task == "multiclass-classification" else 1 - rnd = np.random.RandomState(42) - if output.startswith("dataframe"): init_scores = dy.map_partitions(lambda x: pd.DataFrame(rnd.uniform(size=(x.size, num_classes)))) else: From e76d5bc62f1d9010f4322193b3f8be567b75fa2a Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 7 Oct 2024 23:16:00 -0500 Subject: [PATCH 28/31] add a test on predictions from a model of all stumps --- tests/python_package_test/test_engine.py | 28 ++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index a2573899d499..4e36072131fc 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3812,6 +3812,34 @@ def inner_test(X, y, params, early_stopping_rounds): inner_test(X, y, params, early_stopping_rounds=None) +@pytest.mark.parametrize("use_init_score", [False, True]) +def test_predict_stump(rng, use_init_score): + X, y = load_breast_cancer(return_X_y=True) + dataset_kwargs = {"data": X, "label": y} + if use_init_score: + dataset_kwargs.update({"init_score": rng.uniform(size=y.shape)}) + bst = lgb.train( + train_set=lgb.Dataset(**dataset_kwargs), + params={"objective": "binary", "min_data_in_leaf": X.shape[0]}, + num_boost_round=5, + ) + # checking prediction from 1 iteration and the whole model, to prevent bugs + # of the form "a model of n stumps predicts n * initial_score" + preds_1 = bst.predict(X, raw_score=True, num_iteration=1) + preds_all = bst.predict(X, raw_score=True) + if use_init_score: + # if init_score was provided, a model of stumps should predict all 0s + all_zeroes = np.full_like(preds_1, fill_value=0.0) + np.testing.assert_allclose(preds_1, all_zeroes) + np.testing.assert_allclose(preds_all, all_zeroes) + else: + # if init_score was not provided, prediction for a model of stumps should be + # the "average" of the labels + y_avg = np.log(y.mean() / (1.0 - y.mean())) + np.testing.assert_allclose(preds_1, np.full_like(preds_1, fill_value=y_avg)) + np.testing.assert_allclose(preds_all, np.full_like(preds_all, fill_value=y_avg)) + + def test_average_precision_metric(): # test against sklearn average precision metric X, y = load_breast_cancer(return_X_y=True) From 0af46318eb3a9c45089582c8c80cd96fa1678c74 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Tue, 8 Oct 2024 20:18:25 +0300 Subject: [PATCH 29/31] Comments after code review --- tests/python_package_test/test_dask.py | 11 +++++---- tests/python_package_test/test_engine.py | 29 ++++++++++-------------- tests/python_package_test/utils.py | 10 +++++--- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 1929d4ba53c5..dccc4065ba55 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1442,7 +1442,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task @pytest.mark.parametrize("task", tasks) @pytest.mark.parametrize("output", data_output) -def test_init_score(task, output, cluster): +def test_init_score(task, output, cluster, rng): if task == "ranking" and output == "scipy_csr_matrix": pytest.skip("LGBMRanker is not currently tested on sparse matrices") @@ -1450,15 +1450,16 @@ def test_init_score(task, output, cluster): _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None) model_factory = task_to_dask_factory[task] - rnd = np.random.RandomState(42) params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5} - num_classes = 3 if task == "multiclass-classification" else 1 + num_classes = 1 + if task == "multiclass-classification": + num_classes = 3 if output.startswith("dataframe"): - init_scores = dy.map_partitions(lambda x: pd.DataFrame(rnd.uniform(size=(x.size, num_classes)))) + init_scores = dy.map_partitions(lambda x: pd.DataFrame(rng.uniform(size=(x.size, num_classes)))) else: - init_scores = dy.map_blocks(lambda x: rnd.uniform(size=(x.size, num_classes))) + init_scores = dy.map_blocks(lambda x: rng.uniform(size=(x.size, num_classes))) model = model_factory(client=client, **params) model.fit(dX, dy, sample_weight=dw, group=dg) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4e36072131fc..286f066a3526 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3887,28 +3887,21 @@ def test_reset_params_works_with_metric_num_class_and_boosting(): @pytest.mark.parametrize("linear_tree", [False, True]) def test_dump_model_stump(linear_tree): X, y = load_breast_cancer(return_X_y=True) - # intentionally create a stump (tree with only a root-node) - # using restricted # samples - subidx = random.sample(range(len(y)), 30) - train_data = lgb.Dataset(X[subidx], label=y[subidx]) - params = { - "objective": "binary", - "verbose": -1, - "linear_tree": linear_tree, - } + train_data = lgb.Dataset(X, label=y) + params = {"objective": "binary", "verbose": -1, "linear_tree": linear_tree, "min_data_in_leaf": len(y)} bst = lgb.train(params, train_data, num_boost_round=5) - dumped_model = bst.dump_model(5, 0) + dumped_model = bst.dump_model(num_iteration=5, start_iteration=0) tree_structure = dumped_model["tree_info"][0]["tree_structure"] assert len(dumped_model["tree_info"]) == 1 assert "leaf_value" in tree_structure - assert tree_structure["leaf_count"] == 30 + assert tree_structure["leaf_count"] == len(y) def test_dump_model(): - offset = 100 + initial_score_offset = 57.5 X, y = make_synthetic_regression() - train_data = lgb.Dataset(X, label=y + offset) + train_data = lgb.Dataset(X, label=y + initial_score_offset) params = { "objective": "regression", @@ -3916,7 +3909,7 @@ def test_dump_model(): "boost_from_average": True, } bst = lgb.train(params, train_data, num_boost_round=5) - dumped_model = bst.dump_model(5, 0) + dumped_model = bst.dump_model(num_iteration=5, start_iteration=0) dumped_model_str = str(dumped_model) assert "leaf_features" not in dumped_model_str assert "leaf_coeff" not in dumped_model_str @@ -3925,9 +3918,11 @@ def test_dump_model(): assert "leaf_count" in dumped_model_str for tree in dumped_model["tree_info"]: - assert not np.all(tree["tree_structure"]["internal_value"] == 0) + assert tree["tree_structure"]["internal_value"] != 0 - np.testing.assert_allclose(dumped_model["tree_info"][0]["tree_structure"]["internal_value"], offset, atol=1) + assert dumped_model["tree_info"][0]["tree_structure"]["internal_value"] == pytest.approx( + initial_score_offset, abs=1 + ) assert_all_trees_valid(dumped_model) @@ -3940,7 +3935,7 @@ def test_dump_model_linear(): } train_data = lgb.Dataset(X, label=y) bst = lgb.train(params, train_data, num_boost_round=5) - dumped_model = bst.dump_model(5, 0) + dumped_model = bst.dump_model(num_iteration=5, start_iteration=0) assert_all_trees_valid(dumped_model) dumped_model_str = str(dumped_model) assert "leaf_features" in dumped_model_str diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index c8ef7a9fa139..d5e233c579de 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -251,12 +251,16 @@ def assert_subtree_valid(root): right_child = root["right_child"] (l_w, l_c) = assert_subtree_valid(left_child) (r_w, r_c) = assert_subtree_valid(right_child) - assert np.allclose(root["internal_weight"], l_w + r_w) - assert np.allclose(root["internal_count"], l_c + r_c) + assert ( + abs(root["internal_weight"] - (l_w + r_w)) <= 1e-3 + ), "root node's internal weight should be exactly the sum of its child nodes' internal counts" + assert ( + root["internal_count"] == l_c + r_c + ), "root node's internal count should be exactly the sum of its child nodes' internal counts" return (root["internal_weight"], root["internal_count"]) def assert_all_trees_valid(model_dump): for idx, tree in enumerate(model_dump["tree_info"]): - assert tree["tree_index"] == idx + assert tree["tree_index"] == idx, f"tree {idx} should have tree_index={idx}. Full tree: {tree}" assert_subtree_valid(tree["tree_structure"]) From 04886c0d6d2da2e3d145916556ef34d012b43ec9 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Wed, 9 Oct 2024 23:52:13 +0300 Subject: [PATCH 30/31] Small text QOL --- tests/python_package_test/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py index d5e233c579de..85bf796b2cc4 100644 --- a/tests/python_package_test/utils.py +++ b/tests/python_package_test/utils.py @@ -253,7 +253,7 @@ def assert_subtree_valid(root): (r_w, r_c) = assert_subtree_valid(right_child) assert ( abs(root["internal_weight"] - (l_w + r_w)) <= 1e-3 - ), "root node's internal weight should be exactly the sum of its child nodes' internal counts" + ), "root node's internal weight should be approximately the sum of its child nodes' internal weights" assert ( root["internal_count"] == l_c + r_c ), "root node's internal count should be exactly the sum of its child nodes' internal counts" From 15fc3bfee90846a626560f4fc2acea991ca38f27 Mon Sep 17 00:00:00 2001 From: Atanas Dimitrov Date: Thu, 10 Oct 2024 01:32:55 +0300 Subject: [PATCH 31/31] Add test_predict_stump on dask --- tests/python_package_test/test_dask.py | 47 +++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index dccc4065ba55..ff1fb2e155ed 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1451,7 +1451,15 @@ def test_init_score(task, output, cluster, rng): model_factory = task_to_dask_factory[task] - params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5} + params = { + "n_estimators": 1, + "num_leaves": 2, + "time_out": 5, + "seed": 708, + "deterministic": True, + "force_row_wise": True, + "num_thread": 1, + } num_classes = 1 if task == "multiclass-classification": num_classes = 3 @@ -1533,6 +1541,43 @@ def test_predict_with_raw_score(task, output, cluster): assert_eq(raw_predictions, pred_proba_raw) +@pytest.mark.parametrize("output", data_output) +@pytest.mark.parametrize("use_init_score", [False, True]) +def test_predict_stump(output, use_init_score, cluster, rng): + with Client(cluster) as client: + task = "binary-classification" + n_samples = 1_000 + _, _, _, _, dX, dy, _, dg = _create_data(objective=task, n_samples=n_samples, output=output) + + model_factory = task_to_dask_factory[task] + + params = {"objective": "binary", "n_estimators": 5, "min_data_in_leaf": n_samples} + + if not use_init_score: + init_scores = None + elif output.startswith("dataframe"): + init_scores = dy.map_partitions(lambda x: pd.DataFrame(rng.uniform(size=x.size))) + else: + init_scores = dy.map_blocks(lambda x: rng.uniform(size=x.size)) + + model = model_factory(client=client, **params) + model.fit(dX, dy, group=dg, init_score=init_scores) + preds_1 = model.predict(dX, raw_score=True, num_iteration=1).compute() + preds_all = model.predict(dX, raw_score=True).compute() + + if use_init_score: + # if init_score was provided, a model of stumps should predict all 0s + all_zeroes = np.full_like(preds_1, fill_value=0.0) + assert_eq(preds_1, all_zeroes) + assert_eq(preds_all, all_zeroes) + else: + # if init_score was not provided, prediction for a model of stumps should be + # the "average" of the labels + y_avg = np.log(dy.mean() / (1.0 - dy.mean())) + assert_eq(preds_1, np.full_like(preds_1, fill_value=y_avg)) + assert_eq(preds_all, np.full_like(preds_all, fill_value=y_avg)) + + def test_distributed_quantized_training(tmp_path, cluster): with Client(cluster) as client: X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output="array")