[c++] Fix dump_model() information for root node #6569

Open
wants to merge 42 commits into master from neNasko1:fix-root-values
Changes from 16 commits

Commits (42)
12102cc - Fix value calculation in root node (Jul 24, 2024)
c933399 - Fix dask tests (Jul 26, 2024)
c240016 - Merge branch 'master' into fix-root-values (neNasko1, Jul 26, 2024)
2f1de57 - Create proper tests (Jul 29, 2024)
273a1df - Merge branch 'master' into fix-root-values (neNasko1, Jul 29, 2024)
208df85 - Test only on cpu (Jul 29, 2024)
130879b - Merge branch 'fix-root-values' of github.com:neNasko1/LightGBM into f… (Jul 29, 2024)
48e6b96 - Disable new tests for CUDA (Jul 30, 2024)
26b9859 - Merge with #5964 (Aug 3, 2024)
88e3dec - Finish merging with dump_model unification (Aug 3, 2024)
e1274dc - Improve tests (Aug 3, 2024)
38ee92c - Add linear test for stump (Aug 4, 2024)
3b423de - Fix CUDA compilation (Aug 5, 2024)
c89e257 - Merge branch 'master' into fix-root-values (neNasko1, Aug 5, 2024)
3de14d9 - Merge branch 'master' into fix-root-values (neNasko1, Aug 6, 2024)
fc42c1c - Merge branch 'master' into fix-root-values (neNasko1, Aug 14, 2024)
3ffcac6 - Comments after code review (Aug 14, 2024)
d5a82c4 - Fix test (Aug 15, 2024)
be7675d - Reenable cuda testing (Aug 15, 2024)
f616e03 - Tests (Aug 15, 2024)
6c6bc33 - Merge branch 'microsoft:master' into fix-root-values (neNasko1, Aug 15, 2024)
c28a2cf - test cuda (Aug 15, 2024)
6113f90 - . (Aug 15, 2024)
94cf7f0 - Fix warning (Aug 15, 2024)
01aa952 - reenable tests (Aug 15, 2024)
fadaa83 - . (Aug 15, 2024)
b9c681b - Merge branch 'fix-cuda' into fix-root-values (Aug 15, 2024)
a323acb - fix cuda (Aug 15, 2024)
0fd0c59 - Fix compilation error (Aug 15, 2024)
4cc5dd4 - Fix weight (Aug 15, 2024)
a743a87 - Fix numerical (Aug 15, 2024)
031c945 - Make tests more robust (Aug 16, 2024)
91993a9 - Merge branch 'master' into fix-root-values (neNasko1, Sep 2, 2024)
f744f64 - Merge branch 'master' into fix-root-values (neNasko1, Sep 5, 2024)
634b0fc - Fix test failing because of accuracy reasons (Sep 17, 2024)
3fe4577 - Fix test_dask::test_init_scores (Sep 21, 2024)
9e3e8ed - Decrease size of trees in test (Sep 21, 2024)
a01e737 - Merge branch 'master' of github.com:microsoft/LightGBM into fix-root-… (jameslamb, Oct 8, 2024)
e76d5bc - add a test on predictions from a model of all stumps (jameslamb, Oct 8, 2024)
0af4631 - Comments after code review (neNasko1, Oct 8, 2024)
04886c0 - Small text QOL (neNasko1, Oct 9, 2024)
15fc3bf - Add test_predict_stump on dask (neNasko1, Oct 9, 2024)
Files changed
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_tree.hpp
@@ -77,7 +77,7 @@ class CUDATree : public Tree {
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const override;

inline void AsConstantTree(double val) override;
inline void AsConstantTree(double val, int count) override;

const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }

5 changes: 3 additions & 2 deletions include/LightGBM/tree.h
@@ -228,13 +228,14 @@ class Tree {
shrinkage_ = 1.0f;
}

virtual inline void AsConstantTree(double val) {
virtual inline void AsConstantTree(double val, int count = 0) {
num_leaves_ = 1;
shrinkage_ = 1.0f;
leaf_value_[0] = val;
if (is_linear_) {
leaf_const_[0] = val;
}
leaf_count_[0] = count;
}

/*! \brief Serialize this object to string*/
@@ -563,7 +564,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
leaf_parent_[leaf] = new_node_idx;
leaf_parent_[num_leaves_] = new_node_idx;
// save current leaf value to internal node before change
internal_weight_[new_node_idx] = leaf_weight_[leaf];
internal_weight_[new_node_idx] = left_weight + right_weight;
internal_value_[new_node_idx] = leaf_value_[leaf];
internal_count_[new_node_idx] = left_cnt + right_cnt;
leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
@@ -3913,7 +3913,7 @@ def _get_split_feature(
return feature_name

def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
return set(tree.keys()) == {"leaf_value"}
return set(tree.keys()) == {"leaf_value", "leaf_count"}

# Create the node record, and populate universal data members
node: Dict[str, Union[int, str, None]] = OrderedDict()
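With this change, a single-node tree in the dump_model() output carries both keys, matching the stump example shown later in the review thread. A minimal illustration (values made up, not from the PR):

stump = {"leaf_value": 3.76, "leaf_count": 100}
# the helper now treats a node as a single-node tree only if it has exactly these keys
assert set(stump.keys()) == {"leaf_value", "leaf_count"}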
5 changes: 4 additions & 1 deletion src/boosting/gbdt.cpp
@@ -419,7 +419,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id);
}
}
new_tree->AsConstantTree(init_scores[cur_tree_id]);
new_tree->AsConstantTree(init_scores[cur_tree_id], num_data_);
} else {
// extend init_scores with zeros
new_tree->AsConstantTree(0, num_data_);
}
}
// add model
2 changes: 1 addition & 1 deletion src/boosting/rf.hpp
@@ -168,7 +168,7 @@ class RF : public GBDT {
output = init_scores_[cur_tree_id];
}
}
new_tree->AsConstantTree(output);
new_tree->AsConstantTree(output, num_data_);
MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
UpdateScore(new_tree.get(), cur_tree_id);
MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
5 changes: 3 additions & 2 deletions src/io/cuda/cuda_tree.cpp
@@ -330,9 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() {
CopyFromCUDADeviceToHost<double>(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__);
}

void CUDATree::AsConstantTree(double val) {
Tree::AsConstantTree(val);
void CUDATree::AsConstantTree(double val, int count) {
Tree::AsConstantTree(val, count);
CopyFromHostToCUDADevice<double>(cuda_leaf_value_, &val, 1, __FILE__, __LINE__);
CopyFromHostToCUDADevice<int>(cuda_leaf_count_, &count, 1, __FILE__, __LINE__);
}

} // namespace LightGBM
22 changes: 13 additions & 9 deletions src/io/tree.cpp
@@ -416,12 +416,16 @@ std::string Tree::ToJSON() const {
str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
if (num_leaves_ == 1) {
str_buf << "\"tree_structure\":{";
if (is_linear_) {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n";
str_buf << LinearModelToJSON(0) << "}" << "\n";
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n';
str_buf << LinearModelToJSON(0);
} else {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
str_buf << "\"leaf_count\":" << leaf_count_[0];
}
str_buf << "}" << '\n';
} else {
str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
}
@@ -731,6 +735,12 @@ Tree::Tree(const char* str, size_t* used_len) {
is_linear_ = false;
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

#ifdef USE_CUDA
is_cuda_tree_ = false;
#endif // USE_CUDA
@@ -793,12 +803,6 @@ Tree::Tree(const char* str, size_t* used_len) {
leaf_weight_.resize(num_leaves_);
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

if (key_vals.count("decision_type")) {
decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
} else {
6 changes: 6 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
@@ -201,6 +201,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
auto tree_ptr = tree.get();
constraints_->ShareTreePointer(tree_ptr);

// set the root value by hand, as it is not handled by splits
tree->SetLeafOutput(0, FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
config_->lambda_l1, config_->lambda_l2, config_->max_delta_step,
BasicConstraint(), config_->path_smooth, static_cast<data_size_t>(num_data_), 0));

// root leaf
int left_leaf = 0;
int cur_depth = 1;
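The SetLeafOutput call shown above computes the standard regularized leaf output for the root from the summed gradients and hessians. A simplified Python sketch of that computation, ignoring the max_delta_step clipping, monotone constraints, and path smoothing that the real CalculateSplittedLeafOutput also handles:

def root_leaf_output(sum_gradients, sum_hessians, lambda_l1, lambda_l2):
    # soft-threshold the gradient sum by the L1 penalty
    if sum_gradients > lambda_l1:
        numerator = sum_gradients - lambda_l1
    elif sum_gradients < -lambda_l1:
        numerator = sum_gradients + lambda_l1
    else:
        return 0.0
    # Newton step: -G / (H + lambda_l2)
    return -numerator / (sum_hessians + lambda_l2)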
3 changes: 1 addition & 2 deletions tests/python_package_test/test_dask.py
@@ -1464,8 +1464,7 @@ def test_init_score(task, output, cluster):
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
assert model.booster_.trees_to_dataframe()["value"][0] == 0
assert model.fitted_
@jameslamb (Collaborator), Aug 14, 2024:
Can you help me understand why this test had to be changed, when in gbdt.cpp I see the following?

// extend init_scores with zeros
        new_tree->AsConstantTree(0, num_data_);

And if this is no longer the case... can you please help us come up with a better way to change this test? This particular test is called test_init_score; its specific goal is to check that when you provide init_score to the .fit() method of the Dask estimators, that init_score actually makes it all the way down to the Booster and affects the resulting model.

Changing this condition to just "model fitting did not raise an error" is not sufficiently strict... that test would pass if DaskLGBMRegressor.fit() simply ignored init_score, for example.

@neNasko1 (Contributor Author):
Reverted!

Nice catch, seems like it got fixed in the meantime.

@neNasko1 (Contributor Author), Aug 14, 2024:
I may have mis-tested. I made the test more comprehensive; however, it no longer executes on LGBMRanker. That, however, is out of scope for this PR.

Collaborator:
it now does not execute on LGBMRanker. However this is out-of-scope for this PR

If the changes in your PR require tests that were previously passing to now be skipped, the need to skip those tests is not "out-of-scope".

Can you please explain here what issues you saw with LGBMRanker and this PR's changes?

@neNasko1 (Contributor Author):
The way init_score was tested seems flawed, because, as we can see from this PR, the root value was always 0.

Rewriting the test as "check that the two models are different" seems to have more merit, since with an init_score you would expect the models to differ even with a small number of trees.

However, that does not seem to be the case for LGBMRanker: with or without an init_score, the resulting models are exactly the same. Could you tell me whether I am missing something, or whether this is the expected behaviour?

Collaborator:
Based on all of this... I feel pretty good about these changes! I think this can be merged once the other suggestions I've left have been addressed.

One other note relevant to this thread... I just pushed a new unit test to this PR: e76d5bc

That test checks that the predictions from a model of all stumps take on the correct value depending on whether or not init_score is provided.

Right now if you train for 5 iterations but LightGBM cannot find any splits, you won't get 5 trees... you'll get 1 stump.

import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2)
bst = lgb.train(
    train_set=lgb.Dataset(X, label=y),
    params={
        "objective": "regression",
        "min_data_in_leaf": X.shape[0]
    },
    num_boost_round=5
)
bst.dump_model()["tree_info"]
[{'tree_index': 0, 'num_leaves': 1, 'num_cat': 0, 'shrinkage': 1, 'tree_structure': {'leaf_value': 3.7640261858701707, 'leaf_count': 100}}]

That unit test will protect us against predictions being wrong if that ever changes and if, for example, that situation produces 5 stumps. I'm thinking about this in-progress PR, for example: #5699.

cc @jmoralez @StrikerRUS (just for awareness)

Collaborator:
@neNasko1 I've copied your comment from #6569 (comment) here ... let's please keep this in a thread, so there's only one place to link to and so it takes up less visual space scrolling through the already-very-long activity history for this PR.


I said:

When there is an init_score, then boost_from_average does not occur.

And you asked:

Is this related to boost_from_average?
* When there is an init_score, then boost_from_average does not occur.
* After training the first trees (iteration=1), we then add the average to them.
...however after this fix there is a need for a change in this test.

Yes, boost_from_average is a part of this. That parameter is set to true by default

https://github.com/microsoft/LightGBM/blob/06432300c0c01268c8a80c3537eef81dd5ede30d/include/LightGBM/config.h#L946-947

And yes it's correct that when you provide an init_score, then boosting from the average doesn't occur, and the first tree's root node has an internal value of 0.0.

Which test do you mean by "this test", and what specific changes do you think are needed?
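A minimal sketch illustrating the behaviour described above (not from the PR discussion; it mirrors the offset idea used in the updated test_dump_model() and assumes this PR's fix is applied):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1_000, n_features=2, random_state=42)
y = y + 100  # shift the target so the boosted-from average is clearly non-zero

def root_value(init_score=None):
    ds = lgb.Dataset(X, label=y, init_score=init_score)
    bst = lgb.train({"objective": "regression", "verbose": -1}, ds, num_boost_round=1)
    # internal value of the first tree's root node
    return bst.dump_model()["tree_info"][0]["tree_structure"]["internal_value"]

print(root_value())                           # roughly 100: the average is folded into the first tree
print(root_value(np.full_like(y, y.mean())))  # roughly 0: boost_from_average is skipped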

@neNasko1 (Contributor Author):
...however after this fix there is a need for a change in this test.

I am referring to the changes to test_dask::test_init_scores. As we saw previously, the test was properly checking whether an init_score was supplied and propagated. After this fix, however, that is no longer the case, so a different way to test test_dask::test_init_scores is needed.

Based on all of this... I feel pretty good about these changes! I think this can be merged once the other suggestions I've left have been addressed.

I have addressed the changes in one commit, since there were a lot of them and I wanted to make sure they work properly together.

Collaborator:
Ohhhh I see. Yes I think you're right, that's a great point. Here's my proposal for making that test stricter:

  1. Add the following to params in test_dask.py::test_init_scores to eliminate other sources of non-determinism, so we can be more sure the difference is only due to init_score:
     `{"seed": 708, "num_thread": 1, "deterministic": True, "force_row_wise": True}`
  2. Add a Dask equivalent of the new test_predict_stump() test I'd pushed: https://github.com/neNasko1/LightGBM/blob/04886c0d6d2da2e3d145916556ef34d012b43ec9/tests/python_package_test/test_engine.py#L3815-L3840

What do you think about that? And please let me know if you're not very familiar with Dask and want me to push a test like that.
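A rough sketch of what such a Dask test could look like, assuming the usual cluster fixture from test_dask.py (illustrative only; this is not the test_predict_stump test eventually added in 15fc3bf):

import dask.array as da
import numpy as np
import lightgbm as lgb
from distributed import Client


def test_predict_stump_dask(cluster):
    with Client(cluster) as client:
        X = da.random.random((1_000, 2), chunks=(500, 2))
        y = da.random.random(1_000, chunks=500)
        # force a stump: no split can satisfy min_data_in_leaf
        model = lgb.DaskLGBMRegressor(client=client, n_estimators=5, min_child_samples=1_000)
        model.fit(X, y)
        # with a single stump, every prediction should equal the root leaf value
        leaf_value = model.booster_.dump_model()["tree_info"][0]["tree_structure"]["leaf_value"]
        preds = model.predict(X).compute()
        np.testing.assert_allclose(preds, leaf_value)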

@neNasko1 (Contributor Author):
I am not that familiar with Dask, but is this what you have in mind?

P.S. I am running the Dask tests on my M3 Mac by removing these lines. Maybe it will be possible to enable test_dask for Apple silicon in a future PR?



62 changes: 56 additions & 6 deletions tests/python_package_test/test_engine.py
@@ -24,6 +24,7 @@

from .utils import (
SERIALIZERS,
assert_all_trees_valid,
dummy_obj,
load_breast_cancer,
load_digits,
@@ -3853,21 +3854,70 @@ def test_reset_params_works_with_metric_num_class_and_boosting():
assert new_bst.params == expected_params


def test_dump_model():
@pytest.mark.parametrize("linear_tree", [False, True])
def test_dump_model_stump(linear_tree):
X, y = load_breast_cancer(return_X_y=True)
train_data = lgb.Dataset(X, label=y)
params = {"objective": "binary", "verbose": -1}
# intentionally create a stump (tree with only a root-node)
# using restricted # samples
subidx = random.sample(range(len(y)), 30)

train_data = lgb.Dataset(X[subidx], label=y[subidx])
params = {
"objective": "binary",
"verbose": -1,
"n_jobs": 1,
"linear_tree": linear_tree,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
tree_structure = dumped_model["tree_info"][0]["tree_structure"]
assert len(dumped_model["tree_info"]) == 1
assert "leaf_value" in tree_structure
assert tree_structure["leaf_count"] == 30


def test_dump_model():
offset = 100
X, y = make_synthetic_regression()
train_data = lgb.Dataset(X, label=y + offset)

params = {
"objective": "regression",
"verbose": -1,
"boost_from_average": True,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model = bst.dump_model(5, 0)
dumped_model_str = str(dumped_model)
assert "leaf_features" not in dumped_model_str
assert "leaf_coeff" not in dumped_model_str
assert "leaf_const" not in dumped_model_str
assert "leaf_value" in dumped_model_str
assert "leaf_count" in dumped_model_str
params["linear_tree"] = True

# CUDA does not return correct values for the root
if getenv("TASK", "") == "cuda":
return

for tree in dumped_model["tree_info"]:
assert not np.all(tree["tree_structure"]["internal_value"] == 0)

np.testing.assert_allclose(dumped_model["tree_info"][0]["tree_structure"]["internal_value"], offset, atol=1)
assert_all_trees_valid(dumped_model)


def test_dump_model_linear():
X, y = load_breast_cancer(return_X_y=True)
params = {
"objective": "binary",
"verbose": -1,
"linear_tree": True,
}
train_data = lgb.Dataset(X, label=y)
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
assert_all_trees_valid(dumped_model)
dumped_model_str = str(dumped_model)
assert "leaf_features" in dumped_model_str
assert "leaf_coeff" in dumped_model_str
assert "leaf_const" in dumped_model_str
35 changes: 35 additions & 0 deletions tests/python_package_test/utils.py
@@ -206,3 +206,38 @@ def np_assert_array_equal(*args, **kwargs):
if not _numpy_testing_supports_strict_kwarg:
kwargs.pop("strict")
np.testing.assert_array_equal(*args, **kwargs)


def assert_subtree_valid(root):
"""Recursively checks the validity of a subtree rooted at `root`.

Currently it only checks whether weights and counts are consistent between
all parent nodes and their children.

Parameters
----------
root : dict
A dictionary representing the root of the subtree.
It should be produced by dump_model()

Returns
-------
tuple
A tuple containing the weight and count of the subtree rooted at `root`.
"""
if "leaf_count" in root:
return (root["leaf_weight"], root["leaf_count"])

left_child = root["left_child"]
right_child = root["right_child"]
(l_w, l_c) = assert_subtree_valid(left_child)
(r_w, r_c) = assert_subtree_valid(right_child)
assert np.allclose(root["internal_weight"], l_w + r_w)
assert np.allclose(root["internal_count"], l_c + r_c)
return (root["internal_weight"], root["internal_count"])


def assert_all_trees_valid(model_dump):
for idx, tree in enumerate(model_dump["tree_info"]):
assert tree["tree_index"] == idx
assert_subtree_valid(tree["tree_structure"])
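As a usage sketch (not part of the PR), the helper is intended to be called on the dictionary returned by Booster.dump_model(), with the functions defined above in scope:

import lightgbm as lgb
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=4, random_state=0)
bst = lgb.train({"objective": "regression", "verbose": -1}, lgb.Dataset(X, label=y), num_boost_round=3)
# every internal node's weight/count must match the sum over its children
assert_all_trees_valid(bst.dump_model())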