Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Fix dump_model() information for root node #6569

Open
wants to merge 37 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
12102cc
Fix value calculation in root node
neNasko1 Jul 24, 2024
c933399
Fix dask tests
neNasko1 Jul 26, 2024
c240016
Merge branch 'master' into fix-root-values
neNasko1 Jul 26, 2024
2f1de57
Create proper tests
neNasko1 Jul 29, 2024
273a1df
Merge branch 'master' into fix-root-values
neNasko1 Jul 29, 2024
208df85
Test only on cpu
neNasko1 Jul 29, 2024
130879b
Merge branch 'fix-root-values' of github.com:neNasko1/LightGBM into f…
neNasko1 Jul 29, 2024
48e6b96
Disable new tests for CUDA
neNasko1 Jul 30, 2024
26b9859
Merge with #5964
neNasko1 Aug 3, 2024
88e3dec
Finish merging with dump_model unification
neNasko1 Aug 3, 2024
e1274dc
Improve tests
neNasko1 Aug 3, 2024
38ee92c
Add linear test for stump
neNasko1 Aug 4, 2024
3b423de
Fix CUDA compilation
neNasko1 Aug 5, 2024
c89e257
Merge branch 'master' into fix-root-values
neNasko1 Aug 5, 2024
3de14d9
Merge branch 'master' into fix-root-values
neNasko1 Aug 6, 2024
fc42c1c
Merge branch 'master' into fix-root-values
neNasko1 Aug 14, 2024
3ffcac6
Comments after code review
neNasko1 Aug 14, 2024
d5a82c4
Fix test
neNasko1 Aug 15, 2024
be7675d
Reenable cuda testing
neNasko1 Aug 15, 2024
f616e03
Tests
neNasko1 Aug 15, 2024
6c6bc33
Merge branch 'microsoft:master' into fix-root-values
neNasko1 Aug 15, 2024
c28a2cf
test cuda
neNasko1 Aug 15, 2024
6113f90
.
neNasko1 Aug 15, 2024
94cf7f0
Fix warning
neNasko1 Aug 15, 2024
01aa952
reenable tests
neNasko1 Aug 15, 2024
fadaa83
.
neNasko1 Aug 15, 2024
b9c681b
Merge branch 'fix-cuda' into fix-root-values
neNasko1 Aug 15, 2024
a323acb
fix cuda
neNasko1 Aug 15, 2024
0fd0c59
Fix compilation error
neNasko1 Aug 15, 2024
4cc5dd4
Fix weight
neNasko1 Aug 15, 2024
a743a87
Fix numerical
neNasko1 Aug 15, 2024
031c945
Make tests more robust
neNasko1 Aug 16, 2024
91993a9
Merge branch 'master' into fix-root-values
neNasko1 Sep 2, 2024
f744f64
Merge branch 'master' into fix-root-values
neNasko1 Sep 5, 2024
634b0fc
Fix test failing because of accuracy reasons
neNasko1 Sep 17, 2024
3fe4577
Fix test_dask::test_init_scores
neNasko1 Sep 21, 2024
9e3e8ed
Decrease size of trees in test
neNasko1 Sep 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_tree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class CUDATree : public Tree {
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const override;

inline void AsConstantTree(double val) override;
inline void AsConstantTree(double val, int count) override;

const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }

Expand Down
5 changes: 3 additions & 2 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,13 +228,14 @@ class Tree {
shrinkage_ = 1.0f;
}

virtual inline void AsConstantTree(double val) {
virtual inline void AsConstantTree(double val, int count = 0) {
num_leaves_ = 1;
shrinkage_ = 1.0f;
leaf_value_[0] = val;
if (is_linear_) {
leaf_const_[0] = val;
}
leaf_count_[0] = count;
}

/*! \brief Serialize this object to string*/
Expand Down Expand Up @@ -563,7 +564,7 @@ inline void Tree::Split(int leaf, int feature, int real_feature,
leaf_parent_[leaf] = new_node_idx;
leaf_parent_[num_leaves_] = new_node_idx;
// save current leaf value to internal node before change
internal_weight_[new_node_idx] = leaf_weight_[leaf];
internal_weight_[new_node_idx] = left_weight + right_weight;
internal_value_[new_node_idx] = leaf_value_[leaf];
internal_count_[new_node_idx] = left_cnt + right_cnt;
leaf_value_[leaf] = std::isnan(left_value) ? 0.0f : left_value;
Expand Down
2 changes: 1 addition & 1 deletion python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3913,7 +3913,7 @@ def _get_split_feature(
return feature_name

def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
return set(tree.keys()) == {"leaf_value"}
return set(tree.keys()) == {"leaf_value", "leaf_count"}

# Create the node record, and populate universal data members
node: Dict[str, Union[int, str, None]] = OrderedDict()
Expand Down
5 changes: 4 additions & 1 deletion src/boosting/gbdt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
score_updater->AddScore(init_scores[cur_tree_id], cur_tree_id);
}
}
new_tree->AsConstantTree(init_scores[cur_tree_id]);
new_tree->AsConstantTree(init_scores[cur_tree_id], num_data_);
} else {
// extend init_scores with zeros
new_tree->AsConstantTree(0, num_data_);
}
}
// add model
Expand Down
2 changes: 1 addition & 1 deletion src/boosting/rf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ class RF : public GBDT {
output = init_scores_[cur_tree_id];
}
}
new_tree->AsConstantTree(output);
new_tree->AsConstantTree(output, num_data_);
MultiplyScore(cur_tree_id, (iter_ + num_init_iteration_));
UpdateScore(new_tree.get(), cur_tree_id);
MultiplyScore(cur_tree_id, 1.0 / (iter_ + num_init_iteration_ + 1));
Expand Down
5 changes: 3 additions & 2 deletions src/io/cuda/cuda_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,9 +330,10 @@ void CUDATree::SyncLeafOutputFromCUDAToHost() {
CopyFromCUDADeviceToHost<double>(leaf_value_.data(), cuda_leaf_value_, leaf_value_.size(), __FILE__, __LINE__);
}

void CUDATree::AsConstantTree(double val) {
Tree::AsConstantTree(val);
void CUDATree::AsConstantTree(double val, int count) {
Tree::AsConstantTree(val, count);
CopyFromHostToCUDADevice<double>(cuda_leaf_value_, &val, 1, __FILE__, __LINE__);
CopyFromHostToCUDADevice<int>(cuda_leaf_count_, &count, 1, __FILE__, __LINE__);
}

} // namespace LightGBM
Expand Down
22 changes: 13 additions & 9 deletions src/io/tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,12 +416,16 @@ std::string Tree::ToJSON() const {
str_buf << "\"num_cat\":" << num_cat_ << "," << '\n';
str_buf << "\"shrinkage\":" << shrinkage_ << "," << '\n';
if (num_leaves_ == 1) {
str_buf << "\"tree_structure\":{";
if (is_linear_) {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << ", " << "\n";
str_buf << LinearModelToJSON(0) << "}" << "\n";
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
neNasko1 marked this conversation as resolved.
Show resolved Hide resolved
str_buf << "\"leaf_count\":" << leaf_count_[0] << ", " << '\n';
str_buf << LinearModelToJSON(0);
} else {
str_buf << "\"tree_structure\":{" << "\"leaf_value\":" << leaf_value_[0] << "}" << '\n';
str_buf << "\"leaf_value\":" << leaf_value_[0] << ", " << '\n';
str_buf << "\"leaf_count\":" << leaf_count_[0];
}
str_buf << "}" << '\n';
} else {
str_buf << "\"tree_structure\":" << NodeToJSON(0) << '\n';
}
Expand Down Expand Up @@ -731,6 +735,12 @@ Tree::Tree(const char* str, size_t* used_len) {
is_linear_ = false;
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

#ifdef USE_CUDA
is_cuda_tree_ = false;
#endif // USE_CUDA
Expand Down Expand Up @@ -793,12 +803,6 @@ Tree::Tree(const char* str, size_t* used_len) {
leaf_weight_.resize(num_leaves_);
}

if (key_vals.count("leaf_count")) {
leaf_count_ = CommonC::StringToArrayFast<int>(key_vals["leaf_count"], num_leaves_);
} else {
leaf_count_.resize(num_leaves_);
}

if (key_vals.count("decision_type")) {
decision_type_ = CommonC::StringToArrayFast<int8_t>(key_vals["decision_type"], num_leaves_ - 1);
} else {
Expand Down
6 changes: 6 additions & 0 deletions src/treelearner/serial_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,12 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
auto tree_ptr = tree.get();
constraints_->ShareTreePointer(tree_ptr);

// set the root value by hand, as it is not handled by splits
tree->SetLeafOutput(0, FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
config_->lambda_l1, config_->lambda_l2, config_->max_delta_step,
BasicConstraint(), config_->path_smooth, static_cast<data_size_t>(num_data_), 0));

// root leaf
int left_leaf = 0;
int cur_depth = 1;
Expand Down
3 changes: 1 addition & 2 deletions tests/python_package_test/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,8 +1464,7 @@ def test_init_score(task, output, cluster):
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
assert model.booster_.trees_to_dataframe()["value"][0] == 0
assert model.fitted_
neNasko1 marked this conversation as resolved.
Show resolved Hide resolved


def sklearn_checks_to_run():
Expand Down
62 changes: 56 additions & 6 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from .utils import (
SERIALIZERS,
assert_all_trees_valid,
dummy_obj,
load_breast_cancer,
load_digits,
Expand Down Expand Up @@ -3853,21 +3854,70 @@ def test_reset_params_works_with_metric_num_class_and_boosting():
assert new_bst.params == expected_params


def test_dump_model():
@pytest.mark.parametrize("linear_tree", [False, True])
def test_dump_model_stump(linear_tree):
X, y = load_breast_cancer(return_X_y=True)
train_data = lgb.Dataset(X, label=y)
params = {"objective": "binary", "verbose": -1}
# intentionally create a stump (tree with only a root-node)
# using restricted # samples
subidx = random.sample(range(len(y)), 30)

train_data = lgb.Dataset(X[subidx], label=y[subidx])
params = {
"objective": "binary",
"verbose": -1,
"n_jobs": 1,
neNasko1 marked this conversation as resolved.
Show resolved Hide resolved
"linear_tree": linear_tree,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
tree_structure = dumped_model["tree_info"][0]["tree_structure"]
assert len(dumped_model["tree_info"]) == 1
assert "leaf_value" in tree_structure
assert tree_structure["leaf_count"] == 30


def test_dump_model():
offset = 100
X, y = make_synthetic_regression()
train_data = lgb.Dataset(X, label=y + offset)

params = {
"objective": "regression",
"verbose": -1,
"boost_from_average": True,
}
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model = bst.dump_model(5, 0)
dumped_model_str = str(dumped_model)
assert "leaf_features" not in dumped_model_str
assert "leaf_coeff" not in dumped_model_str
assert "leaf_const" not in dumped_model_str
assert "leaf_value" in dumped_model_str
assert "leaf_count" in dumped_model_str
params["linear_tree"] = True

# CUDA does not return correct values for the root
if getenv("TASK", "") == "cuda":
return
neNasko1 marked this conversation as resolved.
Show resolved Hide resolved

for tree in dumped_model["tree_info"]:
assert not np.all(tree["tree_structure"]["internal_value"] == 0)

np.testing.assert_allclose(dumped_model["tree_info"][0]["tree_structure"]["internal_value"], offset, atol=1)
assert_all_trees_valid(dumped_model)


def test_dump_model_linear():
X, y = load_breast_cancer(return_X_y=True)
params = {
"objective": "binary",
"verbose": -1,
"linear_tree": True,
}
train_data = lgb.Dataset(X, label=y)
bst = lgb.train(params, train_data, num_boost_round=5)
dumped_model_str = str(bst.dump_model(5, 0))
dumped_model = bst.dump_model(5, 0)
assert_all_trees_valid(dumped_model)
dumped_model_str = str(dumped_model)
assert "leaf_features" in dumped_model_str
assert "leaf_coeff" in dumped_model_str
assert "leaf_const" in dumped_model_str
Expand Down
35 changes: 35 additions & 0 deletions tests/python_package_test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,38 @@ def np_assert_array_equal(*args, **kwargs):
if not _numpy_testing_supports_strict_kwarg:
kwargs.pop("strict")
np.testing.assert_array_equal(*args, **kwargs)


def assert_subtree_valid(root):
    """Recursively validate the subtree rooted at ``root``.

    Checks that every internal node's weight and count equal the sums of its
    children's weights and counts. Leaf nodes (identified by the presence of a
    ``"leaf_count"`` key) terminate the recursion.

    Parameters
    ----------
    root : dict
        Root of the subtree, as produced by ``Booster.dump_model()``.

    Returns
    -------
    tuple
        ``(weight, count)`` of the subtree rooted at ``root``.
    """
    # A node from dump_model() is a leaf iff it carries a "leaf_count" entry.
    if "leaf_count" in root:
        return root["leaf_weight"], root["leaf_count"]

    left_weight, left_count = assert_subtree_valid(root["left_child"])
    right_weight, right_count = assert_subtree_valid(root["right_child"])
    # Parent aggregates must match the totals reported by the children.
    assert np.allclose(root["internal_weight"], left_weight + right_weight)
    assert np.allclose(root["internal_count"], left_count + right_count)
    return root["internal_weight"], root["internal_count"]


def assert_all_trees_valid(model_dump):
    """Validate every tree in a ``Booster.dump_model()`` result.

    Asserts that each tree's ``tree_index`` matches its position in
    ``tree_info`` and that its structure passes ``assert_subtree_valid``.
    """
    trees = model_dump["tree_info"]
    for position in range(len(trees)):
        tree = trees[position]
        assert tree["tree_index"] == position
        assert_subtree_valid(tree["tree_structure"])
Loading