-
Notifications
You must be signed in to change notification settings - Fork 0
/
python_convert_version.cpp
199 lines (166 loc) · 7.84 KB
/
python_convert_version.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#include <vector>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <set>
#include <iostream>
#include <random>
// Define a struct for the decision tree nodes
struct DecisionNode {
int feature_index = -1; // Index of the feature used for splitting at this node
double threshold = 0.0; // Threshold value for the split at this node
DecisionNode* left = nullptr; // Pointer to left child node
DecisionNode* right = nullptr; // Pointer to right child node
double value = 0.0; // Value of the node if it is a leaf node (no children)
// Constructor for a leaf node
DecisionNode(double val) : value(val) {}
// Constructor for a decision node
DecisionNode(int index, double thresh, DecisionNode* l, DecisionNode* r)
: feature_index(index), threshold(thresh), left(l), right(r) {}
// Destructor to clean up dynamically allocated children
~DecisionNode() {
delete left;
delete right;
}
};
// Function to calculate the mean squared error (MSE) for a set of targets and associated weights
double calculate_mse(const std::vector<double>& targets, const std::vector<double>& weights) {
double sum_weights = std::accumulate(weights.begin(), weights.end(), 0.0);
double weighted_mean = std::inner_product(targets.begin(), targets.end(), weights.begin(), 0.0) / sum_weights;
double mse = 0.0;
for (size_t i = 0; i < targets.size(); ++i) {
mse += weights[i] * std::pow(targets[i] - weighted_mean, 2);
}
return mse;
}
// Function to find unique values in a sorted manner for a given feature
std::vector<double> unique_sorted(const std::vector<double>& data) {
std::set<double> unique_data(data.begin(), data.end());
return std::vector<double>(unique_data.begin(), unique_data.end());
}
// Function to determine the best feature and threshold to split the dataset
std::pair<int, double> best_split(const std::vector<std::vector<double>>& X, const std::vector<double>& y,
const std::vector<double>& weights, int min_samples_split) {
int best_feature = -1;
double best_threshold = 0.0;
double best_mse = std::numeric_limits<double>::max(); // Initial maximum MSE
// Loop over each feature to evaluate the best split
for (int feature = 0; feature < X[0].size(); ++feature) {
// Store all values for the current feature
std::vector<double> feature_values;
// Collect values for the current feature
for (const auto& sample : X) {
feature_values.push_back(sample[feature]);
}
auto thresholds = unique_sorted(feature_values);
// Loop through each threshold to find the best split
for (double threshold : thresholds) {
std::vector<double> left_y, right_y, left_weights, right_weights;
// Divide data into left and right according to the threshold
for (size_t i = 0; i < X.size(); ++i) {
// Divide samples into left and right based on the threshold
if (X[i][feature] < threshold) {
left_y.push_back(y[i]);
left_weights.push_back(weights[i]);
} else {
right_y.push_back(y[i]);
right_weights.push_back(weights[i]);
}
}
// Skip invalid splits with fewer samples than required
if (left_y.size() < min_samples_split || right_y.size() < min_samples_split) {
continue;
}
// Calculate MSE for both sides and their total
double mse_left = calculate_mse(left_y, left_weights);
double mse_right = calculate_mse(right_y, right_weights);
double mse_total = mse_left + mse_right;
// Update the best split if the total MSE is lower
if (mse_total < best_mse) {
best_mse = mse_total;
best_feature = feature;
best_threshold = threshold;
}
}
}
return {best_feature, best_threshold};
}
// Function to calculate weighted average of a vector
double weighted_average(const std::vector<double>& values, const std::vector<double>& weights) {
double sum_weights = std::accumulate(weights.begin(), weights.end(), 0.0);
double weighted_sum = std::inner_product(values.begin(), values.end(), weights.begin(), 0.0);
return sum_weights == 0 ? 0 : weighted_sum / sum_weights;
}
// Recursive function to build the decision tree
DecisionNode* build_tree(const std::vector<std::vector<double>>& X, const std::vector<double>& y,
const std::vector<double>& weights, int min_samples_split, int max_depth, int depth = 0) {
// Base cases for stopping recursion and creating a leaf node
if (y.size() < 2 * min_samples_split || std::set<double>(y.begin(), y.end()).size() == 1 || depth >= max_depth) {
// If there are not enough samples, all targets are the same, or max depth is reached
double leaf_value = weighted_average(y, weights);
return new DecisionNode(leaf_value);
}
// Get the best feature and threshold for splitting
auto [feature, threshold] = best_split(X, y, weights, min_samples_split);
// If no valid split is found, create a leaf node
if (feature == -1) {
double leaf_value = weighted_average(y, weights);
return new DecisionNode(leaf_value);
}
std::vector<std::vector<double>> left_X, right_X; // Datasets for left and right branches
std::vector<double> left_y, right_y, left_weights, right_weights; // Target values and weights for left and right branches
// Split data into left and right subsets
for (size_t i = 0; i < X.size(); ++i) {
// Add to the left branch
if (X[i][feature] < threshold) {
left_X.push_back(X[i]);
left_y.push_back(y[i]);
left_weights.push_back(weights[i]);
}
// Add to the right branch
else {
right_X.push_back(X[i]);
right_y.push_back(y[i]);
right_weights.push_back(weights[i]);
}
}
// Recursively build left and right subtrees
DecisionNode* left = build_tree(left_X, left_y, left_weights, min_samples_split, max_depth, depth + 1);
DecisionNode* right = build_tree(right_X, right_y, right_weights, min_samples_split, max_depth, depth + 1);
return new DecisionNode(feature, threshold, left, right);
}
// Function to predict the output for a single data point by traversing the tree
double predict_single(const DecisionNode* tree, const std::vector<double>& x) {
while (tree->value == 0.0) { // Traverse until a leaf node is reached
tree = x[tree->feature_index] < tree->threshold ? tree->left : tree->right;
}
return tree->value;
}
// Function to predict outputs for a dataset
std::vector<double> predict(const DecisionNode* tree, const std::vector<std::vector<double>>& X) {
std::vector<double> predictions;
for (const auto& x : X) {
predictions.push_back(predict_single(tree, x));
}
return predictions;
}
int main() {
// Example usage of the tree
std::vector<std::vector<double>> X_train = {{1, 2}, {3, 4}, {5, 6}};
std::vector<double> y_train = {1, 3, 5};
std::vector<std::vector<double>> X_test = {{2, 3}, {4, 5}};
std::vector<double> weights(y_train.size(), 1.0); // Uniform weights for simplicity
int min_samples_split = 1;
int max_depth = 10;
// Build the decision tree using the training data
DecisionNode* tree = build_tree(X_train, y_train, weights, min_samples_split, max_depth);
// Predict using the built tree
auto predictions = predict(tree, X_test);
std::cout << "Predictions: ";
for (auto pred : predictions) {
std::cout << pred << " ";
}
std::cout << std::endl;
delete tree; // Clean up dynamically allocated memory
return 0;
}