From a94ef73fed05a8850288c1619d9b6a491d02af7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Mon, 28 Oct 2024 16:30:49 +0300 Subject: [PATCH] memory and iter limit added --- Cargo.toml | 8 +-- README.md | 2 +- benches/perpetual_benchmarks.rs | 12 +++- examples/cal_housing.rs | 13 +++- examples/cover_types.rs | 2 +- examples/titanic.rs | 2 +- python-package/Cargo.toml | 4 +- python-package/pyproject.toml | 2 +- python-package/python/perpetual/booster.py | 11 ++- python-package/src/booster.rs | 6 +- python-package/src/multi_output.rs | 6 +- src/booster.rs | 79 +++++++++++++++------- src/constants.rs | 4 +- src/multi_output.rs | 16 +++-- 14 files changed, 118 insertions(+), 49 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4ce1720..31cdd9d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.5.2" +version = "0.6.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -21,9 +21,9 @@ codegen-units = 1 [dependencies] rayon = "1.8" -thiserror = "1.0.64" -serde_json = { version = "1.0.129", features = ["float_roundtrip"] } -serde = { version = "1.0.209", features = ["derive"] } +thiserror = "1.0.65" +serde_json = { version = "1.0.132", features = ["float_roundtrip"] } +serde = { version = "1.0.213", features = ["derive"] } approx = "0.5" log = "0.4" rand = "0.8.5" diff --git a/README.md b/README.md index 50eb084..435e69f 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ pip install perpetual To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual). ```toml -perpetual = "0.5.2" +perpetual = "0.6.0" ``` ## Paper diff --git a/benches/perpetual_benchmarks.rs b/benches/perpetual_benchmarks.rs index 7dab405..078454e 100644 --- a/benches/perpetual_benchmarks.rs +++ b/benches/perpetual_benchmarks.rs @@ -152,9 +152,11 @@ pub fn tree_benchmarks(c: &mut Criterion) { .fit( black_box(&data), black_box(&y), + black_box(0.3), + black_box(None), + black_box(None), black_box(None), black_box(None), - black_box(0.3), black_box(None), black_box(None), black_box(None), @@ -169,9 +171,11 @@ pub fn tree_benchmarks(c: &mut Criterion) { .fit( black_box(&data), black_box(&y), + black_box(0.3), + black_box(None), + black_box(None), black_box(None), black_box(None), - black_box(0.3), black_box(None), black_box(None), black_box(None), @@ -180,7 +184,9 @@ pub fn tree_benchmarks(c: &mut Criterion) { }) }); let mut booster = PerpetualBooster::default(); - booster.fit(&data, &y, None, None, 0.1, None, None, None).unwrap(); + booster + .fit(&data, &y, 0.1, None, None, None, None, None, None, None) + .unwrap(); booster_train.bench_function("Predict Booster", |b| { b.iter(|| booster.predict(black_box(&data), false)) }); diff --git a/examples/cal_housing.rs b/examples/cal_housing.rs index 11341a2..d51651f 100644 --- a/examples/cal_housing.rs +++ b/examples/cal_housing.rs @@ -123,7 +123,18 @@ fn main() -> Result<(), Box> { .set_num_threads(Some(*num_threads)); let now = SystemTime::now(); - model.fit(&matrix_train, &y_train, None, None, *budget, None, None, None)?; + model.fit( + &matrix_train, + &y_train, + *budget, + None, + None, + None, + None, + None, + None, + None, + )?; println!("now.elapsed: {:?}", now.elapsed().unwrap().as_secs_f32()); let trees = model.get_prediction_trees(); diff --git a/examples/cover_types.rs b/examples/cover_types.rs index 2aafb22..7bbfc1b 100644 --- a/examples/cover_types.rs +++ 
b/examples/cover_types.rs @@ -148,7 +148,7 @@ fn main() -> Result<(), Box> { .map(|y| if (*y as i32) == i { 1.0 } else { 0.0 }) .collect(); - model.fit(&matrix_train, &y_tr, None, None, *budget, None, None, None)?; + model.fit(&matrix_train, &y_tr, *budget, None, None, None, None, None, None, None)?; println!("Completed fitting model number: {}", i); let trees = model.get_prediction_trees(); diff --git a/examples/titanic.rs b/examples/titanic.rs index 8ffcf6c..f3d9d25 100644 --- a/examples/titanic.rs +++ b/examples/titanic.rs @@ -51,7 +51,7 @@ fn main() -> Result<(), Box> { // the relevant `set_` methods for any parameters you would like to // adjust. let mut model = PerpetualBooster::default().set_objective(Objective::LogLoss); - model.fit(&matrix, &y, None, None, *budget, None, None, None)?; + model.fit(&matrix, &y, *budget, None, None, None, None, None, None, None)?; println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]); diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index 722bac4..e2812b7 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.5.2" +version = "0.6.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.22.5", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.5.2", path = "../" } +perpetual_rs = {package="perpetual", version = "0.6.0", path = "../" } numpy = "0.22.0" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index dc5b32c..d311d90 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.5.2" +version = "0.6.0" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py index a43cbb8..82d8963 100644 --- a/python-package/python/perpetual/booster.py +++ b/python-package/python/perpetual/booster.py @@ -157,6 +157,8 @@ def fit( reset: Union[bool, None] = None, categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto", timeout: Union[float, None] = None, + iteration_limit: Union[int, None] = None, + memory_limit: Union[float, None] = None, ) -> Self: """Fit the gradient booster on a provided dataset. @@ -168,12 +170,17 @@ def fit( training the model. If None is passed, a weight of 1 will be used for every record. Defaults to None. budget: a positive number for fitting budget. Increasing this number will more - likely result in increased accuracy. + likely result in more boosting rounds and more increased predictive power. alpha: only used in quantile regression. reset: whether to reset the model or continue training. categorical_features: The names or indices for categorical features. `auto` for Polars or Pandas categorical data type. timeout: optional fit timeout in seconds + iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds. + The algorithm automatically stops for most of the cases before hitting this limit. + If you want to experiment with very high budget (>2.0), you can also increase this limit. 
+ memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on + available memory and the algorithm requirements. """ features_, flat_data, rows, cols, categorical_features_, cat_mapping = ( @@ -247,6 +254,8 @@ def fit( reset=reset, categorical_features=categorical_features_, # type: ignore timeout=timeout, + iteration_limit=iteration_limit, + memory_limit=memory_limit, ) return self diff --git a/python-package/src/booster.rs b/python-package/src/booster.rs index ad016f2..d9aaf44 100644 --- a/python-package/src/booster.rs +++ b/python-package/src/booster.rs @@ -141,6 +141,8 @@ impl PerpetualBooster { reset: Option, categorical_features: Option>, timeout: Option, + iteration_limit: Option, + memory_limit: Option, ) -> PyResult<()> { let flat_data = flat_data.as_slice()?; let data = Matrix::new(flat_data, rows, cols); @@ -156,12 +158,14 @@ impl PerpetualBooster { match self.booster.fit( &data, y, + budget, sample_weight_, alpha, - budget, reset, categorical_features, timeout, + iteration_limit, + memory_limit, ) { Ok(m) => Ok(m), Err(e) => Err(PyValueError::new_err(e.to_string())), diff --git a/python-package/src/multi_output.rs b/python-package/src/multi_output.rs index 8dd4457..296ce5a 100644 --- a/python-package/src/multi_output.rs +++ b/python-package/src/multi_output.rs @@ -160,6 +160,8 @@ impl MultiOutputBooster { reset: Option, categorical_features: Option>, timeout: Option, + iteration_limit: Option, + memory_limit: Option, ) -> PyResult<()> { let flat_data = flat_data.as_slice()?; let data = Matrix::new(flat_data, rows, cols); @@ -178,12 +180,14 @@ impl MultiOutputBooster { match self.booster.fit( &data, &y_data, + budget, sample_weight_, alpha, - budget, reset, categorical_features, timeout, + iteration_limit, + memory_limit, ) { Ok(m) => Ok(m), Err(e) => Err(PyValueError::new_err(e.to_string())), diff --git a/src/booster.rs b/src/booster.rs index b4b6417..35d3df7 100644 --- a/src/booster.rs +++ b/src/booster.rs @@ -1,8 +1,8 @@ use crate::bin::Bin; use crate::binning::bin_matrix; use crate::constants::{ - GENERALIZATION_THRESHOLD, ITERATION_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, ROW_COLUMN_RATIO_LIMIT, - STOPPING_ROUNDS, + FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, + ROW_COLUMN_RATIO_LIMIT, STOPPING_ROUNDS, TIMEOUT_FACTOR, }; use crate::constraints::ConstraintMap; use crate::data::Matrix; @@ -247,22 +247,26 @@ impl PerpetualBooster { /// /// * `data` - Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array. /// * `y` - Either a Polars or Pandas Series, or a 1 dimensional Numpy array. + /// * `budget` - budget to fit the model. /// * `sample_weight` - Instance weights to use when training the model. /// * `alpha` - used only in quantile regression. - /// * `budget` - budget to fit the model. /// * `reset` - Reset the model or continue training. /// * `categorical_features` - categorical features. /// * `timeout` - fit timeout limit in seconds. + /// * `iteration_limit` - optional limit for the number of boosting rounds. + /// * `memory_limit` - optional limit for memory allocation. 
pub fn fit( &mut self, data: &Matrix, y: &[f64], + budget: f32, sample_weight: Option<&[f64]>, alpha: Option, - budget: f32, reset: Option, categorical_features: Option>, timeout: Option, + iteration_limit: Option, + memory_limit: Option, ) -> Result<(), PerpetualError> { let constraints_map = self .monotone_constraints @@ -282,28 +286,32 @@ impl PerpetualBooster { self.force_children_to_bound_parent, ); self.fit_trees( - y, - sample_weight, data, + y, + budget, &splitter, + sample_weight, alpha, - budget, reset, categorical_features, timeout, + iteration_limit, + memory_limit, )?; } else { let splitter = MissingImputerSplitter::new(self.eta, self.allow_missing_splits, constraints_map); self.fit_trees( - y, - sample_weight, data, + y, + budget, &splitter, + sample_weight, alpha, - budget, reset, categorical_features, timeout, + iteration_limit, + memory_limit, )?; }; @@ -312,15 +320,17 @@ impl PerpetualBooster { fn fit_trees( &mut self, - y: &[f64], - sample_weight: Option<&[f64]>, data: &Matrix, + y: &[f64], + budget: f32, splitter: &T, + sample_weight: Option<&[f64]>, alpha: Option, - budget: f32, reset: Option, categorical_features: Option>, timeout: Option, + iteration_limit: Option, + memory_limit: Option, ) -> Result<(), PerpetualError> { let start = Instant::now(); @@ -409,9 +419,18 @@ impl PerpetualBooster { } else { mem_hist = mem_bin * self.max_bin as usize * col_amount; } - let system = System::new_all(); - let mem_available = system.available_memory() as usize; - let n_nodes_alloc = usize::min(N_NODES_ALLOC_LIMIT, (0.9 * (mem_available / mem_hist) as f32) as usize); + let sys = System::new_all(); + let mem_available = match memory_limit { + Some(mem_limit) => mem_limit * (1e9 as f32), + None => match sys.cgroup_limits() { + Some(limits) => limits.free_memory as f32, + None => sys.available_memory() as f32, + }, + }; + let n_nodes_alloc = usize::min( + N_NODES_ALLOC_LIMIT, + (FREE_MEM_ALLOC_FACTOR * (mem_available / (mem_hist as f32))) as usize, + ); let mut hist_tree_owned: Vec; if col_amount == col_index.len() { @@ -432,7 +451,7 @@ impl PerpetualBooster { let mut split_info_vec: Vec = (0..col_amount).map(|_| SplitInfo::default()).collect(); let split_info_slice = SplitInfoSlice::new(&mut split_info_vec); - for i in 0..ITERATION_LIMIT { + for i in 0..iteration_limit.unwrap_or(ITER_LIMIT) { let verbose = if self.log_iterations == 0 { false } else { @@ -538,13 +557,13 @@ impl PerpetualBooster { } if let Some(t) = timeout { - if start.elapsed().as_secs_f32() > t { + if start.elapsed().as_secs_f32() > t * TIMEOUT_FACTOR { warn!("Reached timeout limit before auto stopping. Try to decrease the budget or increase the timeout for the best performance."); break; } } - if i == ITERATION_LIMIT - 1 { + if i == iteration_limit.unwrap_or(ITER_LIMIT) - 1 { warn!("Reached iteration limit before auto stopping. 
Try to decrease the budget for the best performance."); } } @@ -1003,7 +1022,9 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let mut booster = PerpetualBooster::default().set_max_bin(300).set_base_score(0.5); - booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); + booster + .fit(&data, &y, 0.3, None, None, None, None, None, None, None) + .unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1025,7 +1046,9 @@ mod tests { let mut booster = PerpetualBooster::default(); - booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); + booster + .fit(&data, &y, 0.3, None, None, None, None, None, None, None) + .unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1049,7 +1072,9 @@ mod tests { .set_objective(Objective::SquaredLoss) .set_max_bin(300); - booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); + booster + .fit(&data, &y, 0.3, None, None, None, None, None, None, None) + .unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1072,7 +1097,9 @@ mod tests { //let data = Matrix::new(data.get_col(1), 891, 1); let mut booster = PerpetualBooster::default().set_max_bin(300).set_base_score(0.5); - booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); + booster + .fit(&data, &y, 0.3, None, None, None, None, None, None, None) + .unwrap(); let preds = booster.predict(&data, true); booster.save_booster("resources/model64.json").unwrap(); @@ -1103,7 +1130,7 @@ mod tests { let mut booster = PerpetualBooster::default(); booster - .fit(&data, &y, None, None, 0.1, None, Some(cat_index), None) + .fit(&data, &y, 0.1, None, None, None, Some(cat_index), None, None, None) .unwrap(); let file = fs::read_to_string("resources/titanic_train_y.csv").expect("Something went wrong reading the file"); @@ -1227,8 +1254,8 @@ mod tests { .set_max_bin(10) .set_num_threads(Some(2)); - model1.fit(&matrix_test, &y_test, None, None, 0.1, None, None, None)?; - model2.fit(&matrix_test, &y_test, None, None, 0.1, None, None, None)?; + model1.fit(&matrix_test, &y_test, 0.1, None, None, None, None, None, None, None)?; + model2.fit(&matrix_test, &y_test, 0.1, None, None, None, None, None, None, None)?; let trees1 = model1.get_prediction_trees(); let trees2 = model2.get_prediction_trees(); diff --git a/src/constants.rs b/src/constants.rs index 647f390..f6377c4 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -1,7 +1,9 @@ pub const STOPPING_ROUNDS: usize = 3; -pub const ITERATION_LIMIT: usize = 1000; +pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9; pub const N_NODES_ALLOC_LIMIT: usize = 3000; +pub const ITER_LIMIT: usize = 1000; pub const GENERALIZATION_THRESHOLD: f32 = 0.99; pub const ROW_COLUMN_RATIO_LIMIT: usize = 100; pub const MIN_COL_AMOUNT: usize = 30; pub const HESSIAN_EPS: f32 = 1e-3; +pub const TIMEOUT_FACTOR: f32 = 0.95; diff --git a/src/multi_output.rs b/src/multi_output.rs index ae75f33..11b3e35 100644 --- a/src/multi_output.rs +++ b/src/multi_output.rs @@ -189,12 +189,14 @@ impl MultiOutputBooster { &mut self, data: &Matrix, y: &Matrix, - sample_weight: Option<&[f64]>, - quantile: Option, budget: f32, + 
sample_weight: Option<&[f64]>, + alpha: Option, reset: Option, categorical_features: Option>, timeout: Option, + iteration_limit: Option, + memory_limit: Option, ) -> Result<(), PerpetualError> { let timeout_booster = match timeout { Some(t) => Some(t / self.n_boosters as f32), @@ -205,12 +207,14 @@ impl MultiOutputBooster { let _ = self.boosters[i].fit( data, y.get_col(i), - sample_weight, - quantile, budget, + sample_weight, + alpha, reset, categorical_features.clone(), timeout_booster, + iteration_limit, + memory_limit, ); } Ok(()) @@ -554,7 +558,9 @@ mod tests { println!("The number of boosters: {:?}", booster.get_boosters().len()); assert!(booster.get_boosters().len() == n_classes); - booster.fit(&data, &y, None, None, 0.1, None, None, Some(59.0)).unwrap(); + booster + .fit(&data, &y, 0.1, None, None, None, None, Some(60.0), None, None) + .unwrap(); let probas = booster.predict_proba(&data, true);
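Note on the API change above: `fit` now takes `budget` as the third positional argument, and the new `iteration_limit` and `memory_limit` parameters are appended as the last two optional arguments. Below is a minimal sketch of a call against the new signature; the import paths, the toy data, and the chosen limit values are illustrative assumptions and are not taken from this patch.

```rust
// Sketch only: assumes `Matrix`, `PerpetualBooster`, and `Objective` are importable
// as in the crate's examples; data values and limits are made up for illustration.
use perpetual::{objective::Objective, Matrix, PerpetualBooster};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Six rows, two columns, flattened into one Vec<f64> for Matrix::new.
    let flat: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5];
    let y: Vec<f64> = vec![0.0, 0.0, 1.0, 1.0, 1.0, 0.0];
    let data = Matrix::new(&flat, 6, 2);

    let mut model = PerpetualBooster::default().set_objective(Objective::LogLoss);

    // New argument order: budget comes right after y; the two new knobs come last.
    model.fit(
        &data,
        &y,
        0.5,       // budget
        None,      // sample_weight
        None,      // alpha (quantile regression only)
        None,      // reset
        None,      // categorical_features
        None,      // timeout in seconds
        Some(500), // iteration_limit: caps boosting rounds (ITER_LIMIT = 1000 when None)
        Some(4.0), // memory_limit in GB; when None, derived from available / cgroup memory
    )?;

    println!("{:?}", &model.predict(&data, true)[0..6]);
    Ok(())
}
```

Existing callers only need to move `budget` from the fifth to the third position and pad two extra `None` arguments at the end, as the updated examples, benches, and tests in this patch do.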