diff --git a/Cargo.toml b/Cargo.toml index 892e32e..476166f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.6.2" +version = "0.7.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -21,9 +21,9 @@ codegen-units = 1 [dependencies] rayon = "1.8" -thiserror = "1.0.65" +thiserror = "2.0.3" serde_json = { version = "1.0.132", features = ["float_roundtrip"] } -serde = { version = "1.0.213", features = ["derive"] } +serde = { version = "1.0.215", features = ["derive"] } approx = "0.5" log = "0.4" rand = "0.8.5" @@ -32,8 +32,8 @@ sysinfo = "0.32.0" [dev-dependencies] criterion = "0.5" polars = "0.41" -reqwest = { version = "0.12.8", features = ["blocking"] } -csv = "1.3" +reqwest = { version = "0.12.9", features = ["blocking"] } +csv = "1.3.1" chrono = "0.4" [[bench]] diff --git a/README.md b/README.md index 31aa758..c0f2b22 100644 --- a/README.md +++ b/README.md @@ -58,12 +58,22 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe pip install perpetual ``` +Using [conda-forge](https://anaconda.org/conda-forge/perpetual): + +```shell +conda install conda-forge::perpetual +``` + To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual). ```toml -perpetual = "0.6.2" +perpetual = "0.7.0" ``` +## Contribution + +Contributions are welcome. See CONTRIBUTING.md for the guidelines. + ## Paper PerpetualBooster prevents overfitting with a generalization algorithm. The paper is work-in-progress to explain how the algorithm works. Check our [blog post](https://perpetual-ml.com/blog/how-perpetual-works) for a high level introduction to the algorithm. 
diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index d185ac1..a2b94ef 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.6.2" +version = "0.7.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -18,10 +18,10 @@ name = "perpetual" crate-type = ["cdylib", "rlib"] [dependencies] -pyo3 = { version = "0.22.5", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.6.2", path = "../" } -numpy = "0.22.0" +pyo3 = { version = "0.22.6", features = ["extension-module"] } +perpetual_rs = {package="perpetual", version = "0.7.0", path = "../" } +numpy = "0.22.1" ndarray = "0.16.1" serde_plain = { version = "1.0" } -serde = { version = "1.0.210" } +serde = { version = "1.0.215" } pyo3-log = "0.11" diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index fcdb278..2e7cbe8 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.6.2" +version = "0.7.0" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py index d0cd0f8..2d28035 100644 --- a/python-package/python/perpetual/booster.py +++ b/python-package/python/perpetual/booster.py @@ -48,8 +48,17 @@ def __init__( missing_node_treatment: str = "None", log_iterations: int = 0, feature_importance_method: str = "Gain", + budget: Optional[float] = None, + alpha: Optional[float] = None, + reset: Optional[bool] = None, + categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto", + timeout: Optional[float] = None, + iteration_limit: Optional[int] = None, + memory_limit: Optional[float] = None, ): - """PerpetualBooster Class, used 
to generate gradient boosted decision tree ensembles. + """PerpetualBooster class, used to generate gradient boosted decision tree ensembles. + The following parameters can also be specified in the fit method to override the values in the constructor: + budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit. Args: objective (str, optional): Learning objective function to be used for optimization. @@ -93,6 +102,19 @@ def __init__( - "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes. log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about ever N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output). feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster. + budget: a positive number for fitting budget. Increasing this number will more + likely result in more boosting rounds and more increased predictive power. + Default value is 1.0. + alpha: only used in quantile regression. + reset: whether to reset the model or continue training. + categorical_features: The names or indices for categorical features. + `auto` for Polars or Pandas categorical data type. + timeout: optional fit timeout in seconds + iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds. + The algorithm automatically stops for most of the cases before hitting this limit. + If you want to experiment with very high budget (>2.0), you can also increase this limit. + memory_limit: optional limit for memory allocation in GB. 
If not set, the memory will be allocated based on + available memory and the algorithm requirements. Raises: TypeError: Raised if an invalid dtype is passed. @@ -146,6 +168,13 @@ def __init__( self.missing_node_treatment = missing_node_treatment self.log_iterations = log_iterations self.feature_importance_method = feature_importance_method + self.budget = budget + self.alpha = alpha + self.reset = reset + self.categorical_features = categorical_features + self.timeout = timeout + self.iteration_limit = iteration_limit + self.memory_limit = memory_limit booster = CratePerpetualBooster( objective=self.objective, @@ -166,13 +195,13 @@ def fit( X, y, sample_weight=None, - budget: float = 1.0, - alpha: Union[float, None] = None, - reset: Union[bool, None] = None, + budget: Optional[float] = None, + alpha: Optional[float] = None, + reset: Optional[bool] = None, categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto", - timeout: Union[float, None] = None, - iteration_limit: Union[int, None] = None, - memory_limit: Union[float, None] = None, + timeout: Optional[float] = None, + iteration_limit: Optional[int] = None, + memory_limit: Optional[float] = None, ) -> Self: """Fit the gradient booster on a provided dataset. @@ -185,6 +214,7 @@ def fit( Defaults to None. budget: a positive number for fitting budget. Increasing this number will more likely result in more boosting rounds and more increased predictive power. + Default value is 1.0. alpha: only used in quantile regression. reset: whether to reset the model or continue training. categorical_features: The names or indices for categorical features. 
@@ -198,7 +228,7 @@ def fit( """ features_, flat_data, rows, cols, categorical_features_, cat_mapping = ( - convert_input_frame(X, categorical_features) + convert_input_frame(X, categorical_features or self.categorical_features) ) self.n_features_ = cols self.cat_mapping = cat_mapping @@ -262,14 +292,14 @@ def fit( rows=rows, cols=cols, y=y_, - budget=budget, + budget=budget or self.budget, sample_weight=sample_weight_, # type: ignore - alpha=alpha, - reset=reset, + alpha=alpha or self.alpha, + reset=reset or self.reset, categorical_features=categorical_features_, # type: ignore - timeout=timeout, - iteration_limit=iteration_limit, - memory_limit=memory_limit, + timeout=timeout or self.timeout, + iteration_limit=iteration_limit or self.iteration_limit, + memory_limit=memory_limit or self.memory_limit, ) return self diff --git a/python-package/src/booster.rs b/python-package/src/booster.rs index d9aaf44..7839524 100644 --- a/python-package/src/booster.rs +++ b/python-package/src/booster.rs @@ -135,7 +135,7 @@ impl PerpetualBooster { rows: usize, cols: usize, y: PyReadonlyArray1, - budget: f32, + budget: Option, sample_weight: Option>, alpha: Option, reset: Option, @@ -158,7 +158,7 @@ impl PerpetualBooster { match self.booster.fit( &data, y, - budget, + budget.unwrap_or(1.0), sample_weight_, alpha, reset, diff --git a/python-package/src/multi_output.rs b/python-package/src/multi_output.rs index 296ce5a..cf03b1a 100644 --- a/python-package/src/multi_output.rs +++ b/python-package/src/multi_output.rs @@ -154,7 +154,7 @@ impl MultiOutputBooster { rows: usize, cols: usize, y: PyReadonlyArray1, - budget: f32, + budget: Option, sample_weight: Option>, alpha: Option, reset: Option, @@ -180,7 +180,7 @@ impl MultiOutputBooster { match self.booster.fit( &data, &y_data, - budget, + budget.unwrap_or(1.0), sample_weight_, alpha, reset, diff --git a/src/bin.rs b/src/bin.rs index e9a36fb..c1d31da 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -116,8 +116,8 @@ pub fn 
sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell], is_const_hess: } else if b2.num == 0 { return Ordering::Greater; } - let div1: f32 = b1.g_folded.iter().sum::() / b1.h_folded.unwrap().iter().sum::(); - let div2: f32 = b2.g_folded.iter().sum::() / b2.h_folded.unwrap().iter().sum::(); + let div1: f32 = b1.g_folded.iter().sum::() / b1.counts.iter().sum::() as f32; + let div2: f32 = b2.g_folded.iter().sum::() / b2.counts.iter().sum::() as f32; div2.partial_cmp(&div1).unwrap_or(Ordering::Less) }); } else { @@ -129,8 +129,8 @@ pub fn sort_cat_bins_by_stat(histogram: &mut [&UnsafeCell], is_const_hess: } else if b2.num == 0 { return Ordering::Greater; } - let div1: f32 = b1.g_folded.iter().sum::() / b1.counts.iter().sum::() as f32; - let div2: f32 = b2.g_folded.iter().sum::() / b2.counts.iter().sum::() as f32; + let div1: f32 = b1.g_folded.iter().sum::() / b1.h_folded.unwrap().iter().sum::(); + let div2: f32 = b2.g_folded.iter().sum::() / b2.h_folded.unwrap().iter().sum::(); div2.partial_cmp(&div1).unwrap_or(Ordering::Less) }); } diff --git a/src/booster.rs b/src/booster.rs index 35d3df7..c2e14f2 100644 --- a/src/booster.rs +++ b/src/booster.rs @@ -1,8 +1,8 @@ use crate::bin::Bin; use crate::binning::bin_matrix; use crate::constants::{ - FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, - ROW_COLUMN_RATIO_LIMIT, STOPPING_ROUNDS, TIMEOUT_FACTOR, + FREE_MEM_ALLOC_FACTOR, GENERALIZATION_THRESHOLD, ITER_LIMIT, MIN_COL_AMOUNT, N_NODES_ALLOC_LIMIT, STOPPING_ROUNDS, + TIMEOUT_FACTOR, }; use crate::constraints::ConstraintMap; use crate::data::Matrix; @@ -254,7 +254,7 @@ impl PerpetualBooster { /// * `categorical_features` - categorical features. /// * `timeout` - fit timeout limit in seconds. /// * `iteration_limit` - optional limit for the number of boosting rounds. - /// * `memory_limit` - optional limit for memory allocation. + /// * `memory_limit` - optional limit for memory allocation. 
pub fn fit( &mut self, data: &Matrix, @@ -401,16 +401,14 @@ impl PerpetualBooster { let mut rng = StdRng::seed_from_u64(self.seed); // Column sampling is only applied when (n_rows / n_columns) < ROW_COLUMN_RATIO_LIMIT. - // ROW_COLUMN_RATIO_LIMIT is set to 100 by default. - let colsample_bytree = f64::min( - 1.0, - (data.rows as f64 / data.cols as f64) / ROW_COLUMN_RATIO_LIMIT as f64, - ); - - let col_amount = usize::max( - usize::min(MIN_COL_AMOUNT, col_index.len()), - ((col_index.len() as f64) * colsample_bytree).floor() as usize, - ); + // ROW_COLUMN_RATIO_LIMIT is calculated using budget. + // budget = 1.0 -> ROW_COLUMN_RATIO_LIMIT = 100 + // budget = 2.0 -> ROW_COLUMN_RATIO_LIMIT = 10 + let row_column_ratio_limit = 10.0_f32.powf(-budget) * 1000.0; + let colsample_bytree = (data.rows as f32 / data.cols as f32) / row_column_ratio_limit; + + let col_amount = (((col_index.len() as f32) * colsample_bytree).floor() as usize) + .clamp(usize::min(MIN_COL_AMOUNT, col_index.len()), col_index.len()); let mem_bin = mem::size_of::(); let mem_hist: usize; diff --git a/src/constants.rs b/src/constants.rs index f6377c4..c224d19 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -3,7 +3,6 @@ pub const FREE_MEM_ALLOC_FACTOR: f32 = 0.9; pub const N_NODES_ALLOC_LIMIT: usize = 3000; pub const ITER_LIMIT: usize = 1000; pub const GENERALIZATION_THRESHOLD: f32 = 0.99; -pub const ROW_COLUMN_RATIO_LIMIT: usize = 100; -pub const MIN_COL_AMOUNT: usize = 30; +pub const MIN_COL_AMOUNT: usize = 40; pub const HESSIAN_EPS: f32 = 1e-3; pub const TIMEOUT_FACTOR: f32 = 0.95;