diff --git a/Cargo.toml b/Cargo.toml
index d9b65d3..34e025f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "perpetual"
-version = "0.7.3"
+version = "0.7.4"
 edition = "2021"
 authors = ["Mutlu Simsek "]
 homepage = "https://perpetual-ml.com"
diff --git a/README.md b/README.md
index 1056999..a4fc3d0 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ Documentation for the Python API can be found [here](https://perpetual-ml.github
 
 ## Installation
 
-The package can be installed directly from [pypi](https://pypi.org/project/perpetual).
+The package can be installed directly from [pypi](https://pypi.org/project/perpetual):
 
 ```shell
 pip install perpetual
@@ -64,10 +64,10 @@ Using [conda-forge](https://anaconda.org/conda-forge/perpetual):
 conda install conda-forge::perpetual
 ```
 
-To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
+To use in a Rust project and to get the package from [crates.io](https://crates.io/crates/perpetual):
 
 ```toml
-perpetual = "0.7.3"
+cargo add perpetual
 ```
 
 ## Contribution
diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml
index 36ac0ea..9986d1a 100644
--- a/python-package/Cargo.toml
+++ b/python-package/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-perpetual"
-version = "0.7.3"
+version = "0.7.4"
 edition = "2021"
 authors = ["Mutlu Simsek "]
 homepage = "https://perpetual-ml.com"
@@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 pyo3 = { version = "0.22.6", features = ["extension-module"] }
-perpetual_rs = {package="perpetual", version = "0.7.3", path = "../" }
+perpetual_rs = {package="perpetual", version = "0.7.4", path = "../" }
 numpy = "0.22.1"
 ndarray = "0.16.1"
 serde_plain = { version = "1.0" }
diff --git a/python-package/examples/fetch_openml.ipynb b/python-package/examples/fetch_openml.ipynb
new file mode 100644
index 0000000..5c12f1f
--- /dev/null
+++ b/python-package/examples/fetch_openml.ipynb
@@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from perpetual import PerpetualBooster\n",
+    "from sklearn.datasets import fetch_openml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = PerpetualBooster()\n",
+    "model.fit(data, target, budget=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.number_of_trees"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
index 837937d..49c9be4 100644
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "perpetual"
-version = "0.7.3"
+version = "0.7.4"
 description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
 license = { file = "LICENSE" }
 keywords = [
diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py
index 330d60b..6bc02fc 100644
--- a/python-package/python/perpetual/booster.py
+++ b/python-package/python/perpetual/booster.py
@@ -57,10 +57,11 @@ def __init__(
         memory_limit: Optional[float] = None,
         stopping_rounds: Optional[int] = None,
         max_bin: int = 256,
+        max_cat: int = 1000,
     ):
         """PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
         The following parameters can also be specified in the fit method to override the values in the constructor:
-        budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
+        budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.
 
         Args:
             objective (str, optional): Learning objective function to be used for optimization.
@@ -104,21 +105,24 @@ def __init__(
                 - "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
             log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about ever N iterations, info can be interacted with directly with the python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information see the example [here](/#logging-output).
             feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
-            budget: a positive number for fitting budget. Increasing this number will more
+            budget (float, optional): a positive number for fitting budget. Increasing this number will more
                 likely result in more boosting rounds and more increased predictive power.
                 Default value is 1.0.
-            alpha: only used in quantile regression.
-            reset: whether to reset the model or continue training.
-            categorical_features: The names or indices for categorical features.
-                `auto` for Polars or Pandas categorical data type.
-            timeout: optional fit timeout in seconds
-            iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
+            alpha (float, optional): only used in quantile regression.
+            reset (bool, optional): whether to reset the model or continue training.
+            categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
+                Defaults to `auto` for Polars or Pandas categorical data types.
+            timeout (float, optional): optional fit timeout in seconds
+            iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
                 The algorithm automatically stops for most of the cases before hitting this limit.
                 If you want to experiment with very high budget (>2.0), you can also increase this limit.
-            memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
+            memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
                 available memory and the algorithm requirements.
-            stopping_rounds: optional limit for auto stopping.
-            max_bin: number bins for feature discretization.
+            stopping_rounds (int, optional): optional limit for auto stopping.
+            max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
+            max_cat (int, optional): Maximum number of unique categories for a categorical feature.
+                Features with more categories will be treated as numerical.
+                Defaults to 1000.
 
         Raises:
             TypeError: Raised if an invalid dtype is passed.
@@ -181,6 +185,7 @@ def __init__(
         self.memory_limit = memory_limit
         self.stopping_rounds = stopping_rounds
         self.max_bin = max_bin
+        self.max_cat = max_cat
 
         booster = CratePerpetualBooster(
             objective=self.objective,
@@ -220,24 +225,26 @@ def fit(
             sample_weight (Union[ArrayLike, None], optional): Instance weights to use when training the model.
                 If None is passed, a weight of 1 will be used for every record. Defaults to None.
-            budget: a positive number for fitting budget. Increasing this number will more
+            budget (float, optional): a positive number for fitting budget. Increasing this number will more
                 likely result in more boosting rounds and more increased predictive power.
-                Default value is 1.0.
-            alpha: only used in quantile regression.
-            reset: whether to reset the model or continue training.
-            categorical_features: The names or indices for categorical features.
-                `auto` for Polars or Pandas categorical data type.
-            timeout: optional fit timeout in seconds
-            iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
+                Defaults to 1.0.
+            alpha (float, optional): only used in quantile regression.
+            reset (bool, optional): whether to reset the model or continue training.
+            categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
+                Defaults to `auto` for Polars or Pandas categorical data types.
+            timeout (float, optional): optional fit timeout in seconds
+            iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
                 The algorithm automatically stops for most of the cases before hitting this limit.
                 If you want to experiment with very high budget (>2.0), you can also increase this limit.
-            memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
+            memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
                 available memory and the algorithm requirements.
-            stopping_rounds: optional limit for auto stopping. Defaults to 3.
+            stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
 
         """
 
         features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
-            convert_input_frame(X, categorical_features or self.categorical_features)
+            convert_input_frame(
+                X, categorical_features or self.categorical_features, self.max_cat
+            )
         )
         self.n_features_ = cols
         self.cat_mapping = cat_mapping
diff --git a/python-package/python/perpetual/utils.py b/python-package/python/perpetual/utils.py
index c500684..e0c9f68 100644
--- a/python-package/python/perpetual/utils.py
+++ b/python-package/python/perpetual/utils.py
@@ -1,7 +1,11 @@
+import logging
 import numpy as np
 from typing import Dict, Iterable, List, Optional, Tuple
 
 
+logger = logging.getLogger(__name__)
+
+
 def type_df(df):
     library_name = type(df).__module__.split(".")[0]
     if type(df).__name__ == "DataFrame":
@@ -61,7 +65,7 @@ def convert_input_array(x, objective) -> np.ndarray:
 
 
 def convert_input_frame(
-    X, categorical_features
+    X, categorical_features, max_cat
 ) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
     """Convert data to format needed by booster.
 
@@ -110,18 +114,28 @@ def convert_input_frame(
         categorical_features_ = [features_.index(c) for c in categorical_features]
 
     cat_mapping = {}  # key: feature_name, value: ordered category names
+    cat_to_num = []
    if categorical_features_:
         for i in categorical_features_:
             categories = np.unique(X_[:, i].astype(dtype="str", copy=False))
+            if len(categories) > max_cat:
+                cat_to_num.append(i)
+                logger.warning(
+                    f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
+                )
+                continue
             categories = [c for c in list(categories) if c != "nan"]
             categories.insert(0, "nan")
             cat_mapping[features_[i]] = categories
+        categorical_features_ = [
+            x for x in categorical_features_ if x not in cat_to_num
+        ]
 
     if cat_mapping:
-        print(f"Categorical features: {categorical_features_}")
-        print(f"Mapping of categories: {cat_mapping}")
+        logger.info(f"Categorical features: {categorical_features_}")
+        logger.info(f"Mapping of categories: {cat_mapping}")
+
     for feature_name, categories in cat_mapping.items():
-        feature_index = features_.index(feature_name)
 
         def f(x):
             try:
@@ -133,6 +147,7 @@ def f(x):
             except (ValueError, IndexError):
                 return np.nan
 
+        feature_index = features_.index(feature_name)
         X_[:, feature_index] = np.apply_along_axis(f, 1, X_)
 
     if not np.issubdtype(X_.dtype, "float64"):
diff --git a/scripts/make_resources.py b/scripts/make_resources.py
index e4b9ce1..084600f 100644
--- a/scripts/make_resources.py
+++ b/scripts/make_resources.py
@@ -82,7 +82,7 @@
 
 data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto")
+features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto", 1000)
 features_, titanic_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping)
 
 data_test.to_csv("resources/titanic_test_df.csv", index=False)
@@ -97,6 +97,6 @@
 df = fetch_openml(data_id=546)
 X = df.data
 y = df.target
-features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto")
+features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto", 1000)
 pd.Series(sensory_flat).to_csv("resources/sensory_flat.csv", index=False, header=False)
 pd.Series(y).to_csv("resources/sensory_y.csv", index=False, header=False)
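
Not part of the patch: a minimal sketch of how the new `max_cat` constructor argument could be exercised from Python. The dataset id, `budget` value, and `number_of_trees` attribute are taken from the new `fetch_openml.ipynb` example above; `max_cat=50` is an arbitrary illustrative threshold, not a recommended setting.

```python
# Sketch (not part of the patch): using the new max_cat argument.
# Categorical columns with more than max_cat unique values are treated as
# numerical by convert_input_frame, and a warning is logged for each one.
from perpetual import PerpetualBooster
from sklearn.datasets import fetch_openml

data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)

model = PerpetualBooster(max_cat=50)  # default is 1000, per the new docstring
model.fit(data, target, budget=0.1)
print(model.number_of_trees)
```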
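A related usability note on the `print` to `logging` switch in `utils.py`: the categorical-feature report is now emitted at INFO level on the `perpetual.utils` logger (via `logging.getLogger(__name__)`), so it is silent unless logging is configured, while the `max_cat` warning is WARNING level and still reaches stderr by default. A minimal sketch, assuming a caller wants those INFO messages visible:

```python
import logging

# Attach a root handler and lower the threshold so the INFO-level records
# ("Categorical features: ..." and "Mapping of categories: ...") emitted
# during fit() become visible. WARNING-level records (max_cat exceeded)
# are shown even without this configuration.
logging.basicConfig(level=logging.INFO)
```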