polars support

perpetual-ml · Jul 17, 2024 · 88b3f9b · 88b3f9b
1 parent 6f77401
commit 88b3f9b
Show file tree

Hide file tree

Showing 20 changed files with 623 additions and 498 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "perpetual"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"
@@ -9,6 +9,9 @@ license-file = "LICENSE"
 readme = "README.md"
 repository = "https://github.com/perpetual-ml/perpetual"
 
+keywords = ["machine-learning", "perpetual", "ai", "ml"]
+categories = ["algorithms", "mathematics", "science"]
+
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [profile.release]
 lto = 'fat'
@@ -27,10 +30,10 @@ hashbrown = { version = "0.14", features = ["serde", "rayon"] }
 
 [dev-dependencies]
 criterion = "0.5"
-polars = "0.40"
+polars = "0.41"
 reqwest = { version = "0.12", features = ["blocking"] }
 csv = "1.3"
-chrono = "0.4.38"
+chrono = "0.4"
 
 [[bench]]
 name = "perpetual_benchmarks"

diff --git a/README.md b/README.md
@@ -7,28 +7,29 @@
 [![Python Versions](https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white)](https://pypi.org/project/perpetual)
 [![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual)
 [![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual)
+[![Discord](https://img.shields.io/discord/1247650900214812692?logo=discord&cacheSeconds=10)](https://discord.gg/vADKk9Wr)
 
 </div>
 
 # Perpetual
 
 ## _A self-generalizing, hyperparameter-free gradient boosting machine_
 
-PerpetualBooster is a gradient boosting machine (GBM) algorithm which doesn't have hyperparameters to be tuned so that you can use it without hyperparameter optimization packages unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget and increase it once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
+PerpetualBooster is a gradient boosting machine (GBM) algorithm that has no hyperparameters to tune, so, unlike other GBM algorithms, you can use it without hyperparameter optimization packages. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 1.0) and increase it (e.g. 2.0) once you are confident with your features. If you don't see any improvement when increasing the `budget` further, it means that you are already extracting the most predictive power out of your data.
 
 ## Benchmark
 
 Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves around 100x speed-up at the same accuracy with different `budget` levels and with different datasets. The speed-up might be slightly lower or significantly higher than 100x depending on the dataset.
 
-The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset:
+The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):
 
 | Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Perpetual cpu time | LightGBM cpu time | Speed-up |
 | ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- | -------- |
-| 1.1              | 100                   | 0.192         | 0.192        | 8.9                | 1003              | 113x     |
-| 1.2              | 200                   | 0.190         | 0.191        | 11.0               | 2030              | 186x     |
-| 1.5              | 300                   | 0.187         | 0.188        | 18.7               | 3272              | 179x     |
+| 1.0              | 100                   | 0.192         | 0.192        | 7.6                | 978               | 129x     |
+| 1.5              | 300                   | 0.188         | 0.188        | 21.8               | 3066              | 141x     |
+| 2.1              | 1000                  | 0.185         | 0.186        | 86.0               | 8720              | 101x     |
 
-You can reproduce the results using the [performance_benchmark.ipynb](./python-package/examples/performance_benchmark.ipynb) notebook in the [examples](./python-package/examples) folder.
+You can reproduce the results using the scripts in the [examples](./python-package/examples) folder.
 
 ## Usage
 
@@ -38,7 +39,7 @@ You can use the algorithm like in the example below. Check examples folders for
 from perpetual import PerpetualBooster
 
 model = PerpetualBooster(objective="SquaredLoss")
-model.fit(X, y, budget=0.4)
+model.fit(X, y, budget=1.0)
 ```
 
 ## Documentation
@@ -53,10 +54,10 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe
 pip install perpetual
 ```
 
-To use in a rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
+To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
 
 ```toml
-perpetual = "0.1.0"
+perpetual = "0.2.0"
 ```
 
 ## Paper

diff --git a/benches/perpetual_benchmarks.rs b/benches/perpetual_benchmarks.rs
@@ -173,7 +173,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
     let mut booster = PerpetualBooster::default();
     booster.fit(&data, &y, None, None, 0.3, None, None).unwrap();
     booster_train.bench_function("Predict Booster", |b| {
-        b.iter(|| booster.predict(black_box(&data), false, None))
+        b.iter(|| booster.predict(black_box(&data), false))
     });
 }
 

diff --git a/examples/cal_housing.rs b/examples/cal_housing.rs
@@ -6,6 +6,7 @@
 // hyperfine --runs 3 ./target/release/examples/cal_housing
 // hyperfine --runs 3 .\target\release\examples\cal_housing
 // hyperfine --runs 11 'cargo run --release --example cal_housing 0.1 0.3 2'
+// hyperfine --runs 11 'cargo run --release --example cal_housing 2.0'
 
 // cargo flamegraph --example cal_housing
 
@@ -27,48 +28,51 @@ fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().collect();
     let budget = &args[1].parse::<f32>().unwrap();
 
-    let _all_names = [
-        "MedInc",
-        "HouseAge",
-        "AveRooms",
-        "AveBedrms",
-        "Population",
-        "AveOccup",
-        "Latitude",
-        "Longitude",
-        "MedHouseVal",
+    let all_names = [
+        "MedInc".to_string(),
+        "HouseAge".to_string(),
+        "AveRooms".to_string(),
+        "AveBedrms".to_string(),
+        "Population".to_string(),
+        "AveOccup".to_string(),
+        "Latitude".to_string(),
+        "Longitude".to_string(),
+        "MedHouseVal".to_string(),
     ];
 
-    let _feature_names = [
-        "MedInc",
-        "HouseAge",
-        "AveRooms",
-        "AveBedrms",
-        "Population",
-        "AveOccup",
-        "Latitude",
-        "Longitude",
+    let feature_names = [
+        "MedInc".to_string(),
+        "HouseAge".to_string(),
+        "AveRooms".to_string(),
+        "AveBedrms".to_string(),
+        "Population".to_string(),
+        "AveOccup".to_string(),
+        "Latitude".to_string(),
+        "Longitude".to_string(),
     ];
 
+    let column_names_train = Arc::new(all_names.clone());
+    let column_names_test = Arc::new(all_names.clone());
+
     let df_train = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(_all_names.iter().map(|&s| s.to_string()).collect())))
+        .with_columns(Some(column_names_train))
         .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))?
         .finish()
         .unwrap();
 
     let df_test = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(_all_names.iter().map(|&s| s.to_string()).collect())))
+        .with_columns(Some(column_names_test))
         .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
     let id_vars_train: Vec<&str> = Vec::new();
-    let mdf_train = df_train.melt(&id_vars_train, _feature_names)?;
+    let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?;
     let id_vars_test: Vec<&str> = Vec::new();
-    let mdf_test = df_test.melt(&id_vars_test, _feature_names)?;
+    let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?;
 
     let data_train = Vec::from_iter(
         mdf_train
@@ -121,11 +125,11 @@ fn main() -> Result<(), Box<dyn Error>> {
     let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum();
     println!("n_leaves: {:?}", n_leaves);
 
-    let y_pred = model.predict(&matrix_train, true, None);
+    let y_pred = model.predict(&matrix_train, true);
     let error = mse(&y_train, &y_pred);
     println!("mse_train: {:?}", error);
 
-    let y_pred = model.predict(&matrix_test, true, None);
+    let y_pred = model.predict(&matrix_test, true);
     let error = mse(&y_test, &y_pred);
     println!("mse_test: {:?}", error);
 

diff --git a/examples/cover_types.rs b/examples/cover_types.rs
@@ -1,11 +1,11 @@
 //! An example using the `cover types` dataset
 
-// cargo run --release --example cover_types 0.1 0.3
+// cargo run --release --example cover_types 1.0
 
 // cargo build --release --example cover_types
 // hyperfine --runs 3 ./target/release/examples/cover_types
-// hyperfine --runs 3 .\target\release\examples\cover_types 0.1 0.3
-// hyperfine --runs 3 'cargo run --release --example cover_types 0.1 0.3'
+// hyperfine --runs 3 .\target\release\examples\cover_types 1.0
+// hyperfine --runs 3 'cargo run --release --example cover_types 1.0'
 
 // cargo flamegraph --example cover_types
 
@@ -39,7 +39,7 @@ pub fn multiclass_log_loss(y_true: &[f64], y_pred: &[Vec<f64>]) -> f64 {
 
 fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().collect();
-    let budget = &args[1].parse::<f32>().unwrap();
+    let budget = &args[1].parse::<f32>().unwrap_or(1.0);
 
     let mut features: Vec<&str> = [
         "Elevation",
@@ -66,29 +66,37 @@ fn main() -> Result<(), Box<dyn Error>> {
     let mut features_and_target = features.clone();
     features_and_target.push("Cover_Type");
 
+    let features_and_target_arc1 = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
+    let features_and_target_arc2 = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
     let df_train = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc1))
         .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))?
         .finish()
         .unwrap();
 
     let df_test = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc2))
         .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
     let id_vars_train: Vec<&str> = Vec::new();
-    let mdf_train = df_train.melt(&id_vars_train, &features)?;
+    let mdf_train = df_train.unpivot(&features, &id_vars_train)?;
     let id_vars_test: Vec<&str> = Vec::new();
-    let mdf_test = df_test.melt(&id_vars_test, &features)?;
+    let mdf_test = df_test.unpivot(&features, &id_vars_test)?;
 
     let data_train = Vec::from_iter(
         mdf_train
@@ -128,14 +136,8 @@ fn main() -> Result<(), Box<dyn Error>> {
     let matrix_train = Matrix::new(&data_train, y_train.len(), 54);
     let matrix_test = Matrix::new(&data_test, y_test.len(), 54);
 
-    // Create booster.
-    // To provide parameters generate a default booster, and then use
-    // the relevant `set_` methods for any parameters you would like to
-    // adjust.
-
     let mut raw_train_array = vec![vec![0.0; 7]; y_train.len()];
     let mut raw_test_array = vec![vec![0.0; 7]; y_test.len()];
-    // let mut raw_test = Vec::new();
     for i in 1..8 {
         println!();
 
@@ -155,8 +157,8 @@ fn main() -> Result<(), Box<dyn Error>> {
         let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum();
         println!("n_leaves: {:?}", n_leaves);
 
-        let y_pred_train = model.predict(&matrix_train, true, None);
-        let y_pred_test = model.predict(&matrix_test, true, None);
+        let y_pred_train = model.predict(&matrix_train, true);
+        let y_pred_test = model.predict(&matrix_test, true);
 
         raw_train_array
             .iter_mut()
@@ -166,7 +168,6 @@ fn main() -> Result<(), Box<dyn Error>> {
             .iter_mut()
             .enumerate()
             .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_test[idx]);
-        // raw_test.push(Series::new(&i.to_string(), y_pred));
     }
 
     let loss_train = multiclass_log_loss(&y_train, &raw_train_array);
@@ -175,10 +176,5 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("loss_train: {}", loss_train);
     println!("loss_test: {}", loss_test);
 
-    // let mut df_raw_test = DataFrame::new(raw_test).unwrap();
-    // let mut output_file = File::create("./raw_test.csv").expect("Failed to create an output file.");
-    // let mut writer = CsvWriter::new(&mut output_file).include_header(true);
-    // writer.finish(&mut df_raw_test).expect("Failed to write the CSV file.");
-
     Ok(())
 }
diff --git a/examples/titanic.rs b/examples/titanic.rs
@@ -11,18 +11,22 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     let features_and_target = ["survived", "pclass", "age", "sibsp", "parch", "fare"];
 
+    let features_and_target_arc = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
     let df = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc))
         .try_into_reader_with_file_path(Some("resources/titanic.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
     let id_vars: Vec<&str> = Vec::new();
-    let mdf = df.melt(id_vars, ["pclass", "age", "sibsp", "parch", "fare"])?;
+    let mdf = df.unpivot(["pclass", "age", "sibsp", "parch", "fare"], id_vars)?;
 
     let data = Vec::from_iter(
         mdf.select_at_idx(1)
@@ -49,7 +53,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     let mut model = PerpetualBooster::default().set_objective(Objective::LogLoss);
     model.fit(&matrix, &y, None, None, *budget, None, None)?;
 
-    println!("Model prediction: {:?} ...", &model.predict(&matrix, true, None)[0..10]);
+    println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]);
 
     Ok(())
 }
diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "py-perpetual"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Mutlu Simsek <[email protected]>"]
 homepage = "https://perpetual-ml.com"
@@ -9,16 +9,19 @@ license-file = "LICENSE"
 readme = "README.md"
 repository = "https://github.com/perpetual-ml/perpetual"
 
+keywords = ["machine-learning", "perpetual", "ai", "ml"]
+categories = ["algorithms", "mathematics", "science"]
+
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [lib]
 name = "perpetual"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.21.0", features = ["extension-module"] }
-perpetual_rs = {package="perpetual", version = "0.1.0", path = "../" }
+pyo3 = { version = "0.21", features = ["extension-module"] }
+perpetual_rs = {package="perpetual", version = "0.2.0", path = "../" }
 numpy = "0.21.0"
-ndarray = "0.15.1"
+ndarray = "0.15"
 serde_plain = { version = "1.0" }
 serde = { version = "1.0" }
-pyo3-log = "0.10.0"
+pyo3-log = "0.11"