From 96739a56201912ac386f3ef64b7405d31b18fb75 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Wed, 18 Dec 2024 16:35:53 +0000 Subject: [PATCH] feat: add standard scaler transform (#204) This PR adds a standard scaler transform to the augurs-forecaster transforms. It's similar to the existing `MinMaxScaler`, but instead scales the data such that it has a mean of 0 and a standard deviation of 1. This is similar to the `StandardScaler` in scikit-learn. --- crates/augurs-forecaster/src/transforms.rs | 185 +++++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/crates/augurs-forecaster/src/transforms.rs b/crates/augurs-forecaster/src/transforms.rs index 10f9d99..4d878bf 100644 --- a/crates/augurs-forecaster/src/transforms.rs +++ b/crates/augurs-forecaster/src/transforms.rs @@ -53,6 +53,8 @@ pub enum Transform { LinearInterpolator, /// Min-max scaling. MinMaxScaler(MinMaxScaleParams), + /// Standard scaling. + StandardScaler(StandardScaleParams), /// Logit transform. Logit, /// Log transform. @@ -92,6 +94,22 @@ impl Transform { Self::MinMaxScaler(min_max_params) } + /// Create a new standard scaler. + /// + /// This scaler standardizes features by removing the mean and scaling to unit variance. + /// + /// The standard score of a sample x is calculated as: + /// + /// ```text + /// z = (x - u) / s + /// ``` + /// + /// where u is the mean and s is the standard deviation in the provided + /// `StandardScaleParams`. + pub fn standard_scaler(scale_params: StandardScaleParams) -> Self { + Self::StandardScaler(scale_params) + } + /// Create a new logit transform. /// /// This transform applies the logit function to each item. @@ -173,6 +191,7 @@ impl Transform { match self { Self::LinearInterpolator => Box::new(input.interpolate(LinearInterpolator::default())), Self::MinMaxScaler(params) => Box::new(input.min_max_scale(params)), + Self::StandardScaler(params) => Box::new(input.standard_scale(params)), Self::Logit => Box::new(input.logit()), Self::Log => Box::new(input.log()), Self::BoxCox { lambda } => Box::new(input.box_cox(*lambda)), @@ -202,6 +221,7 @@ impl Transform { match self { Self::LinearInterpolator => Box::new(input), Self::MinMaxScaler(params) => Box::new(input.inverse_min_max_scale(params)), + Self::StandardScaler(params) => Box::new(input.inverse_standard_scale(params)), Self::Logit => Box::new(input.logistic()), Self::Log => Box::new(input.exp()), Self::BoxCox { lambda } => Box::new(input.inverse_box_cox(*lambda)), @@ -354,6 +374,126 @@ trait InverseMinMaxScaleExt: Iterator { impl InverseMinMaxScaleExt for T where T: Iterator {} +/// Parameters for the standard scaler. +#[derive(Debug, Clone)] +pub struct StandardScaleParams { + /// The mean of the data. + pub mean: f64, + /// The standard deviation of the data. + pub std_dev: f64, +} + +impl StandardScaleParams { + /// Create a new `StandardScaleParams` with the given mean and standard deviation. + pub fn new(mean: f64, std_dev: f64) -> Self { + Self { mean, std_dev } + } + + /// Create a new `StandardScaleParams` from the given data. + /// + /// Note: this uses Welford's online algorithm to compute mean and variance in a single pass, + /// since we only have an iterator. The standard deviation is calculated using the + /// biased estimator, for parity with the [scikit-learn implementation][sklearn]. + /// + /// [sklearn]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html + pub fn from_data(data: T) -> Self + where + T: Iterator, + { + // Use Welford's online algorithm to compute mean and variance in a single pass, + // since we only have an iterator. + let mut count = 0_u64; + let mut mean = 0.0; + let mut m2 = 0.0; + + for x in data { + count += 1; + let delta = x - mean; + mean += delta / count as f64; + let delta2 = x - mean; + m2 += delta * delta2; + } + + // Handle empty iterator case + if count == 0 { + return Self::new(0.0, 1.0); + } + + // Calculate standard deviation + let std_dev = (m2 / count as f64).sqrt(); + + Self { mean, std_dev } + } +} + +/// Iterator adapter that scales each item using the given mean and standard deviation, +/// so that (assuming the adapter was created using the same data), the output items +/// have zero mean and unit standard deviation. +#[derive(Debug, Clone)] +struct StandardScale { + inner: T, + mean: f64, + std_dev: f64, +} + +impl Iterator for StandardScale +where + T: Iterator, +{ + type Item = f64; + fn next(&mut self) -> Option { + self.inner.next().map(|x| (x - self.mean) / self.std_dev) + } +} + +trait StandardScaleExt: Iterator { + fn standard_scale(self, params: &StandardScaleParams) -> StandardScale + where + Self: Sized, + { + StandardScale { + inner: self, + mean: params.mean, + std_dev: params.std_dev, + } + } +} + +impl StandardScaleExt for T where T: Iterator {} + +/// Iterator adapter that applies the inverse standard scaling transformation. +#[derive(Debug, Clone)] +struct InverseStandardScale { + inner: T, + mean: f64, + std_dev: f64, +} + +impl Iterator for InverseStandardScale +where + T: Iterator, +{ + type Item = f64; + fn next(&mut self) -> Option { + self.inner.next().map(|x| (x * self.std_dev) + self.mean) + } +} + +trait InverseStandardScaleExt: Iterator { + fn inverse_standard_scale(self, params: &StandardScaleParams) -> InverseStandardScale + where + Self: Sized, + { + InverseStandardScale { + inner: self, + mean: params.mean, + std_dev: params.std_dev, + } + } +} + +impl InverseStandardScaleExt for T where T: Iterator {} + // Logit and logistic functions. /// Returns the logistic function of the given value. @@ -793,6 +933,51 @@ mod test { assert_all_close(&expected, &actual); } + #[test] + fn standard_scale() { + let data = vec![1.0, 2.0, 3.0]; + let params = StandardScaleParams::new(2.0, 1.0); // mean=2, std=1 + let expected = vec![-1.0, 0.0, 1.0]; + let actual: Vec<_> = data.into_iter().standard_scale(¶ms).collect(); + assert_all_close(&expected, &actual); + } + + #[test] + fn inverse_standard_scale() { + let data = vec![-1.0, 0.0, 1.0]; + let params = StandardScaleParams::new(2.0, 1.0); // mean=2, std=1 + let expected = vec![1.0, 2.0, 3.0]; + let actual: Vec<_> = data.into_iter().inverse_standard_scale(¶ms).collect(); + assert_all_close(&expected, &actual); + } + + #[test] + fn standard_scale_params_from_data() { + // Test case 1: Simple sequence + let data = vec![1.0, 2.0, 3.0]; + let params = StandardScaleParams::from_data(data.into_iter()); + assert_approx_eq!(params.mean, 2.0); + assert_approx_eq!(params.std_dev, 0.816496580927726); + + // Test case 2: More complex data + let data = vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]; + let params = StandardScaleParams::from_data(data.into_iter()); + assert_approx_eq!(params.mean, 5.0); + assert_approx_eq!(params.std_dev, 2.0); + + // Test case 3: Empty iterator should return default values + let data: Vec = vec![]; + let params = StandardScaleParams::from_data(data.into_iter()); + assert_approx_eq!(params.mean, 0.0); + assert_approx_eq!(params.std_dev, 1.0); + + // Test case 4: Single value + let data = vec![42.0]; + let params = StandardScaleParams::from_data(data.into_iter()); + assert_approx_eq!(params.mean, 42.0); + assert_approx_eq!(params.std_dev, 0.0); // technically undefined, but we return 0 + } + #[test] fn min_max_scale_params_from_data() { let data = [1.0, 2.0, f64::NAN, 3.0];