diff --git a/zenu-cuda/src/cudnn/batch_norm.rs b/zenu-cuda/src/cudnn/batch_norm.rs
index 3b845287..c4622a87 100644
--- a/zenu-cuda/src/cudnn/batch_norm.rs
+++ b/zenu-cuda/src/cudnn/batch_norm.rs
@@ -670,7 +670,7 @@ mod batch_norm {
             bias_gpu,
             running_mean_gpu,
             running_variance_gpu,
-            0.1,
+            1.0,
             saved_mean_gpu,
             saved_variance_gpu,
         )
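
The only zenu-cuda change is the exponential-average factor passed to forward_train in the batch_norm tests. Assuming the usual cuDNN convention for that parameter, the running statistics are updated as

    running = (1 - factor) * running + factor * batch_stat

so 0.1 folds a tenth of the current batch statistics into the running mean/variance per call, while 1.0 overwrites them with the batch statistics outright. This mirrors the reworked CPU path further down, where mean_t = &x_mean * (T::one() - momentum) + &mean * momentum and the tests call it with momentum = 0.0.
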
diff --git a/zenu-matrix/src/device/cpu.rs b/zenu-matrix/src/device/cpu.rs
index f8670ba5..2ff3b930 100644
--- a/zenu-matrix/src/device/cpu.rs
+++ b/zenu-matrix/src/device/cpu.rs
@@ -1,6 +1,6 @@
 use super::{Device, DeviceBase};
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, Debug)]
 pub struct Cpu;
 
 impl DeviceBase for Cpu {
diff --git a/zenu-matrix/src/device/nvidia.rs b/zenu-matrix/src/device/nvidia.rs
index 4751c72a..a60bbcaf 100644
--- a/zenu-matrix/src/device/nvidia.rs
+++ b/zenu-matrix/src/device/nvidia.rs
@@ -1,7 +1,7 @@
 use super::{Device, DeviceBase};
 use crate::num::Num;
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, Debug)]
 pub struct Nvidia;
 
 impl DeviceBase for Nvidia {
diff --git a/zenu-matrix/src/nn/batch_norm.rs b/zenu-matrix/src/nn/batch_norm.rs
index fc757cc8..c0be9022 100644
--- a/zenu-matrix/src/nn/batch_norm.rs
+++ b/zenu-matrix/src/nn/batch_norm.rs
@@ -13,7 +13,7 @@ use crate::device::nvidia::Nvidia;
 
 #[cfg(feature = "nvidia")]
 fn batch_norm2d_forward_train_gpu(
-    momentum: T,
+    momentum: f64,
     x: Matrix, DimDyn, Nvidia>,
     y: Matrix, DimDyn, Nvidia>,
     scale: Matrix, DimDyn, Nvidia>,
@@ -22,23 +22,21 @@ fn batch_norm2d_forward_train_gpu(
     variance: Matrix, DimDyn, Nvidia>,
     saving_mean: Matrix, DimDyn, Nvidia>,
     saving_inv_variance: Matrix, DimDyn, Nvidia>,
-    epsilon: f64,
     batch_norm: Option>,
 ) {
-    let alpha = T::one() - momentum;
-    let beta = momentum;
+    let momentum = 1. - momentum;
     match batch_norm {
         Some(batch_norm) => batch_norm
             .forward_train(
-                alpha,
-                beta,
+                T::one(),
+                T::zero(),
                 x.as_ptr(),
                 y.as_mut_ptr(),
                 scale.as_ptr(),
                 bias.as_ptr(),
                 mean.as_mut_ptr(),
                 variance.as_mut_ptr(),
-                epsilon,
+                momentum,
                 saving_mean.as_mut_ptr(),
                 saving_inv_variance.as_mut_ptr(),
             )
@@ -115,7 +113,7 @@ fn create_batch_norm_backward_gpu(
     let batch_norm_backward = BatchNorm2dBackwardBuilder::::new()
         .input(input.0, input.1, input.2, input.3, TensorFormat::NCHW)
         .unwrap()
-        .output(input.0, input.1, input.2, input.3, TensorFormat::NCHW)
+        .output_grad(input.0, input.1, input.2, input.3, TensorFormat::NCHW)
         .unwrap()
         .scale_bias_mean_var(input.1, TensorFormat::NCHW)
         .unwrap()
@@ -131,39 +129,88 @@ fn batch_norm2d_forward_train_cpu(
     bias: Matrix, DimDyn, Cpu>,
     mean: Matrix, DimDyn, Cpu>,
     variance: Matrix, DimDyn, Cpu>,
-    epsilon: f64,
     saving_mean: Matrix, DimDyn, Cpu>,
     saving_inv_variance: Matrix, DimDyn, Cpu>,
 ) {
-    let epsilon = T::from_f64(epsilon);
+    let epsilon = T::from_f64(1e-10);
     let x_transposed = x.transpose_by_index_new_matrix(&[0, 2, 3, 1]);
     let x_reshaped = x_transposed.reshape(&[
         x_transposed.shape()[0] * x_transposed.shape()[2] * x_transposed.shape()[3],
         x_transposed.shape()[1],
     ]);
-    let num_elements = T::from_usize(x_reshaped.shape()[0]); // 行数を取得
+    let num_elements = T::from_usize(x_reshaped.shape()[0]);
     let x_mean = x_reshaped.mean(Some(0), false);
     let x_diff = &x_reshaped - &x_mean;
     let x_diff_squared = &x_diff * &x_diff;
     let x_variance = x_diff_squared.mean(Some(0), false) * num_elements / (num_elements - T::one());
-    let inv_std = Matrix::<_, DimDyn, _>::ones(x_variance.shape()) / (x_variance.sqrt() + epsilon);
-    let x_hat = &x_diff * &inv_std;
-    let y_hat = x_hat * scale + bias;
-    let y_reshaped = y_hat.reshape(&[x.shape()[0], x.shape()[2], x.shape()[3], x.shape()[1]]);
-    let y_transposed = y_reshaped.transpose_by_index_new_matrix(&[0, 3, 1, 2]);
-    y.copy_from(&y_transposed);
-
     let mean_t = &x_mean * (T::one() - momentum) + &mean * momentum;
     let variance_t = x_variance * (T::one() - momentum) + &variance * momentum;
+    let inv_var = Matrix::<_, DimDyn, _>::ones(variance_t.shape()) / (&variance_t + epsilon);
+    let inv_std = inv_var.sqrt();
+
     mean.copy_from(&mean_t);
     variance.copy_from(&variance_t);
     saving_mean.copy_from(&x_mean);
     saving_inv_variance.copy_from(&inv_std);
+
+    let x_normalized = &x_diff * &inv_std;
+    let y_tmp = &x_normalized * &scale + &bias;
+    let y_transposed = y_tmp.reshape(&[
+        x_transposed.shape()[0],
+        x_transposed.shape()[2],
+        x_transposed.shape()[3],
+        x_transposed.shape()[1],
+    ]);
+    y.copy_from(&y_transposed.transpose_by_index_new_matrix(&[0, 3, 1, 2]));
+}
+
+fn batch_norm2d_backward_cpu(
+    momentum: T,
+    x: Matrix, DimDyn, Cpu>,
+    x_grad: Matrix, DimDyn, Cpu>,
+    y_grad: Matrix, DimDyn, Cpu>,
+    scale: Matrix, DimDyn, Cpu>,
+    scale_grad: Matrix, DimDyn, Cpu>,
+    bias_grad: Matrix, DimDyn, Cpu>,
+    epsilon: f64,
+    saving_mean: Matrix, DimDyn, Cpu>,
+    saving_inv_variance: Matrix, DimDyn, Cpu>,
+) {
+    let epsilon = T::from_f64(1e-10);
+    let batch_size = T::from_usize(x.shape()[0]);
+
+    let x_transposed = x.transpose_by_index_new_matrix(&[0, 2, 3, 1]);
+    let x_reshaped = x_transposed.reshape(&[
+        x_transposed.shape()[0] * x_transposed.shape()[2] * x_transposed.shape()[3],
+        x_transposed.shape()[1],
+    ]);
+
+    let y_grad_transposed = y_grad.transpose_by_index_new_matrix(&[0, 2, 3, 1]);
+    let y_grad_reshaped = y_grad_transposed.reshape(&[
+        y_grad_transposed.shape()[0] * y_grad_transposed.shape()[2] * y_grad_transposed.shape()[3],
+        y_grad_transposed.shape()[1],
+    ]);
+
+    let xc = (&x_reshaped - &saving_mean) * &saving_inv_variance;
+
+    bias_grad.copy_from(&y_grad_transposed.to_ref().sum(0, false));
+    scale_grad.copy_from(&(&xc * &y_grad_reshaped).to_ref().sum(0, false));
+
+    let tmp_x_grad = &y_grad_reshaped / batch_size - &xc * &scale_grad / batch_size;
+    let tmp_x_grad = &tmp_x_grad * &saving_inv_variance;
+
+    let x_grad_transposed = tmp_x_grad.reshape(&[
+        x_transposed.shape()[0],
+        x_transposed.shape()[2],
+        x_transposed.shape()[3],
+        x_transposed.shape()[1],
+    ]);
+    x_grad.copy_from(&x_grad_transposed.transpose_by_index_new_matrix(&[0, 3, 1, 2]));
 }
 
 pub trait BatchNormalization: Device {
@@ -175,7 +222,6 @@
         bias: Matrix, DimDyn, Self>,
         mean: Matrix, DimDyn, Self>,
         variance: Matrix, DimDyn, Self>,
-        epsilon: f64,
         saving_mean: Matrix, DimDyn, Self>,
         saving_inv_variance: Matrix, DimDyn, Self>,
         device_batch_norm: Option,
@@ -204,7 +250,6 @@ impl BatchNormalization for Cpu {
         bias: Matrix, DimDyn, Self>,
         mean: Matrix, DimDyn, Self>,
         variance: Matrix, DimDyn, Self>,
-        epsilon: f64,
         saving_mean: Matrix, DimDyn, Self>,
         saving_inv_variance: Matrix, DimDyn, Self>,
         _: Option,
@@ -217,7 +262,6 @@ impl BatchNormalization for Cpu {
             bias,
             mean,
             variance,
-            epsilon,
             saving_mean,
             saving_inv_variance,
         );
@@ -255,6 +299,7 @@ mod batch_norm {
     #[cfg(feature = "nvidia")]
    use crate::device::nvidia::Nvidia;
 
+    #[derive(Debug)]
     struct BatchNormInputs {
         x: Matrix, DimDyn, D>,
         y: Matrix, DimDyn, D>,
@@ -262,20 +307,46 @@ mod batch_norm {
         bias: Matrix, DimDyn, D>,
         mean: Matrix, DimDyn, D>,
         variance: Matrix, DimDyn, D>,
+        saved_mean: Matrix, DimDyn, D>,
+        saved_variance: Matrix, DimDyn, D>,
     }
 
     fn small_data() -> BatchNormInputs {
         let x = Matrix::, DimDyn, D>::from_vec(
            vec![
-                0., 1., 2., 3., 4., 5., 6., 7., 0., 1., 2., 3., 4., 5., 6., 7.,
+                // 0., 1., 2., 3., 4., 5., 6., 7., 0., 1., 2., 3., 4., 5., 6., 7.,
+                -1.1258398,
+                -1.1523602,
+                -0.25057858,
+                -0.4338788,
+                0.84871036,
+                0.69200915,
+                -0.31601277,
+                -2.1152194,
+                0.32227492,
+                -1.2633348,
+                0.3499832,
+                0.30813393,
+                0.11984151,
+                1.2376579,
+                1.1167772,
+                -0.24727815,
             ],
             &[2, 2, 2, 2],
         );
-        let y = Matrix::, DimDyn, D>::zeros(x.shape());
-        let scale = Matrix::, DimDyn, D>::from_vec(vec![1., 1.], [2]);
-        let bias = Matrix::, DimDyn, D>::from_vec(vec![0., 0.], [2]);
-        let mean = Matrix::, DimDyn, D>::zeros([2]);
-        let variance = Matrix::, DimDyn, D>::zeros([2]);
+        let running_mean = vec![-0.04057, 0.01670607];
+        let running_variance = vec![0.9492437, 1.0200632];
+        let saved_mean = vec![-0.04057, 0.01670607];
+        let saved_variance = vec![0.9492437, 1.0200632];
+        let scale = vec![1.0, 1.0];
+        let bias = vec![0.0, 0.0];
+        let y = Matrix::, DimDyn, D>::zeros(&[2, 2, 2, 2]);
+        let mean = Matrix::, DimDyn, D>::from_vec(running_mean, &[2]);
+        let variance = Matrix::, DimDyn, D>::from_vec(running_variance, &[2]);
+        let scale = Matrix::, DimDyn, D>::from_vec(scale, &[2]);
+        let bias = Matrix::, DimDyn, D>::from_vec(bias, &[2]);
+        let saved_mean = Matrix::, DimDyn, D>::from_vec(saved_mean, &[2]);
+        let saved_variance = Matrix::, DimDyn, D>::from_vec(saved_variance, &[2]);
         BatchNormInputs {
             x,
             y,
@@ -283,14 +354,14 @@ mod batch_norm {
             bias,
             mean,
             variance,
+            saved_mean,
+            saved_variance,
         }
     }
 
     #[test]
     fn small_cpu() {
         let mut inputs = small_data::();
-        let mut savig_mean = Matrix::, DimDyn, Cpu>::zeros(&[2]);
-        let mut saving_inv_variance = Matrix::, DimDyn, Cpu>::zeros(&[2]);
         batch_norm2d_forward_train_cpu(
             0.0,
             inputs.x.to_ref(),
@@ -299,18 +370,22 @@ mod batch_norm {
             inputs.bias.to_ref(),
             inputs.mean.to_ref_mut(),
             inputs.variance.to_ref_mut(),
-            1e-5,
-            savig_mean.to_ref_mut(),
-            saving_inv_variance.to_ref_mut(),
+            inputs.saved_mean.to_ref_mut(),
+            inputs.saved_variance.to_ref_mut(),
         );
+
+        println!("y {:?}", inputs.y);
+        println!("mean {:?}", inputs.mean);
+        println!("variance {:?}", inputs.variance);
+        println!("saved mean {:?}", inputs.saved_mean);
+        println!("saved variance {:?}", inputs.saved_variance);
+        panic!();
     }
 
     #[cfg(feature = "nvidia")]
     #[test]
     fn small_gpu() {
         let mut inputs = small_data::();
-        let mut savig_mean = Matrix::, DimDyn, Nvidia>::zeros(&[2]);
-        let mut saving_inv_variance = Matrix::, DimDyn, Nvidia>::zeros(&[2]);
         let batch_norm = BatchNorm2dBuilder::::new()
             .input(2, 2, 2, 2, TensorFormat::NCHW)
             .unwrap()
@@ -328,10 +403,16 @@ mod batch_norm {
             inputs.bias.to_ref(),
             inputs.mean.to_ref_mut(),
             inputs.variance.to_ref_mut(),
-            savig_mean.to_ref_mut(),
-            saving_inv_variance.to_ref_mut(),
-            1.,
+            inputs.saved_mean.to_ref_mut(),
+            inputs.saved_variance.to_ref_mut(),
             Some(batch_norm),
         );
+
+        println!("y {:?}", inputs.y);
+        println!("mean {:?}", inputs.mean);
+        println!("variance {:?}", inputs.variance);
+        println!("saved mean {:?}", inputs.saved_mean);
+        println!("saved variance {:?}", inputs.saved_variance);
+        panic!();
     }
 }
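
For cross-checking the values printed by small_cpu and small_gpu, a standalone, std-only Rust sketch (not the zenu-matrix API; names are illustrative) that computes the per-channel batch mean, the unbiased variance, and a textbook-normalized output for the same 2x2x2x2 NCHW input used in small_data():

// Standalone reference computation (std only, not the zenu-matrix API).
// Per-channel batch statistics for an NCHW tensor of shape [2, 2, 2, 2].
fn main() {
    // Same input values as small_data() above, in NCHW order.
    let x: [f32; 16] = [
        -1.1258398, -1.1523602, -0.25057858, -0.4338788, 0.84871036, 0.69200915, -0.31601277,
        -2.1152194, 0.32227492, -1.2633348, 0.3499832, 0.30813393, 0.11984151, 1.2376579,
        1.1167772, -0.24727815,
    ];
    let (n, c, h, w) = (2usize, 2usize, 2usize, 2usize);
    let eps = 1e-10f32;

    for ch in 0..c {
        // Collect every element of channel `ch` across the whole batch.
        let mut vals = Vec::new();
        for b in 0..n {
            for i in 0..h * w {
                vals.push(x[b * c * h * w + ch * h * w + i]);
            }
        }
        let count = vals.len() as f32;

        let mean = vals.iter().sum::<f32>() / count;
        // Biased (population) variance, as used for normalization in textbook batch norm.
        let biased_var = vals.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / count;
        // Unbiased (sample) variance, which the patched CPU path folds into the running variance.
        let unbiased_var = biased_var * count / (count - 1.0);

        // Normalized output for this channel; the tests use scale = 1 and bias = 0.
        let normalized: Vec<f32> = vals
            .iter()
            .map(|v| (v - mean) / (biased_var + eps).sqrt())
            .collect();

        println!("channel {ch}: mean = {mean:.7}, unbiased variance = {unbiased_var:.7}");
        println!("channel {ch}: normalized = {normalized:?}");
    }
}

Because the tests call the CPU path with momentum = 0.0, the running mean and variance written back should coincide with the per-channel batch statistics this sketch prints.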