Merge pull request pulp-platform#6 from pulp-platform/main
Rebase main branch to updated main of original repo
diaconuccalin authored Jun 6, 2024
2 parents 216520e + 759b56a commit f5c6621
Showing 22 changed files with 1,238 additions and 117 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -88,6 +88,11 @@ The `tools/` folder contains useful tools which ease the usage of PULP-TrainLib,
The `assets/` folder contains useful support files for PULP-TrainLib. Inside [CI_test_suite](assets/CI_test_suite/), users can find a testing environment that can be used to verify PULP-TrainLib's primitives for Continuous Integration.


# Tutorials

To learn how to generate code with our TrainLib_Deployer, and for more details on the optimizations used in this library, a [tutorial repository](https://github.com/dnadalini/PULP-TrainLib-Tutorial) is available online. It contains tutorials and a guide to easily set up a conda environment with all the requirements needed to run PULP-TrainLib.



# Installation and requirements

@@ -137,6 +142,7 @@ To add new functionalities, users can follow the naming convention of PULP-Train

PULP-TrainLib's repository is organized with these branches:
- `main`: main branch, targeting PULP architectures.
- `trainlib-tutorial`: branch reserved for tutorial purposes (see [https://github.com/dnadalini/PULP-TrainLib-Tutorial](https://github.com/dnadalini/PULP-TrainLib-Tutorial)).
- `pulp-trainlib-paper`: branch to reproduce the results provided in the paper ["PULP-TrainLib: Enabling On-Device Training for RISC-V Multi-Core MCUs through Performance-Driven Autotuning"](https://www.samos-conference.com/Resources_Samos_Websites/Proceedings_Repository_SAMOS/2022/Papers/Paper_14.pdf).
- `pulp-trainlib-stm32`: this is a PULP-TrainLib port compatible with STM32 and other MCUs (FP32 format only).

Expand Down Expand Up @@ -177,7 +183,6 @@ PULP-TrainLib's repository is organized with these branches:
- Performance bugs in im2col/im2row with DMA loading (performance tends to be lower than im2col/im2row with cores)
- Missing integration for RNN / MHSA in TrainLib_Deployer
- FP32 MHSA primitives (Input Grad)
- FP32 and FP16 InstanceNorm's output do not perfectly match PyTorch ones (need bugfixing)
- Missing integration of sigmoid function in TrainLib_Deployer
- Performance of the FP16 sigmoid may need to be optimized with an FP16 exponential (e.g., https://github.com/0xBYTESHIFT/fp16/blob/master/include/half/half.hpp); see the sketch below
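
As an illustration of the kind of FP16 optimization suggested in the last item (the linked `half.hpp` is C++, while PULP-TrainLib is C), here is a rough, hypothetical sketch of a sigmoid built on a low-order series for the exponential. It is not library code: the function name is invented, and both the accuracy and `fp16` arithmetic support on the PULP toolchain would need to be verified.

```c
// Rough sketch only (not library code): FP16 sigmoid via a 4th-order series
// for e^t on t = |x|, with saturated tails. Accuracy and fp16 arithmetic
// support on the target toolchain must be verified.
#include "pulp_train_defines.h"   // provides the fp16 typedef

static inline fp16 fast_sigmoid_fp16(fp16 x)
{
    fp16 t = (x >= (fp16) 0.0f) ? x : (fp16) (-x);         // t = |x|
    if (t > (fp16) 4.0f) t = (fp16) 4.0f;                   // saturate the tails
    fp16 t2 = t * t;
    // e^t ~= 1 + t + t^2/2 + t^3/6 + t^4/24   (usable for t in [0, 4])
    fp16 e = (fp16) 1.0f + t + (fp16) 0.5f * t2
           + (fp16) 0.166667f * t2 * t + (fp16) 0.041667f * t2 * t2;
    fp16 s = e / ((fp16) 1.0f + e);                          // sigmoid(|x|)
    return (x >= (fp16) 0.0f) ? s : (fp16) ((fp16) 1.0f - s);
}
```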

@@ -191,6 +196,7 @@ PULP-TrainLib's repository is organized with these branches:
- Manuele Rusci ([email protected])
- Francesco Conti ([email protected])
- Cristian Cioflan ([email protected])
- Luca Bompani ([email protected])

## Past Contributors

90 changes: 90 additions & 0 deletions lib/include/pulp_batchnorm_fp32.h
@@ -0,0 +1,90 @@
/*
* Copyright (C) 2021-2022 ETH Zurich and University of Bologna
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* Authors: Davide Nadalini
*/


/**
 * Batch Norm layer configuration structure
*/

/**
 * @brief Structure for Batch Norm Training in FP32
 * @param input input feature maps for the BatchNorm layer
 * @param output output feature maps for the BatchNorm layer
 * @param coeff coefficients to compute normalization; biases are included
* @param batch_size size of the batch to be processed by the BatchNorm layer
* @param running_mean array of running means computed during the forward step
* @param running_var array of running variances computed during the forward step
* @param running_stdev array of running standard deviations computed during the forward step
* @param freeze_running_params if 1, freezes running mean and variance
* @param skip_wg_grad skips the computation of the weight grad
* @param skip_in_grad skips the computation of the input grad (1st DNN layer)
*/
struct BatchNorm_args {
struct blob * input;
struct blob * output;
struct blob * coeff;
int batch_size;
float * running_mean;
float * running_var;
float * running_stdev;
int freeze_running_params;
int skip_wg_grad;
int skip_in_grad;
};

/**
* @brief Forward function that calls the parallelized version
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_fp32_fw_cl( void * BatchNorm_args );

/**
* @brief Function that calls both input and param gradient functions
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_fp32_bw_cl( void * BatchNorm_args );

/**
* @brief Backward param gradient function that calls the parallelized version
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_fp32_bw_param_grads_cl( void * BatchNorm_args );

/**
* @brief Backward input gradient function that calls the parallelized version
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_fp32_bw_input_grads_cl( void * BatchNorm_args );

/**
* @brief Forward backend function parallelized on multicore
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_parallelized_fp32_fw_cl( void * BatchNorm_args );
/**
* @brief Backward backend function for input gradients parallelized on multicore
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_parallelized_fp32_bw_input_grads_cl( void * BatchNorm_args );
/**
* @brief Backward backend function for parameters gradients parallelized on multicore
 * @param (void *) (struct BatchNorm_args void_args)
*/
void pulp_batchnorm_parallelized_fp32_bw_param_grads_cl( void * BatchNorm_args );
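
For orientation, here is a minimal usage sketch of the new FP32 BatchNorm API introduced by this file (the sketch itself is not part of the commit). It assumes the `blob` tensors and the running-statistics buffers are allocated and initialized elsewhere, that `pulp_batchnorm_fp32.h` is included explicitly (its include in `pulp_train.h` is still commented out in this commit), and that the batch size is an arbitrary example value.

```c
// Minimal usage sketch (assumptions noted above; not library or test code).
#include "pulp_train.h"
#include "pulp_batchnorm_fp32.h"

void batchnorm_train_step(struct blob *in, struct blob *out, struct blob *coeff,
                          float *run_mean, float *run_var, float *run_stdev)
{
    struct BatchNorm_args bn_args;
    bn_args.input  = in;                 // input feature maps
    bn_args.output = out;                // normalized output feature maps
    bn_args.coeff  = coeff;              // normalization coefficients (biases included)
    bn_args.batch_size = 4;              // example batch size
    bn_args.running_mean  = run_mean;
    bn_args.running_var   = run_var;
    bn_args.running_stdev = run_stdev;
    bn_args.freeze_running_params = 0;   // update running statistics while training
    bn_args.skip_wg_grad = 0;            // compute the weight gradient
    bn_args.skip_in_grad = 0;            // compute the input gradient

    pulp_batchnorm_fp32_fw_cl((void *) &bn_args);   // forward pass
    pulp_batchnorm_fp32_bw_cl((void *) &bn_args);   // backward pass (both gradients)
}
```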
18 changes: 10 additions & 8 deletions lib/include/pulp_instnorm_fp16.h
@@ -15,7 +15,7 @@
*/

/**
* Authors: Giacomo Saporetti
* Authors: Giacomo Saporetti, Davide Nadalini
*/


@@ -29,6 +29,7 @@
 * @param output output feature maps for the InstanceNorm layer
 * @param coeff coefficients to compute normalization; biases are included
* @param running_mean array of running means computed during the forward step
* @param running_var array of running variances computed during the forward step
* @param running_stdev array of running standard deviations computed during the forward step
* @param freeze_running_params if 1, freezes running mean and variance
* @param skip_wg_grad skips the computation of the weight grad
@@ -39,48 +40,49 @@ struct InstNorm_args_fp16 {
struct blob_fp16 * output;
struct blob_fp16 * coeff;
fp16 * running_mean;
fp16 * running_var;
fp16 * running_stdev;
int freeze_running_params;
int skip_wg_grad;
int skip_in_grad;
};

/**
* @brief Dummy forward function that calls the parallelized version
* @brief Forward function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp16_fw_cl( void * InstNorm_args_fp16 );

/**
* @brief Backward function that calls both input and param gradient functions
* @brief Function that calls both input and param gradient functions
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp16_bw_cl( void * InstNorm_args_fp16 );

/**
* @brief Dummy backward param gradient function that calls the parallelized version
* @brief Backward param gradient function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp16_bw_param_grads_cl( void * InstNorm_args_fp16 );

/**
* @brief Dummy backward input gradient function that calls the parallelized version
* @brief Backward input gradient function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp16_bw_input_grads_cl( void * InstNorm_args_fp16 );

/**
* @brief Real forward function parallelized on multicore
* @brief Forward backend function parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp16_fw_cl( void * InstNorm_args_fp16 );
/**
* @brief Real bacward function for input gradients parallelized on multicore
* @brief Backward backend function for input gradients parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp16_bw_input_grads_cl( void * InstNorm_args_fp16 );
/**
* @brief Real bacward function for parameters gradients parallelized on multicore
* @brief Backward backend function for parameters gradients parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp16_bw_param_grads_cl( void * InstNorm_args_fp16 );
18 changes: 10 additions & 8 deletions lib/include/pulp_instnorm_fp32.h
@@ -15,7 +15,7 @@
*/

/**
* Authors: Giacomo Saporetti
* Authors: Giacomo Saporetti, Davide Nadalini
*/


@@ -29,6 +29,7 @@
 * @param output output feature maps for the InstanceNorm layer
 * @param coeff coefficients to compute normalization; biases are included
* @param running_mean array of running means computed during the forward step
* @param running_var array of running variances computed during the forward step
* @param running_stdev array of running standard deviations computed during the forward step
* @param freeze_running_params if 1, freezes running mean and variance
* @param skip_wg_grad skips the computation of the weight grad
@@ -39,48 +40,49 @@ struct InstNorm_args {
struct blob * output;
struct blob * coeff;
float * running_mean;
float * running_var;
float * running_stdev;
int freeze_running_params;
int skip_wg_grad;
int skip_in_grad;
};

/**
* @brief Dummy forward function that calls the parallelized version
* @brief Forward function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp32_fw_cl( void * InstNorm_args );

/**
* @brief Backward function that calls both input and param gradient functions
* @brief Function that calls both input and param gradient functions
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp32_bw_cl( void * InstNorm_args );

/**
* @brief Dummy backward param gradient function that calls the parallelized version
* @brief Backward param gradient function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp32_bw_param_grads_cl( void * InstNorm_args );

/**
* @brief Dummy backward input gradient function that calls the parallelized version
* @brief Backward input gradient function that calls the parallelized version
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_fp32_bw_input_grads_cl( void * InstNorm_args );

/**
* @brief Real forward function parallelized on multicore
* @brief Forward backend function parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp32_fw_cl( void * InstNorm_args );
/**
* @brief Real bacward function for input gradients parallelized on multicore
* @brief Backward backend function for input gradients parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp32_bw_input_grads_cl( void * InstNorm_args );
/**
* @brief Real bacward function for parameters gradients parallelized on multicore
* @brief Backward backend function for parameters gradients parallelized on multicore
* @param (void *) (struct InstNorm_args void_args)
*/
void pulp_instnorm_parallelized_fp32_bw_param_grads_cl( void * InstNorm_args );
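
A similar, hypothetical sketch for the FP32 InstanceNorm API (not part of the commit) shows how the `freeze_running_params` and `skip_in_grad` flags documented above might be used when the layer sits first in the network; blob and buffer setup is again assumed to happen elsewhere.

```c
// Illustrative sketch only: FP32 InstanceNorm as the first layer, with frozen
// running statistics. Setup of blobs and statistics buffers is assumed elsewhere.
#include "pulp_train.h"

void instnorm_first_layer_step(struct blob *in, struct blob *out, struct blob *coeff,
                               float *run_mean, float *run_var, float *run_stdev)
{
    struct InstNorm_args in_args;
    in_args.input  = in;
    in_args.output = out;
    in_args.coeff  = coeff;
    in_args.running_mean  = run_mean;
    in_args.running_var   = run_var;     // field added by this commit
    in_args.running_stdev = run_stdev;
    in_args.freeze_running_params = 1;   // keep running statistics fixed
    in_args.skip_wg_grad = 0;            // still learn the normalization coefficients
    in_args.skip_in_grad = 1;            // first DNN layer: input gradient not needed

    pulp_instnorm_fp32_fw_cl((void *) &in_args);
    // pulp_instnorm_fp32_bw_cl calls both gradient functions; the skip flags
    // above are expected to suppress the corresponding computations.
    pulp_instnorm_fp32_bw_cl((void *) &in_args);
}
```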
2 changes: 2 additions & 0 deletions lib/include/pulp_train.h
@@ -30,6 +30,7 @@
#include "pulp_train_utils_fp32.h"
// FP32 primitives
#include "pulp_act_fp32.h"
//#include "pulp_batchnorm_fp32.h"
#include "pulp_conv_dw_fp32.h"
#include "pulp_conv_pw_fp32.h"
#include "pulp_conv2d_fp32.h"
@@ -50,6 +51,7 @@
#include "pulp_train_utils_fp16.h"
// FP16 primitives
#include "pulp_act_fp16.h"
//#include "pulp_batchnorm_fp16.h"
#include "pulp_conv_dw_fp16.h"
#include "pulp_conv_pw_fp16.h"
#include "pulp_conv2d_fp16.h"
2 changes: 1 addition & 1 deletion lib/include/pulp_train_defines.h
@@ -29,7 +29,7 @@
* @defgroup Data formats
* @{
*/
typedef float16alt fp16; // Standard IEEE FP16 format
typedef float16alt fp16; // FP16 format (float16 or float16alt)
typedef fp16 v2f16 __attribute__((vector_size (4))); // Vectorized fp16 for SIMD
/**
* @}
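
To illustrate the two types defined in this header, here is a small standalone sketch (not library code; the helper name and values are invented). It packs two `fp16` values into a `v2f16` vector and uses GCC vector-extension semantics for an element-wise multiply followed by a two-lane reduction.

```c
// Illustrative sketch: scalar fp16 and the v2f16 SIMD vector from
// pulp_train_defines.h, using GCC vector-extension semantics.
#include "pulp_train_defines.h"

static inline fp16 dot2_fp16(v2f16 a, v2f16 b)
{
    v2f16 p = a * b;        // element-wise SIMD multiply
    return p[0] + p[1];     // horizontal reduction of the two lanes
}

void fp16_simd_example(void)
{
    v2f16 x = { (fp16) 1.5f, (fp16) -0.25f };
    v2f16 w = { (fp16) 0.5f, (fp16)  2.0f  };
    fp16 acc = dot2_fp16(x, w);   // 0.75 + (-0.5) = 0.25
    (void) acc;                   // silence unused-variable warnings
}
```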