From 5a46ab0e10d3bceeca5d92c98387055a4fa7a2c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= <mans.nilsson@arm.com>
Date: Fri, 1 Nov 2024 20:57:00 +0100
Subject: [PATCH 1/5] Add kernels optimized for size flag to FC and SVDF
 (#2734)

The KERNELS_OPTIMIZED_FOR_SIZE flag provides an alternative implementation where size is prioritized over latency.

With the size option (the speed option is the default), the CMSIS-NN kernels calculate the kernel sums during inference instead of precomputing them in Prepare.

BUG=no bug but this will let users prioritize speed vs size even more
---
 .../micro/kernels/cmsis_nn/fully_connected.cc | 11 +++++--
 .../lite/micro/kernels/cmsis_nn/svdf.cc       | 29 +++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc b/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc
index 8e6fc5a9ccb..7b4e1319532 100644
--- a/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc
@@ -148,15 +148,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     } else if (input->type == kTfLiteInt8) {
       buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
 
-      int8_t* filter_data = GetTensorData<int8_t>(filter);
       data->kernel_sums = nullptr;
 
+#if defined(KERNELS_OPTIMIZED_FOR_SPEED)
+      const int8_t* filter_data = GetTensorData<const int8_t>(filter);
+
       if (buf_size > 0 && filter_data != nullptr) {
+        const int32_t input_offset = -data->reference_op_data.input_zero_point;
+        const int32_t filter_offset =
+            -data->reference_op_data.filter_zero_point;
+
         data->kernel_sums = static_cast<int32_t*>(
             context->AllocatePersistentBuffer(context, buf_size));
 
-        int32_t input_offset = -data->reference_op_data.input_zero_point;
-        int32_t filter_offset = -data->reference_op_data.filter_zero_point;
         arm_vector_sum_s8(data->kernel_sums, filter_dims.n, data->output_depth,
                           filter_data, input_offset, filter_offset,
                           tflite::GetTensorData<int32_t>(bias));
@@ -164,6 +168,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         // Do not request a scratch buffer since using persistent memory
         buf_size = 0;
       }
+#endif
     }
   }
 
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc b/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc
index d39ae616c0f..b48dcb4a69d 100644
--- a/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc
@@ -39,6 +39,9 @@ struct CmsisNnOpDataSvdf {
   int effective_scale_1_b;
   int effective_scale_2_b;
   int scratch_tensor_index;
+#if defined(KERNELS_OPTIMIZED_FOR_SIZE)
+  int scratch_weight_tensor_index;
+#endif
   int scratch_output_tensor_index;
 
   // Cached tensor zero point values for quantized operations.
@@ -189,6 +192,7 @@ TfLiteStatus CmsisNnPrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
     const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims);
 
     if (buf_size > 0) {
+#if defined(KERNELS_OPTIMIZED_FOR_SPEED)
       data->kernel_sums = static_cast<int32_t*>(
           context->AllocatePersistentBuffer(context, buf_size));
 
@@ -196,6 +200,17 @@ TfLiteStatus CmsisNnPrepareSvdf(TfLiteContext* context, TfLiteNode* node) {
                         GetTensorData<int8_t>(weights_feature),
                         -data->input_zero_point,
                         -data->activation_state_zero_point, nullptr);
+#elif defined(KERNELS_OPTIMIZED_FOR_SIZE)
+      const TfLiteStatus scratch_kernel_status =
+          context->RequestScratchBufferInArena(
+              context, buf_size, &(data->scratch_weight_tensor_index));
+      TF_LITE_ENSURE_OK(context, scratch_kernel_status);
+#else
+      MicroPrintf(
+          "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED "
+          "must be defined");
+      return kTfLiteError;
+#endif
     }
 
   } else {
@@ -291,7 +306,21 @@ TfLiteStatus EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
   switch (weights_time_tensor->type) {
     case kTfLiteInt8: {
       cmsis_nn_context ctx;
+
+#if defined(KERNELS_OPTIMIZED_FOR_SPEED)
       ctx.buf = data.kernel_sums;
+#elif defined(KERNELS_OPTIMIZED_FOR_SIZE)
+      ctx.buf = static_cast<int32_t*>(
+          context->GetScratchBuffer(context, data.scratch_weight_tensor_index));
+
+      const int input_size = input_tensor->dims->data[1];
+      const int num_filters = weights_feature_tensor->dims->data[0];
+
+      arm_vector_sum_s8(
+          static_cast<int32_t*>(ctx.buf), input_size, num_filters,
+          tflite::micro::GetTensorData<int8_t>(weights_feature_tensor),
+          -data.input_zero_point, -data.activation_state_zero_point, nullptr);
+#endif
 
       arm_svdf_s8(
           &ctx, &scratch_ctx, &scratch_output_ctx, &svdf_params,

From 8eb6b23de4470d6a8da3131650d6a67514dfa130 Mon Sep 17 00:00:00 2001
From: Ryan Kuester <kuester@bdti.com>
Date: Fri, 1 Nov 2024 17:10:05 -0500
Subject: [PATCH 2/5] build(bazel): add integrity check to nnlib_hifi4 download
 (#2743)

build(bazel): add integrity check to nnlib_hifi4 download

Add an integrity check to the http_archive() download of nnlib_hifi4 in
order to make the build more hermetic, reduce the security risk that a
remote file changes, and silence the noisy warning on the console during
the build:

    DEBUG: Rule 'nnlib_hifi4' indicated that a canonical reproducible
    form can be obtained by modifying arguments integrity[....]

BUG=description
---
 WORKSPACE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/WORKSPACE b/WORKSPACE
index e50f8cb0eef..cc6c1a83d72 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -101,6 +101,7 @@ py_pkg_cc_deps(
 http_archive(
     name = "nnlib_hifi4",
     build_file = "@tflite_micro//third_party/xtensa/nnlib_hifi4:nnlib_hifi4.BUILD",
+    integrity = "sha256-ulZ+uY4dRsbDUMZbZtD972eghclWQrqYRb0Y4Znfyyc=",
     strip_prefix = "nnlib-hifi4-34f5f995f28d298ae2b6e2ba6e76c32a5cb34989",
     urls = ["https://github.com/foss-xtensa/nnlib-hifi4/archive/34f5f995f28d298ae2b6e2ba6e76c32a5cb34989.zip"],
 )

From 694d25007c3ca9c9b79e6af1f51a969798425979 Mon Sep 17 00:00:00 2001
From: Ryan Kuester <kuester@bdti.com>
Date: Tue, 5 Nov 2024 15:48:52 -0600
Subject: [PATCH 3/5] chore: remove obsolete ci/temp_patches (#2744)

chore: remove obsolete ci/temp_patches

Remove ci/temp_patches, which was obsoleted in 23f608fd once it
was no longer used by the sync script. It should have been
deleted then.

Remove it not only to clean up dead code, but because it contains
a reference to `micro_copts`, which is about to be refactored
away, and we don't want to leave stray references to it in the
tree.

BUG=#2636
---
 ci/temp_patches/tf_update_visibility.patch | 34 ----------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 ci/temp_patches/tf_update_visibility.patch

diff --git a/ci/temp_patches/tf_update_visibility.patch b/ci/temp_patches/tf_update_visibility.patch
deleted file mode 100644
index a98061eac53..00000000000
--- a/ci/temp_patches/tf_update_visibility.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD
-index 22bcf2b1efd..6f27814f6a2 100644
---- a/tensorflow/lite/micro/kernels/BUILD
-+++ b/tensorflow/lite/micro/kernels/BUILD
-@@ -1,7 +1,5 @@
--load(
--    "//tensorflow/lite/micro:build_def.bzl",
--    "micro_copts",
--)
-+load("//tensorflow/lite/micro:build_def.bzl", "micro_copts")
-+load("//tensorflow:extra_rules.bzl", "tflm_kernel_friends")
- 
- package(
-     features = ["-layering_check"],
-@@ -23,6 +21,11 @@ package_group(
-     packages = ["//tensorflow/lite/micro"],
- )
- 
-+package_group(
-+    name = "kernel_friends",
-+    packages = tflm_kernel_friends(),
-+)
-+
- ####################################
- # C++ libraries
- ####################################
-@@ -245,6 +248,7 @@ cc_library(
-     ],
-     hdrs = ["kernel_util.h"],
-     visibility = [
-+        ":kernel_friends",
-         ":micro",
-     ],
-     deps = [

From 45cd79be34e5572d8105b7238e065d0309b96af7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= <mans.nilsson@arm.com>
Date: Tue, 5 Nov 2024 23:13:20 +0100
Subject: [PATCH 4/5] Replace CoreDebug with DCB (#2746)

BUG=The CMSIS CoreDebug macro is deprecated.
---
 .../lite/micro/cortex_m_corstone_300/system_setup.cc     | 9 ++++-----
 tensorflow/lite/micro/cortex_m_generic/micro_time.cc     | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
index 64733401cca..3ff84214309 100644
--- a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
+++ b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc
@@ -14,12 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef ETHOS_U
+#include <ethosu_driver.h>
 #include <inttypes.h>
+#include <pmu_ethosu.h>
 
 #include <algorithm>
-
-#include "ethosu_driver.h"
-#include "pmu_ethosu.h"
 #endif
 
 // This is set in micro/tools/make/targets/cortex_m_corstone_300_makefile.inc.
@@ -133,7 +132,7 @@ void InitializeTarget() {
   ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);
 
 #else
-  CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+  DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk;
 
   // Reset and enable DWT cycle counter.
   DWT->CYCCNT = 0;
@@ -154,7 +153,7 @@ void InitializeTarget() {
     return;
   }
   NVIC_SetVector(static_cast<IRQn_Type>(ethosu_irq),
-                 (uint32_t)&ethosuIrqHandler0);
+                 reinterpret_cast<uint32_t>(&ethosuIrqHandler0));
   NVIC_SetPriority(static_cast<IRQn_Type>(ethosu_irq), ethosu_irq_priority);
   NVIC_EnableIRQ(static_cast<IRQn_Type>(ethosu_irq));
 #endif
diff --git a/tensorflow/lite/micro/cortex_m_generic/micro_time.cc b/tensorflow/lite/micro/cortex_m_generic/micro_time.cc
index 265bd349c2e..f580129daf1 100644
--- a/tensorflow/lite/micro/cortex_m_generic/micro_time.cc
+++ b/tensorflow/lite/micro/cortex_m_generic/micro_time.cc
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -52,7 +52,7 @@ uint32_t GetCurrentTimeTicks() {
 #ifdef ARMCM7
     DWT->LAR = 0xC5ACCE55;
 #endif
-    CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+    DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk;
 
     // Reset and DWT cycle counter.
     DWT->CYCCNT = 0;

From 4bb78c700e5c56139f4bab700ea2f74f5455e841 Mon Sep 17 00:00:00 2001
From: Ryan Kuester <kuester@bdti.com>
Date: Tue, 5 Nov 2024 19:11:45 -0600
Subject: [PATCH 5/5] fix(create_tflm_tree): remove recent tests from exported
 tree (#2751)

Remove the recently added span_test.cc and static_vector_test.cc from
the files exported by the create_tflm_tree.py project generation process
by adding them to the list of tests in the Makefile. Unit tests are not
meant to be included in exported trees; they may include files that are
not exported.

This change also ensures that these tests are included when `make test`
is run.

BUG=fixes #2718
---
 tensorflow/lite/micro/tools/make/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 3bf2b549316..d9f150d3f96 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -330,6 +330,8 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_resource_variable_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_time_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_utils_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/recording_micro_allocator_test.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/span_test.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/static_vector_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/recording_single_arena_buffer_allocator_test.cc \