From 5a46ab0e10d3bceeca5d92c98387055a4fa7a2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 1 Nov 2024 20:57:00 +0100 Subject: [PATCH 1/5] Add kernels optimized for size flag to FC and SVDF (#2734) The kernels optimized for size flag provides an alternative implementation where size is prioritized over latency. For size option (speed option is default) it means the CMSIS-NN kernels are calculating kernel sums during inference. BUG=no bug but this will let users prioritize speed vs size even more --- .../micro/kernels/cmsis_nn/fully_connected.cc | 11 +++++-- .../lite/micro/kernels/cmsis_nn/svdf.cc | 29 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc b/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc index 8e6fc5a9ccb..7b4e1319532 100644 --- a/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/cmsis_nn/fully_connected.cc @@ -148,15 +148,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } else if (input->type == kTfLiteInt8) { buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); - int8_t* filter_data = GetTensorData<int8_t>(filter); data->kernel_sums = nullptr; +#if defined(KERNELS_OPTIMIZED_FOR_SPEED) + const int8_t* filter_data = GetTensorData<int8_t>(filter); + if (buf_size > 0 && filter_data != nullptr) { + const int32_t input_offset = -data->reference_op_data.input_zero_point; + const int32_t filter_offset = + -data->reference_op_data.filter_zero_point; + data->kernel_sums = static_cast<int32_t*>( context->AllocatePersistentBuffer(context, buf_size)); - int32_t input_offset = -data->reference_op_data.input_zero_point; - int32_t filter_offset = -data->reference_op_data.filter_zero_point; arm_vector_sum_s8(data->kernel_sums, filter_dims.n, data->output_depth, filter_data, input_offset, filter_offset, tflite::GetTensorData<int32_t>(bias)); @@ -164,6 +168,7 @@ TfLiteStatus Prepare(TfLiteContext* 
context, TfLiteNode* node) { // Do not request a scratch buffer since using persistent memory buf_size = 0; } +#endif } } diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc b/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc index d39ae616c0f..b48dcb4a69d 100644 --- a/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc +++ b/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cc @@ -39,6 +39,9 @@ struct CmsisNnOpDataSvdf { int effective_scale_1_b; int effective_scale_2_b; int scratch_tensor_index; +#if defined(KERNELS_OPTIMIZED_FOR_SIZE) + int scratch_weight_tensor_index; +#endif int scratch_output_tensor_index; // Cached tensor zero point values for quantized operations. @@ -189,6 +192,7 @@ TfLiteStatus CmsisNnPrepareSvdf(TfLiteContext* context, TfLiteNode* node) { const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims); if (buf_size > 0) { +#if defined(KERNELS_OPTIMIZED_FOR_SPEED) data->kernel_sums = static_cast<int32_t*>( context->AllocatePersistentBuffer(context, buf_size)); @@ -196,6 +200,17 @@ TfLiteStatus CmsisNnPrepareSvdf(TfLiteContext* context, TfLiteNode* node) { GetTensorData<int8_t>(weights_feature), -data->input_zero_point, -data->activation_state_zero_point, nullptr); +#elif defined(KERNELS_OPTIMIZED_FOR_SIZE) + const TfLiteStatus scratch_kernel_status = + context->RequestScratchBufferInArena( + context, buf_size, &(data->scratch_weight_tensor_index)); + TF_LITE_ENSURE_OK(context, scratch_kernel_status); +#else + MicroPrintf( + "Either KERNELS_OPTIMIZED_FOR_SIZE or KERNELS_OPTIMIZED_FOR_SPEED " + "must be defined"); + return kTfLiteError; +#endif } } else { @@ -291,7 +306,21 @@ TfLiteStatus EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node, switch (weights_time_tensor->type) { case kTfLiteInt8: { cmsis_nn_context ctx; + +#if defined(KERNELS_OPTIMIZED_FOR_SPEED) ctx.buf = data.kernel_sums; +#elif defined(KERNELS_OPTIMIZED_FOR_SIZE) + ctx.buf = static_cast<int32_t*>( + context->GetScratchBuffer(context, data.scratch_weight_tensor_index)); + + const int 
input_size = input_tensor->dims->data[1]; + const int num_filters = weights_feature_tensor->dims->data[0]; + + arm_vector_sum_s8( + static_cast<int32_t*>(ctx.buf), input_size, num_filters, + tflite::micro::GetTensorData<int8_t>(weights_feature_tensor), + -data.input_zero_point, -data.activation_state_zero_point, nullptr); +#endif arm_svdf_s8( &ctx, &scratch_ctx, &scratch_output_ctx, &svdf_params, From 8eb6b23de4470d6a8da3131650d6a67514dfa130 Mon Sep 17 00:00:00 2001 From: Ryan Kuester Date: Fri, 1 Nov 2024 17:10:05 -0500 Subject: [PATCH 2/5] build(bazel): add integrity check to nnlib_hifi4 download (#2743) build(bazel): add integrity check to nnlib_hifi4 download Add an integrity check to the http_archive() download of nnlib_hifi4 in order to make the build more hermetic, reduce the security risk that a remote file changes, and silence the noisy warning on the console during the build: DEBUG: Rule 'nnlib_hifi4' indicated that a canonical reproducible form can be obtained by modifying arguments integrity[....] BUG=description --- WORKSPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/WORKSPACE b/WORKSPACE index e50f8cb0eef..cc6c1a83d72 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -101,6 +101,7 @@ py_pkg_cc_deps( http_archive( name = "nnlib_hifi4", build_file = "@tflite_micro//third_party/xtensa/nnlib_hifi4:nnlib_hifi4.BUILD", + integrity = "sha256-ulZ+uY4dRsbDUMZbZtD972eghclWQrqYRb0Y4Znfyyc=", strip_prefix = "nnlib-hifi4-34f5f995f28d298ae2b6e2ba6e76c32a5cb34989", urls = ["https://github.com/foss-xtensa/nnlib-hifi4/archive/34f5f995f28d298ae2b6e2ba6e76c32a5cb34989.zip"], ) From 694d25007c3ca9c9b79e6af1f51a969798425979 Mon Sep 17 00:00:00 2001 From: Ryan Kuester Date: Tue, 5 Nov 2024 15:48:52 -0600 Subject: [PATCH 3/5] chore: remove obsolete ci/temp_patches (#2744) chore: remove obsolete ci/temp_patches Remove ci/temp_patches, which was obsoleted in 23f608fd once it was no longer used by the sync script. It should have been deleted then. 
Remove it not only to clean up dead code, but because it contains a reference to `micro_copts`, which is about to be refactored away, and we don't want to leave stray references to it in the tree. BUG=#2636 --- ci/temp_patches/tf_update_visibility.patch | 34 ---------------------- 1 file changed, 34 deletions(-) delete mode 100644 ci/temp_patches/tf_update_visibility.patch diff --git a/ci/temp_patches/tf_update_visibility.patch b/ci/temp_patches/tf_update_visibility.patch deleted file mode 100644 index a98061eac53..00000000000 --- a/ci/temp_patches/tf_update_visibility.patch +++ /dev/null @@ -1,34 +0,0 @@ -diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD -index 22bcf2b1efd..6f27814f6a2 100644 ---- a/tensorflow/lite/micro/kernels/BUILD -+++ b/tensorflow/lite/micro/kernels/BUILD -@@ -1,7 +1,5 @@ --load( -- "//tensorflow/lite/micro:build_def.bzl", -- "micro_copts", --) -+load("//tensorflow/lite/micro:build_def.bzl", "micro_copts") -+load("//tensorflow:extra_rules.bzl", "tflm_kernel_friends") - - package( - features = ["-layering_check"], -@@ -23,6 +21,11 @@ package_group( - packages = ["//tensorflow/lite/micro"], - ) - -+package_group( -+ name = "kernel_friends", -+ packages = tflm_kernel_friends(), -+) -+ - #################################### - # C++ libraries - #################################### -@@ -245,6 +248,7 @@ cc_library( - ], - hdrs = ["kernel_util.h"], - visibility = [ -+ ":kernel_friends", - ":micro", - ], - deps = [ From 45cd79be34e5572d8105b7238e065d0309b96af7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 5 Nov 2024 23:13:20 +0100 Subject: [PATCH 4/5] Replace CoreDebug with DCB (#2746) BUG=The CMSIS CoreDebug macro is deprecated. 
--- .../lite/micro/cortex_m_corstone_300/system_setup.cc | 9 ++++----- tensorflow/lite/micro/cortex_m_generic/micro_time.cc | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc index 64733401cca..3ff84214309 100644 --- a/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc +++ b/tensorflow/lite/micro/cortex_m_corstone_300/system_setup.cc @@ -14,12 +14,11 @@ limitations under the License. ==============================================================================*/ #ifdef ETHOS_U +#include <ethosu_driver.h> #include <inttypes.h> +#include <pmu_ethosu.h> #include <stdio.h> - -#include "ethosu_driver.h" -#include "pmu_ethosu.h" #endif // This is set in micro/tools/make/targets/cortex_m_corstone_300_makefile.inc. @@ -133,7 +132,7 @@ void InitializeTarget() { ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); #else - CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Reset and enable DWT cycle counter. DWT->CYCCNT = 0; @@ -154,7 +153,7 @@ void InitializeTarget() { return; } NVIC_SetVector(static_cast<IRQn_Type>(ethosu_irq), - (uint32_t)&ethosuIrqHandler0); + reinterpret_cast<uint32_t>(&ethosuIrqHandler0)); NVIC_SetPriority(static_cast<IRQn_Type>(ethosu_irq), ethosu_irq_priority); NVIC_EnableIRQ(static_cast<IRQn_Type>(ethosu_irq)); #endif diff --git a/tensorflow/lite/micro/cortex_m_generic/micro_time.cc b/tensorflow/lite/micro/cortex_m_generic/micro_time.cc index 265bd349c2e..f580129daf1 100644 --- a/tensorflow/lite/micro/cortex_m_generic/micro_time.cc +++ b/tensorflow/lite/micro/cortex_m_generic/micro_time.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ uint32_t GetCurrentTimeTicks() { #ifdef ARMCM7 DWT->LAR = 0xC5ACCE55; #endif - CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Reset and DWT cycle counter. DWT->CYCCNT = 0; From 4bb78c700e5c56139f4bab700ea2f74f5455e841 Mon Sep 17 00:00:00 2001 From: Ryan Kuester Date: Tue, 5 Nov 2024 19:11:45 -0600 Subject: [PATCH 5/5] fix(create_tflm_tree): remove recent tests from exported tree (#2751) Remove the recently added span_test.cc and static_vector_test.cc from the files exported by the create_tflm_tree.py project generation process by adding them to the list of tests in the Makefile. Unit tests are not meant to be included in exported trees; they may include files that are not exported. This change also ensures that these tests are included when `make test` is run. BUG=fixes #2718 --- tensorflow/lite/micro/tools/make/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 3bf2b549316..d9f150d3f96 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -330,6 +330,8 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_resource_variable_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_time_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/micro_utils_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/recording_micro_allocator_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/span_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/static_vector_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/non_persistent_arena_buffer_allocator_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/persistent_arena_buffer_allocator_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/arena_allocator/recording_single_arena_buffer_allocator_test.cc \