Merge branch 'feature/update_algorithm_ns_model' into 'master'

algorithm_stream: Add NSNet-related configurations. See merge request adf/esp-adf-internal!1299
espressif · Aug 7, 2024 · 501921b · 501921b
2 parents 5478bce + 9712588
commit 501921b
Show file tree

Hide file tree

Showing 13 changed files with 126 additions and 40 deletions.
diff --git a/components/audio_stream/algorithm_stream.c b/components/audio_stream/algorithm_stream.c
@@ -36,6 +36,11 @@
 #include "esp_afe_sr_iface.h"
 #include "esp_afe_sr_models.h"
 
+#ifdef CONFIG_USE_NSNET
+#include "esp_nsn_models.h"
+#include "model_path.h"
+#endif  /* CONFIG_USE_NSNET */
+
 #define ALGORITHM_CHUNK_MAX_SIZE            (1024)
 #define ALGORITHM_FETCH_TASK_STACK_SIZE     (3 * 1024)
 #define ALGORITHM_GET_REFERENCE_TIMEOUT     (32 / portTICK_PERIOD_MS)
@@ -45,23 +50,26 @@ static const char *TAG = "ALGORITHM_STREAM";
 const int FETCH_STOPPED_BIT = BIT0;
 
 typedef struct {
-    int16_t *record;
-    int16_t *reference;
-    int16_t *aec_buff;
-    int8_t algo_mask;
-    int8_t mic_ch;
-    bool afe_fetch_run;
-    int sample_rate;
-    int rec_linear_factor;
-    int ref_linear_factor;
+    int16_t                      *record;
+    int16_t                      *reference;
+    int16_t                      *aec_buff;
+    int8_t                        algo_mask;
+    int8_t                        mic_ch;
+    bool                          afe_fetch_run;
+    int                           sample_rate;
+    int                           rec_linear_factor;
+    int                           ref_linear_factor;
     algorithm_stream_input_type_t input_type;
-    const esp_afe_sr_iface_t *afe_handle;
-    esp_afe_sr_data_t *afe_data;
-    EventGroupHandle_t state;
-    bool debug_input;
-    bool swap_ch;
-    bool aec_low_cost;
-    int agc_gain;
+    const esp_afe_sr_iface_t     *afe_handle;
+    esp_afe_sr_data_t            *afe_data;
+    EventGroupHandle_t            state;
+    bool                          debug_input;
+    bool                          swap_ch;
+    bool                          aec_low_cost;
+    int                           agc_gain;
+#ifdef CONFIG_USE_NSNET
+    srmodel_list_t               *models;
+#endif  /* CONFIG_USE_NSNET */
 } algo_stream_t;
 
 esp_err_t algorithm_mono_fix(uint8_t *sbuff, uint32_t len)
@@ -87,7 +95,12 @@ static esp_err_t _algo_close(audio_element_handle_t self)
     while (xEventGroupWaitBits(algo->state, FETCH_STOPPED_BIT, false, true, 10 / portTICK_PERIOD_MS) != FETCH_STOPPED_BIT) {
         algo->afe_handle->feed(algo->afe_data, algo->aec_buff);
     }
+    return ESP_OK;
+}
 
+static esp_err_t _algo_destroy(audio_element_handle_t self)
+{
+    algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self);
     if (algo->afe_data) {
         algo->afe_handle->destroy(algo->afe_data);
         algo->afe_data = NULL;
@@ -114,6 +127,12 @@ static esp_err_t _algo_close(audio_element_handle_t self)
         vEventGroupDelete(algo->state);
     }
 
+#ifdef CONFIG_USE_NSNET
+    if (algo->models) {
+        esp_srmodel_deinit(algo->models);
+    }
+#endif  /* CONFIG_USE_NSNET */
+
     if (algo) {
         audio_free(algo);
         algo = NULL;
@@ -163,6 +182,11 @@ static esp_err_t _algo_open(audio_element_handle_t self)
     afe_config.pcm_config.mic_num = algo->mic_ch;
     afe_config.pcm_config.ref_num = 1;
     afe_config.pcm_config.total_ch_num = algo->mic_ch + 1;
+#ifdef CONFIG_USE_NSNET
+    char *model_name = esp_srmodel_filter(algo->models, ESP_NSNET_PREFIX, NULL);
+    afe_config.afe_ns_mode = NS_MODE_NET;
+    afe_config.afe_ns_model_name = model_name;
+#endif  /* CONFIG_USE_NSNET */
 
     if (!algo->aec_low_cost) {
         afe_config.pcm_config.sample_rate = algo->sample_rate;
@@ -198,7 +222,7 @@ static esp_err_t _algo_open(audio_element_handle_t self)
         xEventGroupSetBits(algo->state, FETCH_STOPPED_BIT);
     } else {
         audio_thread_create(NULL, "algo_fetch", _algo_fetch_task, (void *)self, ALGORITHM_FETCH_TASK_STACK_SIZE,
-            ALGORITHM_STREAM_TASK_PERIOD, true, ALGORITHM_STREAM_PINNED_TO_CORE);
+                            ALGORITHM_STREAM_TASK_PERIOD, true, ALGORITHM_STREAM_PINNED_TO_CORE);
     }
 
     AUDIO_NULL_CHECK(TAG, algo->afe_data, {
@@ -316,6 +340,7 @@ audio_element_handle_t algo_stream_init(algorithm_stream_cfg_t *config)
     cfg.open = _algo_open;
     cfg.close = _algo_close;
     cfg.process = _algo_process;
+    cfg.destroy = _algo_destroy;
     cfg.task_stack = config->task_stack;
     cfg.task_prio = config->task_prio;
     cfg.task_core = config->task_core;
@@ -348,6 +373,18 @@ audio_element_handle_t algo_stream_init(algorithm_stream_cfg_t *config)
         _success &= ((algo->reference = audio_calloc(1, ALGORITHM_CHUNK_MAX_SIZE)) != NULL);
     }
 
+#ifdef CONFIG_USE_NSNET
+    algo->models = esp_srmodel_init(config->partition_label);
+    if (algo->models != NULL) {
+        for (int i = 0; i < algo->models->num; i++) {
+            ESP_LOGI(TAG, "Load: %s", algo->models->model_name[i]);
+        }
+    } else {
+        ESP_LOGE(TAG, "Failed to load models");
+        _success = false;
+    }
+#endif  /* CONFIG_USE_NSNET */
+
     AUDIO_NULL_CHECK(TAG, _success, {
         ESP_LOGE(TAG, "Error occured");
         _algo_close(el);

diff --git a/components/audio_stream/include/algorithm_stream.h b/components/audio_stream/include/algorithm_stream.h
@@ -125,8 +125,8 @@ typedef struct {
     int sample_rate;                            /*!< The sampling rate of the input PCM (in Hz) */
     int mic_ch;                                 /*!< MIC channel num */
     int agc_gain;                               /*!< AGC gain(dB) for voice communication */
-    bool aec_low_cost;                          /*!< AEC uses less cpu and ram resources,
-                                                     but has poor suppression of nonlinear distortion */
+    bool aec_low_cost;                          /*!< AEC uses less cpu and ram resources, but has poor suppression of nonlinear distortion */
+    char *partition_label;                      /*!< Label of the partition that stores the model data */
 } algorithm_stream_cfg_t;
 
 #define ALGORITHM_STREAM_DEFAULT_MASK    (ALGORITHM_STREAM_USE_AEC | ALGORITHM_STREAM_USE_NS)
@@ -147,6 +147,7 @@ typedef struct {
     .mic_ch = ALGORITHM_STREAM_DEFAULT_MIC_CHANNELS,                                              \
     .agc_gain = ALGORITHM_STREAM_DEFAULT_AGC_GAIN_DB,                                             \
     .aec_low_cost = false,                                                                        \
+    .partition_label = "model",                                                                   \
 }
 
 /**

diff --git a/components/esp-sr b/components/esp-sr
diff --git a/examples/advanced_examples/algorithm/README.md b/examples/advanced_examples/algorithm/README.md
@@ -46,6 +46,7 @@ Load and run the example:
 - The board will start playing automatically.
 - After finish, you can open `/sdcard/aec_out.wav` to hear the recorded file.
 
+NSNET provides better noise suppression. To use NSNET, enable `ESP Speech Recognition -> USE_NSNET` in menuconfig.
 
 ### Build and Flash
 Build the project and flash it to the board, then run monitor tool to view serial output (replace `PORT` with your board's serial port name):

diff --git a/examples/advanced_examples/algorithm/README_CN.md b/examples/advanced_examples/algorithm/README_CN.md
@@ -45,6 +45,7 @@
 - 开发板上电后自动运行例程。
 - 例程完成后，你可以打开 microSD 卡目录 `/sdcard/aec_out.wav` 收听录音文件。
 
+NSNET 具备更好的噪声抑制效果。若使用 NSNET，请在 menuconfig 中使能 `ESP Speech Recognition -> USE_NSNET`。
 
 ### 编译和下载
 

diff --git a/examples/advanced_examples/algorithm/partitions_algo.csv b/examples/advanced_examples/algorithm/partitions_algo.csv
@@ -2,4 +2,5 @@
 # Note: if you change the phy_init or app partition offset, make sure to change the offset in Kconfig.projbuild
 nvs,      data, nvs,     0x9000,  0x6000,
 phy_init, data, phy,     0xf000,  0x1000,
-factory,  app,  factory, 0x10000, 3M,
+factory,  app,  factory, 0x10000, 2M,
+model,    data, spiffs,         , 5168K,
diff --git a/examples/advanced_examples/algorithm/sdkconfig.defaults.esp32 b/examples/advanced_examples/algorithm/sdkconfig.defaults.esp32
@@ -17,3 +17,21 @@ CONFIG_ESP_LYRAT_V4_3_BOARD=y
 #
 CONFIG_ESP32_DEFAULT_CPU_FREQ_240=y
 CONFIG_ESP32_DEFAULT_CPU_FREQ_MHZ=240
+
+#
+# Serial flasher config
+#
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASH_SAMPLE_MODE_STR=y
+CONFIG_ESPTOOLPY_FLASHMODE="dio"
+CONFIG_ESPTOOLPY_FLASHFREQ_80M=y
+CONFIG_ESPTOOLPY_FLASHFREQ="80m"
+CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
+CONFIG_ESPTOOLPY_FLASHSIZE="8MB"
+CONFIG_ESPTOOLPY_HEADER_FLASHSIZE_UPDATE=y
+CONFIG_ESPTOOLPY_BEFORE_RESET=y
+CONFIG_ESPTOOLPY_BEFORE="default_reset"
+CONFIG_ESPTOOLPY_AFTER_RESET=y
+CONFIG_ESPTOOLPY_AFTER="hard_reset"
+CONFIG_ESPTOOLPY_MONITOR_BAUD=115200
+# end of Serial flasher config
diff --git a/examples/advanced_examples/algorithm/sdkconfig.defaults.esp32s3 b/examples/advanced_examples/algorithm/sdkconfig.defaults.esp32s3
@@ -26,3 +26,44 @@ CONFIG_ESP_SYSTEM_EVENT_TASK_STACK_SIZE=3072
 # Audio HAL
 #
 CONFIG_ESP32_S3_KORVO2_V3_BOARD=y
+
+#
+# Serial flasher config
+#
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASH_SAMPLE_MODE_STR=y
+CONFIG_ESPTOOLPY_FLASHMODE="dio"
+CONFIG_ESPTOOLPY_FLASHFREQ_80M=y
+CONFIG_ESPTOOLPY_FLASHFREQ="80m"
+CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
+CONFIG_ESPTOOLPY_FLASHSIZE="8MB"
+CONFIG_ESPTOOLPY_HEADER_FLASHSIZE_UPDATE=y
+CONFIG_ESPTOOLPY_BEFORE_RESET=y
+CONFIG_ESPTOOLPY_BEFORE="default_reset"
+CONFIG_ESPTOOLPY_AFTER_RESET=y
+CONFIG_ESPTOOLPY_AFTER="hard_reset"
+CONFIG_ESPTOOLPY_MONITOR_BAUD=115200
+# end of Serial flasher config
+
+#
+# ESP Speech Recognition
+#
+CONFIG_MODEL_IN_FLASH=y
+CONFIG_USE_AFE=y
+CONFIG_AFE_INTERFACE_V1=y
+CONFIG_USE_WAKENET=y
+CONFIG_SR_WN_WN9_HILEXIN=y
+CONFIG_USE_MULTINET=y
+CONFIG_SR_MN_CN_MULTINET6_QUANT=y
+CONFIG_SR_MN_EN_NONE=y
+# end of ESP Speech Recognition
+
+#
+# Partition Table
+#
+CONFIG_PARTITION_TABLE_CUSTOM=y
+CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_algo.csv"
+CONFIG_PARTITION_TABLE_FILENAME="partitions_algo.csv"
+CONFIG_PARTITION_TABLE_OFFSET=0x8000
+CONFIG_PARTITION_TABLE_MD5=y
+# end of Partition Table
diff --git a/examples/dueros/sdkconfig.defaults.esp32s3 b/examples/dueros/sdkconfig.defaults.esp32s3
@@ -36,7 +36,7 @@ CONFIG_ESP32_S3_KORVO2_V3_BOARD=y
 #
 # ESP Speech Recognition
 #
-CONFIG_MODEL_IN_SPIFFS=y
+CONFIG_MODEL_IN_FLASH=y
 # CONFIG_MODEL_IN_SDCARD is not set
 CONFIG_USE_WAKENET=y
 CONFIG_SR_WN_WN9_HILEXIN=y

diff --git a/examples/protocols/esp-rtc/sdkconfig.defaults b/examples/protocols/esp-rtc/sdkconfig.defaults
@@ -54,7 +54,7 @@ CONFIG_LWIP_TCPIP_RECVMBOX_SIZE=64
 #
 # ESP Speech Recognition
 #
-CONFIG_MODEL_IN_SPIFFS=y
+CONFIG_MODEL_IN_FLASH=y
 CONFIG_USE_AFE=y
 CONFIG_AFE_INTERFACE_V1=y
 # CONFIG_USE_WAKENET is not set

diff --git a/examples/protocols/esp-rtsp/sdkconfig.defaults b/examples/protocols/esp-rtsp/sdkconfig.defaults
@@ -39,7 +39,7 @@ CONFIG_LWIP_TCPIP_RECVMBOX_SIZE=64
 #
 # ESP Speech Recognition
 #
-CONFIG_MODEL_IN_SPIFFS=y
+CONFIG_MODEL_IN_FLASH=y
 CONFIG_USE_AFE=y
 CONFIG_AFE_INTERFACE_V1=y
 # CONFIG_USE_WAKENET is not set

diff --git a/examples/protocols/voip/sdkconfig.defaults b/examples/protocols/voip/sdkconfig.defaults
@@ -36,7 +36,7 @@ CONFIG_LWIP_TCPIP_RECVMBOX_SIZE=64
 #
 # ESP Speech Recognition
 #
-CONFIG_MODEL_IN_SPIFFS=y
+CONFIG_MODEL_IN_FLASH=y
 CONFIG_USE_AFE=y
 CONFIG_AFE_INTERFACE_V1=y
 # CONFIG_USE_WAKENET is not set

diff --git a/examples/speech_recognition/wwe/sdkconfig.defaults.esp32s3 b/examples/speech_recognition/wwe/sdkconfig.defaults.esp32s3
@@ -38,28 +38,14 @@ CONFIG_AFE_MIC_NUM=2
 #
 # ESP Speech Recognition
 #
-CONFIG_MODEL_IN_SPIFFS=y
-# CONFIG_MODEL_IN_SDCARD is not set
+CONFIG_MODEL_IN_FLASH=y
 CONFIG_USE_AFE=y
 CONFIG_AFE_INTERFACE_V1=y
 CONFIG_USE_WAKENET=y
-# CONFIG_SR_WN_WN8_ALEXA is not set
 CONFIG_SR_WN_WN9_HILEXIN=y
-# CONFIG_SR_WN_WN9_XIAOAITONGXUE is not set
-# CONFIG_SR_WN_WN9_ALEXA is not set
-# CONFIG_SR_WN_WN9_HIESP is not set
-# CONFIG_SR_WN_WN9_NIHAOXIAOZHI is not set
-# CONFIG_SR_WN_WN9_CUSTOMWORD is not set
-# CONFIG_SR_WN_LOAD_MULIT_WORD is not set
 CONFIG_USE_MULTINET=y
-# CONFIG_SR_MN_CN_NONE is not set
-# CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8 is not set
 CONFIG_SR_MN_CN_MULTINET6_QUANT=y
-# CONFIG_SR_MN_CN_MULTINET6_AC_QUANT is not set
 CONFIG_SR_MN_EN_NONE=y
-# CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 is not set
-# CONFIG_SR_MN_EN_MULTINET6_QUANT is not set
-# CONFIG_SR_MN_EN_MULTINET7_QUANT is not set
 # end of ESP Speech Recognition
 
 #