From 6b70db0c4b8674bad4133f2eb6258e062eccf046 Mon Sep 17 00:00:00 2001 From: David Ma Date: Fri, 20 Sep 2024 21:25:43 +0000 Subject: [PATCH] #0: Bugfix for launch message setting on watcher hang for Tensix On active erisc, when we exit to base FW due to a watcher assert, we need to set the launch message to DONE. This is because the next run will look at the launch message and try to force-exit a running kernel if it finds one via launch message GO. The launch message was being set to DONE for Tensix as well, causing slow dispatch (SD) to finish before picking up the assert. --- tt_metal/hw/inc/debug/assert.h | 8 +++++--- tt_metal/hw/inc/debug/sanitize_noc.h | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tt_metal/hw/inc/debug/assert.h b/tt_metal/hw/inc/debug/assert.h index 4f1b27f696f..c90c682020b 100644 --- a/tt_metal/hw/inc/debug/assert.h +++ b/tt_metal/hw/inc/debug/assert.h @@ -17,12 +17,14 @@ void assert_and_hang(uint32_t line_num) { v->which = debug_get_which_riscv(); } - // Update launch msg to show that we've exited. + // Hang, or in the case of erisc, early exit. +#if defined(COMPILE_FOR_ERISC) + // Update launch msg to show that we've exited. This is required so that the next run doesn't think there's a kernel + // still running and try to make it exit. tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch); launch_msg->go.run = RUN_MSG_DONE; - // Hang, or in the case of erisc, early exit. -#if defined(COMPILE_FOR_ERISC) + // This exits to base FW internal_::disable_erisc_app(); erisc_early_exit(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_STACK_SAVE); #endif diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index 563fcb4c31e..7daa0cdf805 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -130,11 +130,12 @@ inline void debug_sanitize_post_noc_addr_and_hang( v[noc_id].invalid = invalid; } - // Update launch msg to show that we've exited. 
+#if defined(COMPILE_FOR_ERISC) + // Update launch msg to show that we've exited. This is required so that the next run doesn't think there's a kernel + // still running and try to make it exit. tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch); launch_msg->go.run = RUN_MSG_DONE; -#if defined(COMPILE_FOR_ERISC) // For erisc, we can't hang the kernel/fw, because the core doesn't get restarted when a new // kernel is written. In this case we'll do an early exit back to base FW. internal_::disable_erisc_app();