diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index f716930ceb8..eb2fbae0807 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -17,6 +17,10 @@ #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_test.hpp" +#include "llrt/hal.hpp" + +#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) + constexpr uint32_t DEFAULT_TEST_TYPE = 0; constexpr uint32_t DEVICE_DATA_SIZE = 768 * 1024; constexpr uint32_t MAX_PAGE_SIZE = 256 * 1024; // bigger than scratch_db_page_size diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 286d6b670ea..ee0d65ba059 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -3319,7 +3319,8 @@ void Device::end_trace(const uint8_t cq_id, const uint32_t tid) { auto &trace_data = this->trace_buffer_pool_[tid]->desc->data; trace_data = std::move(this->sysmem_manager().get_bypass_data()); // Add command to terminate the trace buffer - DeviceCommand command_sequence(CQ_PREFETCH_CMD_BARE_MIN_SIZE); + uint32_t cq_prefetch_cmd_bare_min_size = hal.get_alignment(HalMemType::HOST); + DeviceCommand command_sequence(cq_prefetch_cmd_bare_min_size); command_sequence.add_prefetch_exec_buf_end(); for (int i = 0; i < command_sequence.size_bytes() / sizeof(uint32_t); i++) { trace_data.push_back(((uint32_t*)command_sequence.data())[i]); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index d263e369755..8098a0cc5f1 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -33,6 +33,10 @@ #include "tt_metal/impl/kernels/kernel.hpp" #include "tt_metal/third_party/umd/device/tt_xy_pair.h" +#include "llrt/hal.hpp" + +#define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) + using namespace tt::tt_metal; using std::map; @@ -338,7 +342,7 @@ EnqueueProgramCommand::EnqueueProgramCommand( } void EnqueueProgramCommand::assemble_preamble_commands(ProgramCommandSequence& program_command_sequence, std::vector& kernel_config_addrs) { - constexpr uint32_t uncached_cmd_sequence_sizeB = + uint32_t uncached_cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_SET_WRITE_OFFSET program_command_sequence.preamble_command_sequence = @@ -361,7 +365,7 @@ void EnqueueProgramCommand::assemble_stall_commands(ProgramCommandSequence& prog // Wait command so previous program finishes // Wait command with barrier for binaries to commit to DRAM // Prefetch stall to prevent prefetcher picking up incomplete binaries from DRAM - constexpr uint32_t uncached_cmd_sequence_sizeB = + uint32_t uncached_cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE + // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_STALL @@ -377,7 +381,7 @@ void EnqueueProgramCommand::assemble_stall_commands(ProgramCommandSequence& prog true, this->dispatch_message_addr, this->expected_num_workers_completed); } else { // Wait command so previous program finishes - constexpr uint32_t cached_cmd_sequence_sizeB = + uint32_t cached_cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT program_command_sequence.stall_command_sequence = diff --git a/tt_metal/impl/dispatch/cq_commands.hpp b/tt_metal/impl/dispatch/cq_commands.hpp index a2a0399fb5e..675be8dd774 100644 --- a/tt_metal/impl/dispatch/cq_commands.hpp +++ b/tt_metal/impl/dispatch/cq_commands.hpp @@ -10,9 +10,8 @@ #pragma once -#include "noc/noc_parameters.h" +#include -constexpr uint32_t CQ_PREFETCH_CMD_BARE_MIN_SIZE = PCIE_ALIGNMENT; // for NOC PCIe alignemnt constexpr uint32_t CQ_DISPATCH_CMD_SIZE = 16; // for L1 alignment // Prefetcher CMD ID enums @@ -293,18 +292,7 @@ struct CQDispatchCmd { ////////////////////////////////////////////////////////////////////////////// -struct CQPrefetchHToPrefetchDHeader_s { - uint32_t length; -}; - -typedef union { - struct CQPrefetchHToPrefetchDHeader_s header; - unsigned char padding[CQ_PREFETCH_CMD_BARE_MIN_SIZE]; -} CQPrefetchHToPrefetchDHeader; - - static_assert(sizeof(CQPrefetchBaseCmd) == sizeof(uint8_t)); // if this fails, padding above needs to be adjusted static_assert(sizeof(CQDispatchBaseCmd) == sizeof(uint8_t)); // if this fails, padding above needs to be adjusted static_assert((sizeof(CQPrefetchCmd) & (CQ_DISPATCH_CMD_SIZE - 1)) == 0); static_assert((sizeof(CQDispatchCmd) & (CQ_DISPATCH_CMD_SIZE - 1)) == 0); -static_assert((sizeof(CQPrefetchHToPrefetchDHeader) & (CQ_PREFETCH_CMD_BARE_MIN_SIZE - 1)) == 0); diff --git a/tt_metal/impl/dispatch/device_command.hpp b/tt_metal/impl/dispatch/device_command.hpp index 54ef442bec2..12f4caf3af5 100644 --- a/tt_metal/impl/dispatch/device_command.hpp +++ b/tt_metal/impl/dispatch/device_command.hpp @@ -94,7 +94,7 @@ class DeviceCommand { relay_wait->base.cmd_id = CQ_PREFETCH_CMD_RELAY_INLINE; relay_wait->relay_inline.dispatcher_type = dispatcher_type; relay_wait->relay_inline.length = sizeof(CQDispatchCmd); - relay_wait->relay_inline.stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE; + relay_wait->relay_inline.stride = this->pcie_alignment; wait_cmd->base.cmd_id = CQ_DISPATCH_CMD_WAIT; wait_cmd->wait.barrier = barrier; diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index acf40f655ea..dd903bcbb92 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -13,6 +13,17 @@ #include "tt_metal/impl/dispatch/kernels/cq_common.hpp" #include "debug/dprint.h" +#include "noc/noc_parameters.h" // PCIE_ALIGNMENT +constexpr uint32_t CQ_PREFETCH_CMD_BARE_MIN_SIZE = PCIE_ALIGNMENT; // for NOC PCIe alignemnt +struct CQPrefetchHToPrefetchDHeader_s { + uint32_t length; +}; +typedef union { + struct CQPrefetchHToPrefetchDHeader_s header; + unsigned char padding[CQ_PREFETCH_CMD_BARE_MIN_SIZE]; +} CQPrefetchHToPrefetchDHeader; +static_assert((sizeof(CQPrefetchHToPrefetchDHeader) & (CQ_PREFETCH_CMD_BARE_MIN_SIZE - 1)) == 0); + typedef uint16_t prefetch_q_entry_type; constexpr uint32_t downstream_cb_base = get_compile_time_arg_val(0);