From d45a4047c213dfd7a4093cf384ffe23d404025cc Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Thu, 16 May 2024 21:18:46 +0000 Subject: [PATCH] #7530: Add no stride flag to dispatch packed write --- .../perf_microbenchmark/dispatch/common.h | 50 +++++++++++++++---- .../dispatch/test_prefetcher.cpp | 4 ++ tt_metal/impl/dispatch/cq_commands.hpp | 6 ++- tt_metal/impl/dispatch/device_command.hpp | 3 +- .../impl/dispatch/kernels/cq_dispatch.cpp | 19 ++++--- 5 files changed, 62 insertions(+), 20 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 02a362eb035..c8fec0b8871 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -588,21 +588,30 @@ inline void generate_random_paged_payload(Device *device, inline void generate_random_packed_payload(vector& cmds, vector& worker_cores, DeviceData& data, - uint32_t size_words) { + uint32_t size_words, + bool repeat = false) { static uint32_t coherent_count = 0; const uint32_t bank_id = 0; // No interleaved pages here. + bool first_core = true; + vectorresults; + CoreCoord first_worker = worker_cores[0]; + for (uint32_t i = 0; i < size_words; i++) { + uint32_t datum = (use_coherent_data_g) ? ((first_worker.x << 16) | (first_worker.y << 24) | coherent_count++) : std::rand(); + results.push_back(datum); + } for (CoreCoord core : worker_cores) { for (uint32_t i = 0; i < size_words; i++) { - uint32_t datum = (use_coherent_data_g) ? ((core.x << 16) | (core.y << 24) | coherent_count++) : std::rand(); - - cmds.push_back(datum); - data.push_one(core, bank_id, datum); + data.push_one(core, bank_id, results[i]); + if (!repeat || first_core) { + cmds.push_back(results[i]); + } } cmds.resize(padded_size(cmds.size(), 4)); // XXXXX L1_ALIGNMENT16/sizeof(uint) data.pad(core, bank_id, 16); // L1_ALIGNMENT16 + first_core = false; } } @@ -709,7 +718,8 @@ inline void add_dispatcher_packed_cmd(Device *device, vector& worker_cores, DeviceData& device_data, CQDispatchCmd cmd, - uint32_t size_words) { + uint32_t size_words, + bool repeat = false) { size_t prior_end = debug_prologue(cmds); @@ -720,7 +730,7 @@ inline void add_dispatcher_packed_cmd(Device *device, } cmds.resize(padded_size(cmds.size(), 4)); // XXXXX L1_ALIGNMENT16/sizeof(uint) - generate_random_packed_payload(cmds, worker_cores, device_data, size_words); + generate_random_packed_payload(cmds, worker_cores, device_data, size_words, repeat); debug_epilogue(cmds, prior_end); } @@ -843,7 +853,8 @@ inline void gen_dispatcher_packed_write_cmd(Device *device, vector& cmds, vector& worker_cores, DeviceData& device_data, - uint32_t size_words) { + uint32_t size_words, + bool repeat = false) { // Pad w/ blank data until all workers are at the same address device_data.relevel(CoreType::WORKER); @@ -852,12 +863,15 @@ inline void gen_dispatcher_packed_write_cmd(Device *device, memset(&cmd, 0, sizeof(CQDispatchCmd)); cmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_PACKED; - cmd.write_packed.is_multicast = 0; + cmd.write_packed.flags = repeat ? CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NO_STRIDE : CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NONE; cmd.write_packed.count = worker_cores.size(); cmd.write_packed.addr = device_data.get_result_data_addr(worker_cores[0]); cmd.write_packed.size = size_words * sizeof(uint32_t); - add_dispatcher_packed_cmd(device, cmds, worker_cores, device_data, cmd, size_words); + uint32_t sub_cmds_size = padded_size(worker_cores.size() * sizeof(uint32_t), sizeof(CQDispatchCmd)); + TT_FATAL(repeat == false || size_words * sizeof(uint32_t) + sizeof(CQDispatchCmd) + sub_cmds_size <= dispatch_buffer_page_size_g); + + add_dispatcher_packed_cmd(device, cmds, worker_cores, device_data, cmd, size_words, repeat); } inline void gen_rnd_dispatcher_packed_write_cmd(Device *device, @@ -887,8 +901,22 @@ inline void gen_rnd_dispatcher_packed_write_cmd(Device *device, } } + bool repeat = std::rand() % 2; + if (repeat) { + // TODO fix this if/when we add mcast + uint32_t sub_cmds_size = padded_size(gets_data.size() * sizeof(uint32_t), 16); // L1_ALIGNMENT16 + if (xfer_size_bytes + sizeof (CQDispatchCmd) + sub_cmds_size > dispatch_buffer_page_size_g) { + static bool warned = false; + if (!warned) { + log_warning(tt::LogTest, "Clamping packed_write cmd w/ stride=0 size to fit a dispatch page. Adjust max/min xfer sizes for reliable perf data"); + warned = true; + } + xfer_size_bytes = dispatch_buffer_page_size_g - sizeof (CQDispatchCmd) - sub_cmds_size; + } + } + gen_dispatcher_packed_write_cmd(device, cmds, gets_data, device_data, - xfer_size_bytes / sizeof(uint32_t)); + xfer_size_bytes / sizeof(uint32_t), repeat); } inline void gen_dispatcher_host_write_cmd(vector& cmds, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index c6dae75b446..f80c565e142 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -979,6 +979,10 @@ void gen_smoke_test(Device *device, gen_dispatcher_packed_write_cmd(device, dispatch_cmds, worker_cores, device_data, 12); add_prefetcher_cmd(prefetch_cmds, cmd_sizes, CQ_PREFETCH_CMD_RELAY_INLINE, dispatch_cmds); + dispatch_cmds.resize(0); + gen_dispatcher_packed_write_cmd(device, dispatch_cmds, worker_cores, device_data, 12, true); + add_prefetcher_cmd(prefetch_cmds, cmd_sizes, CQ_PREFETCH_CMD_RELAY_INLINE, dispatch_cmds); + dispatch_cmds.resize(0); worker_cores.resize(0); worker_cores.push_back(first_worker_g); diff --git a/tt_metal/impl/dispatch/cq_commands.hpp b/tt_metal/impl/dispatch/cq_commands.hpp index 56202b5f897..f4a4ddb0a44 100644 --- a/tt_metal/impl/dispatch/cq_commands.hpp +++ b/tt_metal/impl/dispatch/cq_commands.hpp @@ -138,8 +138,12 @@ struct CQDispatchWritePagedCmd { uint32_t pages; } __attribute__((packed)); + +constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NONE = 0x00; +constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST = 0x01; +constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NO_STRIDE = 0x02; struct CQDispatchWritePackedCmd { - uint8_t is_multicast; + uint8_t flags; // see above uint16_t count; // number of sub-cmds (max 1020 unicast, 510 mcast). Max num sub-cmds = (dispatch_constants::TRANSFER_PAGE_SIZE - sizeof(CQDispatchCmd)) / sizeof(CQDispatchWritePacked*castSubCmd) uint32_t addr; // common memory address across all packed SubCmds uint16_t size; // size of each packet, stride is padded to L1 alignment and less than dispatch_cb_page_size diff --git a/tt_metal/impl/dispatch/device_command.hpp b/tt_metal/impl/dispatch/device_command.hpp index bf391b5ae13..08916379c41 100644 --- a/tt_metal/impl/dispatch/device_command.hpp +++ b/tt_metal/impl/dispatch/device_command.hpp @@ -324,7 +324,8 @@ class DeviceCommand { auto initialize_write_packed_cmd = [&](CQDispatchCmd *write_packed_cmd) { write_packed_cmd->base.cmd_id = CQ_DISPATCH_CMD_WRITE_PACKED; - write_packed_cmd->write_packed.is_multicast = multicast; + write_packed_cmd->write_packed.flags = + multicast ? CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST : CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NONE; write_packed_cmd->write_packed.count = num_sub_cmds; write_packed_cmd->write_packed.addr = common_addr; write_packed_cmd->write_packed.size = packed_data_sizeB; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index ac35c7fbea3..a506c16df3e 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -543,7 +543,7 @@ void process_write_paged() { // Since all subcmds all appear in the first page and given the size restrictions // this command can't be too many pages. All pages are released at the end template -void process_write_packed() { +void process_write_packed(uint32_t flags) { volatile CQDispatchCmd tt_l1_ptr *cmd = (volatile CQDispatchCmd tt_l1_ptr *)cmd_ptr; uint32_t count = cmd->write_packed.count; @@ -560,7 +560,9 @@ void process_write_packed() { uint32_t data_ptr = cmd_ptr + sizeof(CQDispatchCmd) + count * sizeof(WritePackedSubCmd); data_ptr = round_up_pow2(data_ptr, L1_NOC_ALIGNMENT); - uint32_t stride = round_up_pow2(xfer_size, L1_NOC_ALIGNMENT); + uint32_t stride = (flags & CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NO_STRIDE) ? 0 : round_up_pow2(xfer_size, L1_NOC_ALIGNMENT); + DPRINT << data_ptr << " " << cmd_ptr << " " << xfer_size << " " << dispatch_cb_page_size << ENDL(); + ASSERT(stride != 0 || data_ptr - cmd_ptr + xfer_size <= dispatch_cb_page_size); volatile uint32_t tt_l1_ptr *l1_addr = (uint32_t *)(cmd_ptr + sizeof(CQDispatchCmd)); cq_noc_async_write_init_state(0, dst_addr, xfer_size); @@ -766,11 +768,14 @@ static inline bool process_cmd_d(uint32_t& cmd_ptr) { break; case CQ_DISPATCH_CMD_WRITE_PACKED: - DPRINT << "cmd_write_packed" << ENDL(); - if (cmd->write_packed.is_multicast) { - process_write_packed(); - } else { - process_write_packed(); + { + DPRINT << "cmd_write_packed" << ENDL(); + uint32_t flags = cmd->write_packed.flags; + if (flags & CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST) { + process_write_packed(flags); + } else { + process_write_packed(flags); + } } break;