Skip to content

Commit

Permalink
#0: Cache program cmds, convert cb configs from write linear to write…
Browse files Browse the repository at this point in the history
… packed

Invalidate cache for enqueue program when program recompile triggered
Only send up to max cb for enqueue_program
  • Loading branch information
tt-aho committed May 21, 2024
1 parent da4bc52 commit b5d7699
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 287 deletions.
2 changes: 1 addition & 1 deletion tt_metal/common/base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ using std::string;
using std::size_t;
using std::map;

inline uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; }
// Rounds `addr` up to the next multiple of `alignment` (unchanged if already a multiple).
// The bit trick relies on `alignment` being a power of two; other values are not validated here.
// Unsigned wraparound is intentional: align(0, a) evaluates to 0.
inline constexpr uint32_t align(uint32_t addr, uint32_t alignment) {
    const uint32_t low_bits_mask = alignment - 1;
    // Set every bit below the alignment boundary, then step over the boundary.
    const uint32_t filled = (addr - 1) | low_bits_mask;
    return filled + 1;
}


namespace tt
Expand Down
6 changes: 1 addition & 5 deletions tt_metal/impl/buffers/circular_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,7 @@ uint32_t CircularBuffer::page_size(uint32_t buffer_index) const {
}

uint32_t CircularBuffer::num_pages(uint32_t buffer_index) const {
uint32_t page_size = this->page_size(buffer_index);
if (this->size() % page_size != 0) {
TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", this->size(), page_size);
}
return this->size() / page_size;
return this->size() / this->page_size(buffer_index);
}

DataFormat CircularBuffer::data_format(uint32_t buffer_index) const {
Expand Down
557 changes: 292 additions & 265 deletions tt_metal/impl/dispatch/command_queue.cpp

Large diffs are not rendered by default.

19 changes: 14 additions & 5 deletions tt_metal/impl/dispatch/command_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ string EnqueueCommandTypeToString(EnqueueCommandType ctype);
#define NOC_Y(y) y

uint32_t get_noc_unicast_encoding(const CoreCoord &coord);
uint32_t get_noc_multcast_encoding(const CoreCoord &start, const CoreCoord &end);
uint32_t get_noc_multicast_encoding(const CoreCoord &start, const CoreCoord &end);

class CommandQueue;
class CommandInterface;
Expand Down Expand Up @@ -277,14 +277,23 @@ class EnqueueProgramCommand : public Command {
SystemMemoryManager& manager;
CoreType dispatch_core_type;
uint32_t expected_num_workers_completed;
HostMemDeviceCommand preamble_command_sequence;
thread_local static std::unordered_map<uint64_t, std::vector<HostMemDeviceCommand>> runtime_args_command_sequences;
HostMemDeviceCommand program_command_sequence;

public:

// Groups the host-side command sequences and circular-buffer bookkeeping that are cached together
// for one program. Instances are stored by value in the thread_local
// `cached_program_command_sequences` map, keyed by a uint64_t
// (NOTE(review): presumably a program id — confirm in command_queue.cpp).
struct CachedProgramCommandSequence {
// NOTE(review): presumably filled by assemble_preamble_commands() — confirm.
HostMemDeviceCommand preamble_command_sequence;
// NOTE(review): presumably filled by assemble_runtime_args_commands() — confirm.
std::vector<HostMemDeviceCommand> runtime_args_command_sequences;
// Byte count associated with the runtime-args sequences (exact meaning not visible here — TODO confirm).
uint32_t runtime_args_fetch_size_bytes;
// NOTE(review): presumably filled by assemble_device_commands() — confirm.
HostMemDeviceCommand program_command_sequence;
// NOTE(review): looks like the offset of the CB-config payload inside program_command_sequence — confirm units.
uint32_t cb_configs_payload_start;
// Size in bytes of the CB-config region after alignment (alignment value not visible here — TODO confirm).
uint32_t aligned_cb_config_size_bytes;
// Shared handles to circular buffers; NOTE(review): outer index presumably selects a core range — confirm.
std::vector<std::vector<std::shared_ptr<CircularBuffer>>> circular_buffers_on_core_ranges;
};
thread_local static std::unordered_map<uint64_t, CachedProgramCommandSequence> cached_program_command_sequences;

EnqueueProgramCommand(uint32_t command_queue_id, Device* device, Program& program, SystemMemoryManager& manager, uint32_t expected_num_workers_completed);

void assemble_preamble_commands();
void assemble_preamble_commands(bool prefetch_stall);
void assemble_device_commands();
void assemble_runtime_args_commands();

Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/dispatch/cq_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,13 @@ struct CQDispatchWaitCmd {
uint8_t clear_count; // if true, reset count to 0
uint32_t addr; // address to read
uint32_t count; // wait while address is < count
};
} __attribute__((packed));

struct CQDispatchDelayCmd {
uint8_t pad1;
uint16_t pad2;
uint32_t delay;
};
} __attribute__((packed));

struct CQDispatchCmd {
CQDispatchBaseCmd base;
Expand Down
1 change: 1 addition & 0 deletions tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ void Program::compile( Device * device )
detail::MemoryReporter::inst().flush_program_memory_usage(*this, device);
}
compile_needed_[device->id()] = false;
this->loaded_onto_device = false;
}

Program::~Program() {
Expand Down
5 changes: 0 additions & 5 deletions tt_metal/impl/program/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,6 @@ class Program {

void allocate_circular_buffers();

// Commands generated by EnqueueProgram
// TODO: move these to static objects, either in device command or command queue
vector_memcpy_aligned<uint32_t> cached_device_commands;
ProgramCommandIndices command_indices;

private:
void populate_dispatch_data(Device *device);

Expand Down
4 changes: 0 additions & 4 deletions tt_metal/impl/program/program_device_map.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,3 @@ struct ProgramTransferInfo {
std::unordered_map<uint32_t, vector<transfer_info_2>> unicast_semaphores; // WritePacked, sorted by dst
vector<kernel_bins_transfer_info> kernel_bins; // RelayPaged, WriteLinear
};

struct ProgramCommandIndices {
std::uint32_t cb_configs_payload_start; // device_commands
};

0 comments on commit b5d7699

Please sign in to comment.