diff --git a/device/tt_device.h b/device/tt_device.h index 07d507ba..8722d02a 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -282,9 +282,10 @@ class tt_device /** * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). * + * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. */ - virtual void setup_core_to_tlb_map(std::function mapping_function) { + virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } @@ -592,7 +593,6 @@ class tt_device bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - bool tlbs_init = false; protected: std::unordered_map soc_descriptor_per_chip = {}; @@ -633,7 +633,7 @@ class tt_SiliconDevice: public tt_device virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(std::function mapping_function); + virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); virtual void start_device(const tt_device_params &device_params); virtual void assert_risc_reset(); @@ -661,7 +661,7 @@ class tt_SiliconDevice: public tt_device /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size */ - std::optional> get_tlb_data_from_target(const tt_xy_pair& 
target); + std::optional> get_tlb_data_from_target(const tt_cxy_pair& target); /** * This API allows you to write directly to device memory that is addressable by a static TLB */ @@ -705,6 +705,8 @@ class tt_SiliconDevice: public tt_device virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; + // TODO: This should be accessible through public API, probably to be moved to tt_device. + PCIDevice *get_pci_device(int device_id) const; // Destructor virtual ~tt_SiliconDevice (); @@ -760,7 +762,6 @@ class tt_SiliconDevice: public tt_device int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - PCIDevice *get_pci_device(int pci_intf_id) const; std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); @@ -815,7 +816,11 @@ class tt_SiliconDevice: public tt_device std::map> tlb_config_map = {}; std::set all_target_mmio_devices; std::unordered_map> host_channel_size; - std::function map_core_to_tlb; + + // Note that these maps hold only entries for local PCIe chips. 
+ std::unordered_map> map_core_to_tlb_per_chip = {}; + std::unordered_map tlbs_init_per_chip = {}; + std::unordered_map dynamic_tlb_config = {}; std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 6b76c967..734f5739 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -914,7 +914,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str())); } - if (!tlbs_init || !map_core_to_tlb) { + if (!tlbs_init_per_chip[target.chip] || !map_core_to_tlb_per_chip[target.chip]) { throw std::runtime_error("TLBs not initialized"); } @@ -924,7 +924,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("No write-combined mapping for BAR0"); } - auto tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + auto tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); if (!tlb_data.has_value()) { @@ -946,8 +946,8 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init) { - tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + if(tlbs_init_per_chip[target.chip]) { + tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } @@ -987,8 +987,8 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init) { - tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y)); + if(tlbs_init_per_chip[target.chip]) { + tlb_index = 
map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); @@ -1165,12 +1165,12 @@ tt_SiliconDevice::~tt_SiliconDevice () { dynamic_tlb_ordering_modes.clear(); } -std::optional> tt_SiliconDevice::get_tlb_data_from_target(const tt_xy_pair& target) { +std::optional> tt_SiliconDevice::get_tlb_data_from_target(const tt_cxy_pair& target) { std::int32_t tlb_index = 0; std::optional> tlb_data; - if (tlbs_init) { - tlb_index = map_core_to_tlb(target); + if (tlbs_init_per_chip[target.chip]) { + tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch_name)); tlb_data = architecture_implementation->describe_tlb(tlb_index); } @@ -2909,9 +2909,9 @@ void tt_SiliconDevice::set_driver_eth_interface_params(const tt_driver_eth_inter eth_interface_params = eth_interface_params_; } -void tt_SiliconDevice::setup_core_to_tlb_map(std::function mapping_function) { - map_core_to_tlb = mapping_function; - tlbs_init = true; +void tt_SiliconDevice::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + map_core_to_tlb_per_chip[logical_device_id] = mapping_function; + tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t tt_SiliconDevice::get_num_dram_channels(std::uint32_t device_id) { diff --git a/tests/api/CMakeLists.txt b/tests/api/CMakeLists.txt index 08c11ffe..ce569112 100644 --- a/tests/api/CMakeLists.txt +++ b/tests/api/CMakeLists.txt @@ -1,6 +1,7 @@ set(API_TESTS_SRCS - test_cluster.cpp + test_chip.cpp test_cluster_descriptor.cpp + test_cluster.cpp ) add_executable(api_tests ${API_TESTS_SRCS}) diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp new file mode 100644 index 00000000..ecb2df2c --- /dev/null +++ 
b/tests/api/test_chip.cpp @@ -0,0 +1,163 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file holds Chip specific API examples. + +#include +#include "fmt/xchar.h" + +#include +#include +#include +#include + +#include "tests/test_utils/generate_cluster_desc.hpp" + +// TODO: change to tt_cluster +#include "device/tt_device.h" +#include "device/tt_cluster_descriptor.h" + +// TODO: write this test to work with Chip not whole Cluster. +using Cluster = tt_SiliconDevice; + +inline std::unique_ptr get_cluster_desc() { + // TODO: This should not be needed. And could be part of the cluster descriptor probably. + // Note that cluster descriptor holds logical ids of chips. + // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. + // You have to see if physical PCIe is GS before constructing a cluster descriptor. + std::vector pci_device_ids = PCIDevice::enumerate_devices(); + std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + + tt::ARCH device_arch = tt::ARCH::GRAYSKULL; + if (!pci_device_ids.empty()) { + // TODO: This should be removed from the API, the driver itself should do it. + int physical_device_id = pci_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + device_arch = pci_device.get_arch(); + } + + // TODO: Make this test work on a host system without any tt devices. + if (pci_device_ids.empty()) { + std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; + return nullptr; + } + + // TODO: remove getting manually cluster descriptor from yaml. 
+ std::string yaml_path = test_utils::GetClusterDescYAML(); + std::unique_ptr cluster_desc; + if (device_arch == tt::ARCH::GRAYSKULL) { + cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids); + } else { + cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); + } + + return cluster_desc; +} + +inline std::unique_ptr get_cluster() { + + // TODO: This should not be needed. And could be part of the cluster descriptor probably. + // Note that cluster descriptor holds logical ids of chips. + // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. + // You have to see if physical PCIe is GS before constructing a cluster descriptor. + std::vector pci_device_ids = PCIDevice::enumerate_devices(); + std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); + + tt::ARCH device_arch = tt::ARCH::GRAYSKULL; + if (!pci_device_ids.empty()) { + // TODO: This should be removed from the API, the driver itself should do it. + int physical_device_id = pci_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + device_arch = pci_device.get_arch(); + } + + // TODO: Make this test work on a host system without any tt devices. + if (pci_device_ids.empty()) { + std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; + return nullptr; + } + + // TODO: remove getting manually cluster descriptor from yaml. + std::string yaml_path = test_utils::GetClusterDescYAML(); + // TODO: Remove the need to do this, allow default constructor to construct with all chips. + std::unique_ptr cluster_desc = get_cluster_desc(); + std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); + + // TODO: make this unordered vs set conversion not needed. + std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); + + + // TODO: This would be incorporated inside SocDescriptor. 
+ std::string soc_path; + if (device_arch == tt::ARCH::GRAYSKULL) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); + } else if (device_arch == tt::ARCH::WORMHOLE || device_arch == tt::ARCH::WORMHOLE_B0) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); + } else if (device_arch == tt::ARCH::BLACKHOLE) { + soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); + } else { + throw std::runtime_error("Unsupported architecture"); + } + + + // TODO: Don't pass each of these arguments. + return std::unique_ptr(new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, detected_num_chips_set)); +} + +// TODO: Once default auto TLB setup is in, check it is setup properly. +TEST(ApiChipTest, ManualTLBConfiguration) { + std::unique_ptr umd_cluster = get_cluster(); + + // Expect to throw for remote chip for any worker core + auto remote_chips = umd_cluster->get_target_remote_device_ids(); + if (!remote_chips.empty()) { + chip_id_t any_remote_chip = *remote_chips.begin(); + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_remote_chip); + tt_xy_pair core = soc_desc.workers[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_remote_chip, core)), std::runtime_error); + } + + // Expect to throw for non configured mmio chip. + chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); + tt_xy_pair core = soc_desc.workers[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core)), std::runtime_error); + + // TODO: This should be part of TTDevice interface, not Cluster or Chip. + // Configure TLBs. + std::function get_static_tlb_index = [&](tt_xy_pair core) -> int { + // TODO: Make this per arch. 
+ bool is_worker_core = soc_desc.is_worker_core(core); + if (!is_worker_core) { + return -1; + } + return core.x + core.y * umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); + }; + + std::int32_t c_zero_address = 0; + + // Each MMIO chip has its own set of TLBs, so it needs its own configuration. + for (chip_id_t mmio_chip: umd_cluster->get_target_mmio_device_ids()) { + + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(mmio_chip); + for (tt_xy_pair core: soc_desc.workers) { + umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address); + } + + umd_cluster->setup_core_to_tlb_map(mmio_chip, get_static_tlb_index); + } + + // Expect not to throw for now configured mmio chip, same one as before. + EXPECT_NO_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core))); + + // Expect to throw for non worker cores. + tt_xy_pair dram_core = soc_desc.dram_cores[0][0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, dram_core)), std::runtime_error); + if (!soc_desc.ethernet_cores.empty()) { + tt_xy_pair eth_core = soc_desc.ethernet_cores[0]; + EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, eth_core)), std::runtime_error); + } +} diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index 41faea8a..b618c89c 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -1,3 +1,8 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file holds Cluster specific API examples. #include #include "fmt/xchar.h" diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index d5df1bc0..c7b313d9 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 #include @@ -46,7 +49,7 @@ inline std::unique_ptr get_cluster_desc() { return cluster_desc; } -TEST(ApiTest, DetectArch) { +TEST(ApiClusterDescriptorTest, DetectArch) { // TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal. tt::ARCH arch = detect_arch(); diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index 346c80b2..f6b2985e 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -91,10 +91,9 @@ TEST(SiliconDriverGS, HarvestingRuntime) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - device.setup_core_to_tlb_map(get_static_tlb_index); - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -154,9 +153,8 @@ TEST(SiliconDriverGS, StaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - - device.setup_core_to_tlb_map(get_static_tlb_index); tt_device_params default_params; device.start_device(default_params); @@ -324,10 +322,9 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } + device.setup_core_to_tlb_map(i, get_static_tlb_index); } - device.setup_core_to_tlb_map(get_static_tlb_index); - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 00eb9455..f317a477 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -230,10 +230,9 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } - } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); + } } - - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params); @@ -290,10 +289,10 @@ TEST(SiliconDriverWH, StaticTLB_RW) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params); @@ -436,9 +435,9 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } + device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - device.setup_core_to_tlb_map(get_static_tlb_index_callback); tt_device_params default_params; device.start_device(default_params);