Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLB map setup per chip #179

Merged
merged 13 commits into from
Oct 23, 2024
17 changes: 11 additions & 6 deletions device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,10 @@ class tt_device
/**
* Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core).
*
* @param logical_device_id MMIO chip being targeted.
* @param mapping_function Function which maps core to TLB index.
*/
virtual void setup_core_to_tlb_map(std::function<std::int32_t(tt_xy_pair)> mapping_function) {
virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n");
}

Expand Down Expand Up @@ -592,7 +593,6 @@ class tt_device
bool performed_harvesting = false;
std::unordered_map<chip_id_t, uint32_t> harvested_rows_per_target = {};
bool translation_tables_en = false;
bool tlbs_init = false;

protected:
std::unordered_map<chip_id_t, tt_SocDescriptor> soc_descriptor_per_chip = {};
Expand Down Expand Up @@ -633,7 +633,7 @@ class tt_SiliconDevice: public tt_device
virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_);
virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted);
virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted);
virtual void setup_core_to_tlb_map(std::function<std::int32_t(tt_xy_pair)> mapping_function);
virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function);
virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip);
virtual void start_device(const tt_device_params &device_params);
virtual void assert_risc_reset();
Expand Down Expand Up @@ -661,7 +661,7 @@ class tt_SiliconDevice: public tt_device
/**
* If the tlbs are initialized, returns a tuple with the TLB base address and its size
*/
std::optional<std::tuple<uint32_t, uint32_t>> get_tlb_data_from_target(const tt_xy_pair& target);
std::optional<std::tuple<uint32_t, uint32_t>> get_tlb_data_from_target(const tt_cxy_pair& target);
/**
* This API allows you to write directly to device memory that is addressable by a static TLB
*/
Expand Down Expand Up @@ -705,6 +705,8 @@ class tt_SiliconDevice: public tt_device
virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel);
virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id);
virtual tt_version get_ethernet_fw_version() const;
// TODO: This should be accessible through public API, probably to be moved to tt_device.
PCIDevice *get_pci_device(int device_id) const;

// Destructor
virtual ~tt_SiliconDevice ();
Expand Down Expand Up @@ -760,7 +762,6 @@ class tt_SiliconDevice: public tt_device
int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr);
int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr);
bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
PCIDevice *get_pci_device(int pci_intf_id) const;
std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
void generate_tensix_broadcast_grids_for_grayskull( std::set<std::pair<tt_xy_pair, tt_xy_pair>>& broadcast_grids, std::set<uint32_t>& rows_to_exclude, std::set<uint32_t>& cols_to_exclude);
Expand Down Expand Up @@ -815,7 +816,11 @@ class tt_SiliconDevice: public tt_device
std::map<chip_id_t, std::unordered_map<std::int32_t, std::int32_t>> tlb_config_map = {};
std::set<chip_id_t> all_target_mmio_devices;
std::unordered_map<chip_id_t, std::vector<uint32_t>> host_channel_size;
std::function<std::int32_t(tt_xy_pair)> map_core_to_tlb;

// Note that these maps hold entries only for local PCIe chips.
std::unordered_map<chip_id_t, std::function<std::int32_t(tt_xy_pair)>> map_core_to_tlb_per_chip = {};
std::unordered_map<chip_id_t, bool> tlbs_init_per_chip = {};
broskoTT marked this conversation as resolved.
Show resolved Hide resolved

std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
std::unordered_map<std::string, uint64_t> dynamic_tlb_ordering_modes = {};
std::map<std::set<chip_id_t>, std::unordered_map<chip_id_t, std::vector<std::vector<int>>>> bcast_header_cache = {};
Expand Down
24 changes: 12 additions & 12 deletions device/tt_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) {
throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str()));
}

if (!tlbs_init || !map_core_to_tlb) {
if (!tlbs_init_per_chip[target.chip] || !map_core_to_tlb_per_chip[target.chip]) {
throw std::runtime_error("TLBs not initialized");
}

Expand All @@ -924,7 +924,7 @@ tt::Writer tt_SiliconDevice::get_static_tlb_writer(tt_cxy_pair target) {
throw std::runtime_error("No write-combined mapping for BAR0");
}

auto tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y));
auto tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);

if (!tlb_data.has_value()) {
Expand All @@ -946,8 +946,8 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if(tlbs_init) {
tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y));
if(tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
broskoTT marked this conversation as resolved.
Show resolved Hide resolved
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}

Expand Down Expand Up @@ -987,8 +987,8 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if(tlbs_init) {
tlb_index = map_core_to_tlb(tt_xy_pair(target.x, target.y));
if(tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}
log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value());
Expand Down Expand Up @@ -1165,12 +1165,12 @@ tt_SiliconDevice::~tt_SiliconDevice () {
dynamic_tlb_ordering_modes.clear();
}

std::optional<std::tuple<uint32_t, uint32_t>> tt_SiliconDevice::get_tlb_data_from_target(const tt_xy_pair& target) {
std::optional<std::tuple<uint32_t, uint32_t>> tt_SiliconDevice::get_tlb_data_from_target(const tt_cxy_pair& target) {
std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint32_t, std::uint32_t>> tlb_data;

if (tlbs_init) {
tlb_index = map_core_to_tlb(target);
if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto architecture_implementation = tt::umd::architecture_implementation::create(static_cast<tt::umd::architecture>(arch_name));
tlb_data = architecture_implementation->describe_tlb(tlb_index);
}
Expand Down Expand Up @@ -2909,9 +2909,9 @@ void tt_SiliconDevice::set_driver_eth_interface_params(const tt_driver_eth_inter
eth_interface_params = eth_interface_params_;
}

void tt_SiliconDevice::setup_core_to_tlb_map(std::function<std::int32_t(tt_xy_pair)> mapping_function) {
map_core_to_tlb = mapping_function;
tlbs_init = true;
// Registers the core -> static TLB index mapping function for a single MMIO chip.
// Each local PCIe chip keeps its own mapping and init flag (map_core_to_tlb_per_chip /
// tlbs_init_per_chip), so this must be called once per MMIO device whose static TLBs
// have been configured via configure_tlb.
void tt_SiliconDevice::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
map_core_to_tlb_per_chip[logical_device_id] = mapping_function;
tlbs_init_per_chip[logical_device_id] = true;
}

std::uint32_t tt_SiliconDevice::get_num_dram_channels(std::uint32_t device_id) {
Expand Down
3 changes: 2 additions & 1 deletion tests/api/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
set(API_TESTS_SRCS
test_cluster.cpp
test_chip.cpp
test_cluster_descriptor.cpp
test_cluster.cpp
)

add_executable(api_tests ${API_TESTS_SRCS})
Expand Down
163 changes: 163 additions & 0 deletions tests/api/test_chip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// This file holds Chip specific API examples.

#include <gtest/gtest.h>
#include "fmt/xchar.h"

#include <algorithm>
#include <filesystem>
#include <string>
#include <vector>

#include "tests/test_utils/generate_cluster_desc.hpp"

// TODO: change to tt_cluster
#include "device/tt_device.h"
#include "device/tt_cluster_descriptor.h"

// TODO: write this test to work with Chip not whole Cluster.
using Cluster = tt_SiliconDevice;

inline std::unique_ptr<tt_ClusterDescriptor> get_cluster_desc() {
// TODO: This should not be needed. And could be part of the cluster descriptor probably.
// Note that cluster descriptor holds logical ids of chips.
// Which are different than physical PCI ids, which are /dev/tenstorrent/N ones.
// You have to see if physical PCIe is GS before constructing a cluster descriptor.
std::vector<int> pci_device_ids = PCIDevice::enumerate_devices();
std::set<int> pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end());

tt::ARCH device_arch = tt::ARCH::GRAYSKULL;
if (!pci_device_ids.empty()) {
// TODO: This should be removed from the API, the driver itself should do it.
int physical_device_id = pci_device_ids[0];
// TODO: remove logical_device_id
PCIDevice pci_device (physical_device_id, 0);
device_arch = pci_device.get_arch();
}

// TODO: Make this test work on a host system without any tt devices.
if (pci_device_ids.empty()) {
std::cout << "No Tenstorrent devices found. Skipping test." << std::endl;
return nullptr;
}

// TODO: remove getting manually cluster descriptor from yaml.
std::string yaml_path = test_utils::GetClusterDescYAML();
std::unique_ptr<tt_ClusterDescriptor> cluster_desc;
if (device_arch == tt::ARCH::GRAYSKULL) {
cluster_desc = tt_ClusterDescriptor::create_for_grayskull_cluster(pci_device_ids_set, pci_device_ids);
} else {
cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path);
}

return cluster_desc;
}

// Constructs a Cluster (tt_SiliconDevice) spanning all locally detected chips,
// or returns nullptr when no Tenstorrent hardware is present.
inline std::unique_ptr<Cluster> get_cluster() {
    // TODO: This should not be needed. And could be part of the cluster descriptor probably.
    // Note that cluster descriptor holds logical ids of chips.
    // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones.
    // You have to see if physical PCIe is GS before constructing a cluster descriptor.
    std::vector<int> pci_device_ids = PCIDevice::enumerate_devices();

    // TODO: Make this test work on a host system without any tt devices.
    if (pci_device_ids.empty()) {
        std::cout << "No Tenstorrent devices found. Skipping test." << std::endl;
        return nullptr;
    }

    // TODO: This should be removed from the API, the driver itself should do it.
    // TODO: remove logical_device_id
    PCIDevice pci_device(pci_device_ids[0], 0);
    tt::ARCH device_arch = pci_device.get_arch();

    // TODO: Remove the need to do this, allow default constructor to construct with all chips.
    std::unique_ptr<tt_ClusterDescriptor> cluster_desc = get_cluster_desc();
    const auto detected_chips = cluster_desc->get_all_chips();

    // TODO: make this unordered vs set conversion not needed.
    std::set<chip_id_t> target_devices(detected_chips.begin(), detected_chips.end());

    // TODO: This would be incorporated inside SocDescriptor.
    std::string soc_path;
    if (device_arch == tt::ARCH::GRAYSKULL) {
        soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml");
    } else if (device_arch == tt::ARCH::WORMHOLE || device_arch == tt::ARCH::WORMHOLE_B0) {
        soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml");
    } else if (device_arch == tt::ARCH::BLACKHOLE) {
        soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml");
    } else {
        throw std::runtime_error("Unsupported architecture");
    }

    // TODO: remove getting manually cluster descriptor from yaml.
    std::string yaml_path = test_utils::GetClusterDescYAML();

    // TODO: Don't pass each of these arguments.
    return std::unique_ptr<Cluster>(new Cluster(soc_path, device_arch == tt::ARCH::GRAYSKULL ? "" : yaml_path, target_devices));
}

// TODO: Once default auto TLB setup is in, check it is setup properly.
// Verifies manual static-TLB configuration: get_static_tlb_writer must throw for
// remote chips, for MMIO chips whose TLB map was not yet set up, and for
// non-worker cores; it must succeed for configured worker cores.
TEST(ApiChipTest, ManualTLBConfiguration) {
    std::unique_ptr<Cluster> umd_cluster = get_cluster();

    // get_cluster() returns nullptr on hosts with no Tenstorrent devices;
    // skip instead of dereferencing a null pointer.
    if (umd_cluster == nullptr) {
        GTEST_SKIP() << "No Tenstorrent devices found. Skipping test.";
    }

    // Expect to throw for remote chip for any worker core
    auto remote_chips = umd_cluster->get_target_remote_device_ids();
    if (!remote_chips.empty()) {
        chip_id_t any_remote_chip = *remote_chips.begin();
        const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_remote_chip);
        tt_xy_pair core = soc_desc.workers[0];
        EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_remote_chip, core)), std::runtime_error);
    }

    // Expect to throw for non configured mmio chip.
    chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin();
    const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip);
    tt_xy_pair core = soc_desc.workers[0];
    EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core)), std::runtime_error);

    std::int32_t c_zero_address = 0;

    // Each MMIO chip has its own set of TLBs, so needs its own configuration.
    for (chip_id_t mmio_chip : umd_cluster->get_target_mmio_device_ids()) {
        const tt_SocDescriptor& chip_soc_desc = umd_cluster->get_soc_descriptor(mmio_chip);

        // TODO: This should be part of TTDevice interface, not Cluster or Chip.
        // Build the mapping per chip: the previous version captured the first MMIO
        // chip's soc descriptor and chip id for every chip, which is wrong when
        // chips differ (e.g. harvesting). Capture by value (pointers/ids) so the
        // std::function stored inside the driver never holds dangling references.
        Cluster* cluster_ptr = umd_cluster.get();
        const tt_SocDescriptor* chip_soc_desc_ptr = &chip_soc_desc;
        std::function<int(tt_xy_pair)> get_static_tlb_index =
            [cluster_ptr, chip_soc_desc_ptr, mmio_chip](tt_xy_pair core_xy) -> int {
                // TODO: Make this per arch.
                if (!chip_soc_desc_ptr->is_worker_core(core_xy)) {
                    return -1;
                }
                return core_xy.x + core_xy.y * cluster_ptr->get_pci_device(mmio_chip)->get_architecture_implementation()->get_grid_size_x();
            };

        // Statically map a TLB to each worker core, anchored at address zero.
        for (tt_xy_pair worker_core : chip_soc_desc.workers) {
            umd_cluster->configure_tlb(mmio_chip, worker_core, get_static_tlb_index(worker_core), c_zero_address);
        }

        umd_cluster->setup_core_to_tlb_map(mmio_chip, get_static_tlb_index);
    }

    // Expect not to throw for now configured mmio chip, same one as before.
    EXPECT_NO_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, core)));

    // Expect to throw for non worker cores.
    tt_xy_pair dram_core = soc_desc.dram_cores[0][0];
    EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, dram_core)), std::runtime_error);
    if (!soc_desc.ethernet_cores.empty()) {
        tt_xy_pair eth_core = soc_desc.ethernet_cores[0];
        EXPECT_THROW(umd_cluster->get_static_tlb_writer(tt_cxy_pair(any_mmio_chip, eth_core)), std::runtime_error);
    }
}
5 changes: 5 additions & 0 deletions tests/api/test_cluster.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// This file holds Cluster specific API examples.

#include <gtest/gtest.h>
#include "fmt/xchar.h"
Expand Down
5 changes: 4 additions & 1 deletion tests/api/test_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <gtest/gtest.h>

Expand Down Expand Up @@ -46,7 +49,7 @@ inline std::unique_ptr<tt_ClusterDescriptor> get_cluster_desc() {
return cluster_desc;
}

TEST(ApiTest, DetectArch) {
TEST(ApiClusterDescriptorTest, DetectArch) {
// TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal.
tt::ARCH arch = detect_arch();

Expand Down
9 changes: 3 additions & 6 deletions tests/grayskull/test_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,9 @@ TEST(SiliconDriverGS, HarvestingRuntime) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

device.setup_core_to_tlb_map(get_static_tlb_index);

tt_device_params default_params;
device.start_device(default_params);
device.deassert_risc_reset();
Expand Down Expand Up @@ -154,9 +153,8 @@ TEST(SiliconDriverGS, StaticTLB_RW) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

device.setup_core_to_tlb_map(get_static_tlb_index);

tt_device_params default_params;
device.start_device(default_params);
Expand Down Expand Up @@ -324,10 +322,9 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

device.setup_core_to_tlb_map(get_static_tlb_index);

tt_device_params default_params;
device.start_device(default_params);
device.deassert_risc_reset();
Expand Down
Loading