Skip to content

Commit

Permalink
Fix Metal build: put detect_arch() back (#175)
Browse files Browse the repository at this point in the history
This family of functions was removed during a refactor. Metal is relying
on one of them. This change reintroduces it.

Fixes #171
  • Loading branch information
joelsmithTT authored Oct 17, 2024
1 parent d8486fd commit 35aa474
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 3 deletions.
28 changes: 28 additions & 0 deletions device/pcie/pci_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,17 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
}
}

/**
 * Translates this device's PCI device ID into the architecture it identifies.
 *
 * @return tt::ARCH::GRAYSKULL, tt::ARCH::WORMHOLE_B0 or tt::ARCH::BLACKHOLE
 *         when device_id matches the corresponding PCIe device ID;
 *         tt::ARCH::Invalid for any other device_id.
 */
tt::ARCH PciDeviceInfo::get_arch() const {
    if (this->device_id == GS_PCIE_DEVICE_ID) {
        return tt::ARCH::GRAYSKULL;
    }
    if (this->device_id == WH_PCIE_DEVICE_ID) {
        return tt::ARCH::WORMHOLE_B0;
    }
    // The Blackhole check must use the Blackhole ID: comparing against
    // WH_PCIE_DEVICE_ID again can never match here, which would make every
    // Blackhole device report tt::ARCH::Invalid.
    // NOTE(review): assumes BH_PCIE_DEVICE_ID is declared next to the
    // GS/WH device ID constants - confirm.
    if (this->device_id == BH_PCIE_DEVICE_ID) {
        return tt::ARCH::BLACKHOLE;
    }
    return tt::ARCH::Invalid;
}


/* static */ std::vector<int> PCIDevice::enumerate_devices() {
std::vector<int> device_ids;
Expand All @@ -212,6 +223,23 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
return device_ids;
}

/* static */ std::map<int, PciDeviceInfo> PCIDevice::enumerate_devices_info() {
std::map<int, PciDeviceInfo> infos;
for (int n : PCIDevice::enumerate_devices()) {
int fd = open(fmt::format("/dev/tenstorrent/{}", n).c_str(), O_RDWR | O_CLOEXEC);
if (fd == -1) {
continue;
}

try {
infos[n] = read_device_info(fd);
} catch (...) {}

close(fd);
}
return infos;
}

PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
: device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number))
, pci_device_num(pci_device_number)
Expand Down
16 changes: 13 additions & 3 deletions device/pcie/pci_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

#include <cstdint>
#include <cstdio>
#include <vector>
#include <map>
#include <unordered_map>
#include <vector>

#include "device/tt_xy_pair.h"
#include "device/tt_arch_types.h"
Expand Down Expand Up @@ -41,13 +42,17 @@ struct PciDeviceInfo
uint16_t pci_bus;
uint16_t pci_device;
uint16_t pci_function;

tt::ARCH get_arch() const;
// TODO: does it make sense to move attributes that we can read from sysfs
// onto this struct as methods? e.g. current_link_width etc.
};

class PCIDevice {
const std::string device_path; // Path to character device: /dev/tenstorrent/N
const int pci_device_num; // N in /dev/tenstorrent/N
const int pci_device_num; // N in /dev/tenstorrent/N
const int logical_id; // Unique identifier for each device in entire network topology
const int pci_device_file_desc; // Character device file descriptor
const int pci_device_file_desc; // Character device file descriptor
const PciDeviceInfo info; // PCI device info
const int numa_node; // -1 if non-NUMA
const int revision; // PCI revision value from sysfs
Expand All @@ -60,6 +65,11 @@ class PCIDevice {
*/
static std::vector<int> enumerate_devices();

/**
* Opens each device number reported by enumerate_devices() and reads its
* PciDeviceInfo. Devices that cannot be opened, or whose info cannot be read,
* are omitted from the result rather than reported as errors.
*
* @return a map of PCI device numbers (/dev/tenstorrent/N) to PciDeviceInfo
*/
static std::map<int, PciDeviceInfo> enumerate_devices_info();

/**
* PCI device constructor.
*
Expand Down
3 changes: 3 additions & 0 deletions device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

using TLB_DATA = tt::umd::tlb_data;

// TODO: Remove this - it's here for Metal backwards compatibility.
// Implementation is in tt_silicon_driver.cpp.
// Returns the architecture of the device at /dev/tenstorrent/<pci_device_num>,
// or tt::ARCH::Invalid when no device info is available for that number.
tt::ARCH detect_arch(int pci_device_num);

namespace boost::interprocess{
class named_mutex;
Expand Down
21 changes: 21 additions & 0 deletions device/tt_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages
// TLB size for DRAM on blackhole - 4GB
const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024;

// Backwards-compatibility shim for Metal.
//
// Metal calls this to learn the architecture of the first PCIe chip and then
// checks that every subsequent chip reports the same architecture. It appears
// to do so only because we offer no other way. Once the refactoring has
// progressed and `tt_device` is more of a Cluster abstraction, Metal should
// instead be given interfaces for:
//   1. Checking that all chips are of the same architecture (we may not care
//      about this, but the application might).
//   2. Getting the architecture of a specific chip.
// Until then, this function is kept so that Metal still builds the next time
// someone bumps its UMD submodule version.
//
// Returns tt::ARCH::Invalid when no device info is found for pci_device_num.
tt::ARCH detect_arch(int pci_device_num) {
    const auto known_devices = PCIDevice::enumerate_devices_info();
    const auto match = known_devices.find(pci_device_num);

    return (match != known_devices.end()) ? match->second.get_arch() : tt::ARCH::Invalid;
}

template <typename T>
void size_buffer_to_capacity(std::vector<T> &data_buf, std::size_t size_in_bytes) {
std::size_t target_size = 0;
Expand Down

0 comments on commit 35aa474

Please sign in to comment.