From 09e414518140a5d6e5ce3439c87e40e5bdc51b1b Mon Sep 17 00:00:00 2001 From: Lars T Hansen Date: Mon, 7 Oct 2024 10:50:17 +0200 Subject: [PATCH] Fix #44 - check for presence of GPUs before running probes --- src/amd.rs | 23 +++++++++++++++++++---- src/nvidia.rs | 16 +++++++++++++++- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/amd.rs b/src/amd.rs index ec9e4a2..ab4f4cf 100644 --- a/src/amd.rs +++ b/src/amd.rs @@ -19,10 +19,18 @@ use crate::ps::UserTable; use crate::TIMEOUT_SECONDS; use std::cmp::Ordering; +use std::path::Path; #[cfg(test)] use crate::util::map; +// On all nodes we've looked at (ML systems, Lumi), /sys/module/amdgpu exists iff there are AMD +// accelerators present. + +fn amd_present() -> bool { + return Path::new("/sys/module/amdgpu").exists() +} + // We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show eg how much memory // is installed on each card on this machine, so this is pretty limited. But we are at least able // to extract gross information about the installed cards. @@ -40,6 +48,9 @@ use crate::util::map; // too small. This is presumably all driver dependent.) pub fn get_amd_configuration() -> Option> { + if !amd_present() { + return None + } match command::safe_command("rocm-smi", &["--showproductname"], TIMEOUT_SECONDS) { Ok(raw_text) => { let mut cards = vec![]; @@ -65,12 +76,16 @@ pub fn get_amd_configuration() -> Option> { } } -/// Get information about AMD cards. -/// -/// Err(e) really means the command started running but failed, for the reason given. If the -/// command could not be found, we return Ok(vec![]). +// Get information about AMD cards. +// +// Err(e) really means the command started running but failed, for the reason given. If the +// command could not be found or no card is present, we return Ok(vec![]). pub fn get_amd_information(user_by_pid: &UserTable) -> Result, String> { + if !amd_present() { + return Ok(vec![]) + } + // I've not been able to combine the two invocations of rocm-smi yet; we have to run the command // twice. Not a happy situation. diff --git a/src/nvidia.rs b/src/nvidia.rs index e8c7080..4d0c368 100644 --- a/src/nvidia.rs +++ b/src/nvidia.rs @@ -12,9 +12,17 @@ use crate::ps::UserTable; use crate::util; use crate::TIMEOUT_SECONDS; +use std::path::Path; #[cfg(test)] use crate::util::map; +// On all nodes we've looked at (Fox, Betzy, ML systems), /sys/module/nvidia exists iff there are +// nvidia accelerators present. + +fn nvidia_present() -> bool { + return Path::new("/sys/module/nvidia").exists() +} + // `nvidia-smi -a` dumps a lot of information about all the cards in a semi-structured form, // each line a textual keyword/value pair. // @@ -24,6 +32,9 @@ use crate::util::map; // Parsing all the output lines in order yields the information about all the cards. pub fn get_nvidia_configuration() -> Option> { + if !nvidia_present() { + return None + } match command::safe_command("nvidia-smi", &["-a"], TIMEOUT_SECONDS) { Ok(raw_text) => { let mut cards = vec![]; @@ -74,9 +85,12 @@ pub fn get_nvidia_configuration() -> Option> { } // Err(e) really means the command started running but failed, for the reason given. If the -// command could not be found, we return Ok(vec![]). +// command could not be found or no card is present, we return Ok(vec![]). pub fn get_nvidia_information(user_by_pid: &UserTable) -> Result, String> { + if !nvidia_present() { + return Ok(vec![]) + } match command::safe_command(NVIDIA_PMON_COMMAND, NVIDIA_PMON_ARGS, TIMEOUT_SECONDS) { Ok(pmon_raw_text) => { let mut processes = parse_pmon_output(&pmon_raw_text, user_by_pid)?;