Skip to content

Commit

Permalink
Fix #44 - check for presence of GPUs before running probes
Browse files Browse the repository at this point in the history
  • Loading branch information
Lars T Hansen committed Oct 7, 2024
1 parent d636b16 commit 09e4145
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
23 changes: 19 additions & 4 deletions src/amd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,18 @@ use crate::ps::UserTable;
use crate::TIMEOUT_SECONDS;

use std::cmp::Ordering;
use std::path::Path;

#[cfg(test)]
use crate::util::map;

// Report whether AMD accelerators appear to be present on this node.
//
// The test is simply whether the path /sys/module/amdgpu exists. On all nodes we've looked at
// (ML systems, Lumi), that path exists iff there are AMD accelerators present, so it serves as a
// cheap guard before running any probe.

fn amd_present() -> bool {
    Path::new("/sys/module/amdgpu").exists()
}

// We only have one machine with AMD GPUs at UiO and rocm-smi is unable to show eg how much memory
// is installed on each card on this machine, so this is pretty limited. But we are at least able
// to extract gross information about the installed cards.
Expand All @@ -40,6 +48,9 @@ use crate::util::map;
// too small. This is presumably all driver dependent.)

pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
if !amd_present() {
return None
}
match command::safe_command("rocm-smi", &["--showproductname"], TIMEOUT_SECONDS) {
Ok(raw_text) => {
let mut cards = vec![];
Expand All @@ -65,12 +76,16 @@ pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
}
}

/// Get information about AMD cards.
///
/// Err(e) really means the command started running but failed, for the reason given. If the
/// command could not be found, we return Ok(vec![]).
// Get information about AMD cards.
//
// Err(e) really means the command started running but failed, for the reason given. If the
// command could not be found or no card is present, we return Ok(vec![]).

pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
if !amd_present() {
return Ok(vec![])
}

// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command
// twice. Not a happy situation.

Expand Down
16 changes: 15 additions & 1 deletion src/nvidia.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,17 @@ use crate::ps::UserTable;
use crate::util;
use crate::TIMEOUT_SECONDS;

use std::path::Path;
#[cfg(test)]
use crate::util::map;

// Report whether NVIDIA accelerators appear to be present on this node.
//
// The test is simply whether the path /sys/module/nvidia exists. On all nodes we've looked at
// (Fox, Betzy, ML systems), that path exists iff there are NVIDIA accelerators present, so it
// serves as a cheap guard before running any probe.

fn nvidia_present() -> bool {
    Path::new("/sys/module/nvidia").exists()
}

// `nvidia-smi -a` dumps a lot of information about all the cards in a semi-structured form,
// each line a textual keyword/value pair.
//
Expand All @@ -24,6 +32,9 @@ use crate::util::map;
// Parsing all the output lines in order yields the information about all the cards.

pub fn get_nvidia_configuration() -> Option<Vec<gpu::Card>> {
if !nvidia_present() {
return None
}
match command::safe_command("nvidia-smi", &["-a"], TIMEOUT_SECONDS) {
Ok(raw_text) => {
let mut cards = vec![];
Expand Down Expand Up @@ -74,9 +85,12 @@ pub fn get_nvidia_configuration() -> Option<Vec<gpu::Card>> {
}

// Err(e) really means the command started running but failed, for the reason given. If the
// command could not be found, we return Ok(vec![]).
// command could not be found or no card is present, we return Ok(vec![]).

pub fn get_nvidia_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
if !nvidia_present() {
return Ok(vec![])
}
match command::safe_command(NVIDIA_PMON_COMMAND, NVIDIA_PMON_ARGS, TIMEOUT_SECONDS) {
Ok(pmon_raw_text) => {
let mut processes = parse_pmon_output(&pmon_raw_text, user_by_pid)?;
Expand Down

0 comments on commit 09e4145

Please sign in to comment.