fix: avoid nvidia-smi if it will fail
Avoid the GPU query when running in a SLURM environment without a GPU. If the node doesn't have NVIDIA drivers, this query will fail and the process will get killed.
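The failure mode being guarded against can be reproduced in a few lines (a minimal sketch, not part of this commit; it assumes nvidia-smi is absent or broken on the node):

import subprocess

try:
    subprocess.check_output(["nvidia-smi", "--query-gpu=gpu_name", "--format=csv"])
except FileNotFoundError as error:
    # Raised when the nvidia-smi binary is not installed on the node.
    print(f"nvidia-smi unavailable: {error}")
except subprocess.CalledProcessError as error:
    # Raised when nvidia-smi exists but exits non-zero (e.g. no driver loaded).
    print(f"nvidia-smi failed: {error}")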
PabloNA97 committed Sep 27, 2024
1 parent 015f34e commit f43b56c
Showing 1 changed file with 55 additions and 0 deletions.

openfe/utils/system_probe.py
@@ -273,6 +273,55 @@ def _get_hostname() -> str:
    return socket.gethostname()


def _slurm_environment() -> bool:
    """
    Check if the current environment is managed by SLURM.
    """
    slurm_job_id = os.environ.get("SLURM_JOB_ID")

    if slurm_job_id:
        return True
    else:
        return False


def _check_slurm_gpu_info() -> bool:
    """
    Check if GPU information is available in the SLURM environment.

    Returns
    -------
    bool
        True if GPU information is available in the SLURM environment,
        False otherwise.

    Notes
    -----
    This function checks whether GPU information is available in the SLURM
    environment by inspecting the environment variables. It returns True if
    any of the following environment variables are present:

    - 'SLURM_JOB_GPUS'
    - 'SLURM_GPUS'
    - 'CUDA_VISIBLE_DEVICES'

    Otherwise, it returns False.
    """
    slurm_job_gpus = os.environ.get("SLURM_JOB_GPUS")
    slurm_gpus = os.environ.get("SLURM_GPUS")
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")

    logging.debug(f"SLURM_JOB_GPUS: {slurm_job_gpus}")
    logging.debug(f"SLURM_GPUS: {slurm_gpus}")
    logging.debug(f"CUDA_VISIBLE_DEVICES: {cuda_visible_devices}")

    if slurm_job_gpus or slurm_gpus or cuda_visible_devices:
        return True
    else:
        return False

def _get_gpu_info() -> dict[str, dict[str, str]]:
    """
    Get GPU information using the 'nvidia-smi' command-line utility.
@@ -336,6 +385,12 @@ def _get_gpu_info() -> dict[str, dict[str, str]]:
"utilization.memory,memory.total,driver_version,"
)

if _slurm_environment() and not _check_slurm_gpu_info():
logging.debug(

Check warning on line 389 in openfe/utils/system_probe.py

View check run for this annotation

Codecov / codecov/patch

openfe/utils/system_probe.py#L389

Added line #L389 was not covered by tests
"SLURM environment detected, but GPU information is not available."
)
return {}

Check warning on line 392 in openfe/utils/system_probe.py

View check run for this annotation

Codecov / codecov/patch

openfe/utils/system_probe.py#L392

Added line #L392 was not covered by tests

try:
nvidia_smi_output = subprocess.check_output(
["nvidia-smi", GPU_QUERY, "--format=csv"]
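The new code paths only trigger on SLURM nodes, which is why they are hard to cover in CI. A sketch of how they could be unit-tested with pytest's built-in monkeypatch fixture (test names and assertions are assumptions, not part of this commit):

from openfe.utils import system_probe

def test_slurm_environment_detected(monkeypatch):
    # Simulate running inside a SLURM allocation.
    monkeypatch.setenv("SLURM_JOB_ID", "12345")
    assert system_probe._slurm_environment()

def test_gpu_info_skipped_without_gpu(monkeypatch):
    # A SLURM job with no GPU-related variables set: the guard should
    # short-circuit and return {} before nvidia-smi is ever invoked.
    monkeypatch.setenv("SLURM_JOB_ID", "12345")
    for var in ("SLURM_JOB_GPUS", "SLURM_GPUS", "CUDA_VISIBLE_DEVICES"):
        monkeypatch.delenv(var, raising=False)
    assert system_probe._get_gpu_info() == {}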
