mila-iqia · Delaunay · Mar 1, 2024 · Jan 26, 2024 · Mar 1, 2024 · Mar 1, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -26,9 +26,9 @@ jobs:
       matrix:
         include:
           - arch: cuda
-            exclude : "no-cuda"
+            exclude : "unsupported-cuda"
           # - arch: rocm
-          #   exclude : "no-rocm"
+          #   exclude : "unsupported-rocm"
 
     runs-on: [self-hosted, "${{ matrix.arch }}"]
 

diff --git a/config/base.yaml b/config/base.yaml
@@ -522,7 +522,7 @@ rwkv:
   tags:
     - llm
     - rnn
-    - no-rocm
+    - unsupported-rocm
   plan:
     method: per_gpu
   argv:

diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda
@@ -1,15 +1,17 @@
-FROM ubuntu:22.04
+# FROM ubuntu:22.04
 
 # For cuda-gdb
-# FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
+FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
 
 # Arguments
 # ---------
 
+# Mellanox OFED version to install; run `ofed_info -s` on the host to find yours
+ARG MOFED_VERSION=5.4-3.4.0.0
+ARG CONFIG=standard.yaml
 ARG ARCH=cuda
-ENV MILABENCH_GPU_ARCH=$ARCH
 
-ARG CONFIG=standard.yaml
+ENV MILABENCH_GPU_ARCH=$ARCH
 ENV MILABENCH_CONFIG_NAME=$CONFIG
 ENV MILABENCH_DOCKER=1
 
@@ -36,24 +38,18 @@ COPY . /milabench/milabench/
 #           rustc: used by BERT models inside https://pypi.org/project/tokenizers/
 # build-essential: for rust
 
-# Use ofed_info -s to get your local version
-ARG MOFED_VERSION=5.4-3.4.0.0
 
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update -y &&\
-    apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\
+    apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip &&\
     curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\
     curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\
-    curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\
-    dpkg -i cuda-keyring_1.1-1_all.deb &&\
     apt-get update -y &&\
-    apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\
+    apt-get install -y --no-install-recommends libibverbs1 &&\
     apt-get clean &&\
-    rm -rf /var/lib/apt/lists/* &&\
-    rm cuda-keyring_1.1-1_all.deb
+    rm -rf /var/lib/apt/lists/* 
 
 # Install Rust
-
 RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
 ENV PATH="/root/.cargo/bin:${PATH}"
 ENV CUDA_HOME=/usr/local/cuda-11.8
@@ -78,8 +74,4 @@ RUN milabench install --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABEN
     milabench prepare --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABENCH_ARGS &&\
     python -m pip cache purge                                                          
 
-# Patch for https://github.com/pytorch/pytorch/issues/97041
-RUN cd /milabench/envs/venv/torch/lib/python3.10/site-packages/torch/lib                &&\
-   ln -sfn libnvrtc-672ee683.so.11.2 libnvrtc.so
-
 CMD milabench run
diff --git a/milabench/cli/publish.py b/milabench/cli/publish.py
@@ -1,19 +1,18 @@
-import re
 import json
-import subprocess
-from contextlib import contextmanager
 import multiprocessing
-from dataclasses import dataclass
-from urllib.parse import urlparse, ParseResult
-import time
-import threading
-import signal
 import os
+import re
+import signal
+import subprocess
 import sys
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from urllib.parse import ParseResult, urlparse
 
 from coleo import Option, tooled
 
-
 SLEEP = 0.01
 _INIT = 0
 _READY = 1

diff --git a/milabench/common.py b/milabench/common.py
@@ -75,7 +75,7 @@ def arguments():
 
     # Define capabilities
     capabilities: Option = ""
-    
+
     return CommonArguments(
         config,
         system,
@@ -91,7 +91,7 @@ def arguments():
 def get_multipack(args = None, run_name=None, overrides={}):
     if args is None:
         args = arguments()
-        
+
     override = [
         o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override
     ]
@@ -225,13 +225,14 @@ def _get_multipack(
 
     arch = deduce_arch()
     base_defaults = get_base_defaults(
-        base=args.base, 
-        arch=arch, 
+        base=args.base,
+        arch=arch,
         run_name=run_name
     )
     system_config = build_system_config(
         args.system,
         defaults={"system": base_defaults["_defaults"]["system"]},
+        gpu=True
     )
     overrides = merge({"*": system_config}, overrides)
 

diff --git a/milabench/config.py b/milabench/config.py
@@ -1,4 +1,5 @@
 import contextvars
+import os
 import socket
 
 import psutil
@@ -174,16 +175,28 @@ def resolve_addresses(nodes):
     return self
 
 
-def get_gpu_capacity():
-    capacity = float(0)
+def get_gpu_capacity(strict=False):
+    """Return the smallest per-GPU total memory; 0 on failure unless strict."""
+    try:
+        gpus = get_gpu_info()["gpus"].values()
+        # Seeding min() with 0 always produced 0; take the real minimum.
+        capacity = min((gpu["memory"]["total"] for gpu in gpus), default=0)
 
-    for k, v in get_gpu_info()["gpus"].items():
-        capacity = min(v["memory"]["total"], capacity)
+        return capacity
+    except Exception:
+        print("GPU not available, defaulting to 0 MiB")
+        if strict:
+            raise
+        return 0
 
-    return capacity
+def is_autoscale_enabled():
+    """Truthy when either batch-sizer environment variable requests scaling."""
+    auto = os.getenv("MILABENCH_SIZER_AUTO", False)
+    multiple = os.getenv("MILABENCH_SIZER_MULTIPLE")
+    return auto or multiple is not None
 
 
-def build_system_config(config_file, defaults=None):
+def build_system_config(config_file, defaults=None, gpu=True):
     """Load the system configuration, verify its validity and resolve ip addresses
 
     Notes
@@ -204,7 +217,8 @@ def build_system_config(config_file, defaults=None):
 
     system = config.get("system", {})
 
-    if "gpu" not in system:
+    # Probe a default capacity only when needed; never override a configured gpu
+    if (gpu or is_autoscale_enabled()) and "gpu" not in system:
         system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"}
 
     if system.get("sshkey") is not None:

diff --git a/milabench/log.py b/milabench/log.py
@@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
                 load = int(data.get("load", 0) * 100)
                 currm, totalm = data.get("memory", [0, 0])
                 temp = int(data.get("temperature", 0))
-                row[
-                    f"gpu:{gpuid}"
-                ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                row[f"gpu:{gpuid}"] = (
+                    f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                )
                 row["gpu_load"] = f"{load}%"
                 row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
                 row["gpu_temp"] = f"{temp}C"
@@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
                 load = int(data.get("load", 0) * 100)
                 currm, totalm = data.get("memory", [0, 0])
                 temp = int(data.get("temperature", 0))
-                row[
-                    f"gpu:{gpuid}"
-                ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                row[f"gpu:{gpuid}"] = (
+                    f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
+                )
         else:
             task = data.pop("task", "")
             units = data.pop("units", "")

diff --git a/milabench/merge.py b/milabench/merge.py
@@ -1,6 +1,5 @@
 """Utilities to merge dictionaries and other data structures."""
 
-
 from collections import deque
 from functools import reduce
 from typing import Union

diff --git a/milabench/scripts/vcs.py b/milabench/scripts/vcs.py
@@ -1,5 +1,6 @@
 """Use to retrieve GIT version info, this file cannot import milabench modules
 as it is executed as part of the installation process"""
+
 import os
 import subprocess
 import warnings

diff --git a/milabench/sizer.py b/milabench/sizer.py
@@ -6,21 +6,14 @@
 import numpy as np
 import yaml
 
-from .config import system_global
+from .config import is_autoscale_enabled, system_global
 from .validation.validation import ValidationLayer
 
 ROOT = os.path.dirname(__file__)
 
 default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml")
 
 
-def is_autoscale_enabled():
-    return (
-        os.getenv("MILABENCH_SIZER_AUTO", False)
-        or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
-    )
-
-
 def getenv(name, type):
     value = os.getenv(name)
 
@@ -109,6 +102,9 @@ def get_capacity(self, capacity):
     def auto_size(self, benchmark, capacity):
         capacity = self.get_capacity(capacity)
 
+        if capacity is None:
+            return None
+
         config = self.benchscaling(benchmark)
 
         data = list(sorted(config["model"].items(), key=lambda x: x[0]))
@@ -182,7 +178,7 @@ def scale_argv(pack, argv):
     sizer = sizer_global.get()
     system = system_global.get()
 
-    capacity = system["gpu"]["capacity"]
+    capacity = system.get("gpu", dict()).get("capacity")
 
     return sizer.argv(pack, capacity, argv)