From 59c968711f60c30fae546a6fbef733cae2e7b4e9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 26 Jan 2024 12:47:45 -0500 Subject: [PATCH 1/4] Update Dockerfile-cuda --- docker/Dockerfile-cuda | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda index ebbc4a460..594d507d9 100644 --- a/docker/Dockerfile-cuda +++ b/docker/Dockerfile-cuda @@ -39,18 +39,34 @@ COPY . /milabench/milabench/ # Use ofed_info -s to get your local version ARG MOFED_VERSION=5.4-3.4.0.0 + +ENV NVARCH=x86_64 +ENV NV_CUDA_CUDART_VERSION=11.8.89-1 +ENV NV_CUDA_COMPAT_PACKAGE=cuda-compat-11-8 +ENV CUDA_VERSION=11.8.0 +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility + + ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y &&\ - apt-get install -y git build-essential curl python3 python-is-python3 python3-pip &&\ + apt-get install -y --no-install-recommends git build-essential gnupg2 curl ca-certificates python3 python-is-python3 python3-pip &&\ curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\ curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\ - curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&\ + curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/{NVARCH}/cuda-keyring_1.1-1_all.deb &&\ dpkg -i cuda-keyring_1.1-1_all.deb &&\ apt-get update -y &&\ - apt-get install -y libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\ + apt-get install -y --no-install-recommends cuda-cudart-11-8=${NV_CUDA_CUDART_VERSION} ${NV_CUDA_COMPAT_PACKAGE} cuda-libraries-11-8=${NV_CUDA_LIB_VERSION} &&\ + apt-get install -y --no-install-recommends cuda-command-line-tools-11-8=${NV_CUDA_LIB_VERSION} cuda-minimal-build-11-8=${NV_CUDA_LIB_VERSION} cuda-nvml-dev-11-8=${NV_NVML_DEV_VERSION} &&\ + apt-get install -y --no-install-recommends libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\ apt-get clean &&\ rm -rf /var/lib/apt/lists/* &&\ - rm cuda-keyring_1.1-1_all.deb + rm cuda-keyring_1.1-1_all.deb &&\ + /bin/sh -c echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf &&\ + echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf # buildkit + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 # Install Rust From ced199403908a9fc414d02d7331ea1a600f5c510 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 1 Mar 2024 10:42:26 -0500 Subject: [PATCH 2/4] Update Dockerfile-cuda --- docker/Dockerfile-cuda | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda index 594d507d9..81b8b459c 100644 --- a/docker/Dockerfile-cuda +++ b/docker/Dockerfile-cuda @@ -1,7 +1,7 @@ -FROM ubuntu:22.04 +# FROM ubuntu:22.04 # For cuda-gdb -# FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 # Arguments # --------- @@ -39,37 +39,18 @@ COPY . /milabench/milabench/ # Use ofed_info -s to get your local version ARG MOFED_VERSION=5.4-3.4.0.0 - -ENV NVARCH=x86_64 -ENV NV_CUDA_CUDART_VERSION=11.8.89-1 -ENV NV_CUDA_COMPAT_PACKAGE=cuda-compat-11-8 -ENV CUDA_VERSION=11.8.0 -ENV NVIDIA_VISIBLE_DEVICES=all -ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility - - ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y &&\ apt-get install -y --no-install-recommends git build-essential gnupg2 curl ca-certificates python3 python-is-python3 python3-pip &&\ curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\ curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\ - curl -o cuda-keyring_1.1-1_all.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/{NVARCH}/cuda-keyring_1.1-1_all.deb &&\ dpkg -i cuda-keyring_1.1-1_all.deb &&\ apt-get update -y &&\ - apt-get install -y --no-install-recommends cuda-cudart-11-8=${NV_CUDA_CUDART_VERSION} ${NV_CUDA_COMPAT_PACKAGE} cuda-libraries-11-8=${NV_CUDA_LIB_VERSION} &&\ - apt-get install -y --no-install-recommends cuda-command-line-tools-11-8=${NV_CUDA_LIB_VERSION} cuda-minimal-build-11-8=${NV_CUDA_LIB_VERSION} cuda-nvml-dev-11-8=${NV_NVML_DEV_VERSION} &&\ - apt-get install -y --no-install-recommends libibverbs1 nvidia-compute-utils-535 nvidia-utils-535 cuda-11-8 &&\ + apt-get install -y --no-install-recommends libibverbs1 &&\ apt-get clean &&\ - rm -rf /var/lib/apt/lists/* &&\ - rm cuda-keyring_1.1-1_all.deb &&\ - /bin/sh -c echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf &&\ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf # buildkit - -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + rm -rf /var/lib/apt/lists/* # Install Rust - RUN curl https://sh.rustup.rs -sSf | sh -s -- -y ENV PATH="/root/.cargo/bin:${PATH}" ENV CUDA_HOME=/usr/local/cuda-11.8 @@ -94,8 +75,4 @@ RUN milabench install --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABEN milabench prepare --config $MILABENCH_CONFIG --base $MILABENCH_BASE $MILABENCH_ARGS &&\ python -m pip cache purge -# Patch for https://github.com/pytorch/pytorch/issues/97041 -RUN cd /milabench/envs/venv/torch/lib/python3.10/site-packages/torch/lib &&\ - ln -sfn libnvrtc-672ee683.so.11.2 libnvrtc.so - CMD milabench run From 10dfb573864c479b6ad2581a9a0dfab8752b5376 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 1 Mar 2024 10:43:38 -0500 Subject: [PATCH 3/4] Update Dockerfile-cuda --- docker/Dockerfile-cuda | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile-cuda b/docker/Dockerfile-cuda index 81b8b459c..459ddb4dd 100644 --- a/docker/Dockerfile-cuda +++ b/docker/Dockerfile-cuda @@ -6,10 +6,12 @@ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 # Arguments # --------- +# Use ofed_info -s to get your local version +ARG MOFED_VERSION=5.4-3.4.0.0 +ARG CONFIG=standard.yaml ARG ARCH=cuda -ENV MILABENCH_GPU_ARCH=$ARCH -ARG CONFIG=standard.yaml +ENV MILABENCH_GPU_ARCH=$ARCH ENV MILABENCH_CONFIG_NAME=$CONFIG ENV MILABENCH_DOCKER=1 @@ -36,15 +38,12 @@ COPY . /milabench/milabench/ # rustc: used by BERT models inside https://pypi.org/project/tokenizers/ # build-essential: for rust -# Use ofed_info -s to get your local version -ARG MOFED_VERSION=5.4-3.4.0.0 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y &&\ - apt-get install -y --no-install-recommends git build-essential gnupg2 curl ca-certificates python3 python-is-python3 python3-pip &&\ + apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip &&\ curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\ curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\ - dpkg -i cuda-keyring_1.1-1_all.deb &&\ apt-get update -y &&\ apt-get install -y --no-install-recommends libibverbs1 &&\ apt-get clean &&\ From 66e9e155a3bebe5d3e7eb6c43faa7611dc89bfad Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 1 Mar 2024 16:04:13 +0000 Subject: [PATCH 4/4] update capcity resolution --- .github/workflows/tests.yml | 4 ++-- config/base.yaml | 2 +- milabench/cli/publish.py | 17 ++++++++--------- milabench/common.py | 9 +++++---- milabench/config.py | 29 ++++++++++++++++++++++------- milabench/log.py | 12 ++++++------ milabench/merge.py | 1 - milabench/scripts/vcs.py | 1 + milabench/sizer.py | 14 +++++--------- 9 files changed, 50 insertions(+), 39 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1192bd0d7..7d456f9bb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -26,9 +26,9 @@ jobs: matrix: include: - arch: cuda - exclude : "no-cuda" + exclude : "unsupported-cuda" # - arch: rocm - # exclude : "no-rocm" + # exclude : "unsupported-rocm" runs-on: [self-hosted, "${{ matrix.arch }}"] diff --git a/config/base.yaml b/config/base.yaml index e5043e8e4..ddb804cdc 100644 --- a/config/base.yaml +++ b/config/base.yaml @@ -521,7 +521,7 @@ rwkv: tags: - llm - rnn - - no-rocm + - unsupported-rocm plan: method: per_gpu argv: diff --git a/milabench/cli/publish.py b/milabench/cli/publish.py index cb60812d3..077cda9cd 100644 --- a/milabench/cli/publish.py +++ b/milabench/cli/publish.py @@ -1,19 +1,18 @@ -import re import json -import subprocess -from contextlib import contextmanager import multiprocessing -from dataclasses import dataclass -from urllib.parse import urlparse, ParseResult -import time -import threading -import signal import os +import re +import signal +import subprocess import sys +import threading +import time +from contextlib import contextmanager +from dataclasses import dataclass +from urllib.parse import ParseResult, urlparse from coleo import Option, tooled - SLEEP = 0.01 _INIT = 0 _READY = 1 diff --git a/milabench/common.py b/milabench/common.py index 70789b212..35f9cf125 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -75,7 +75,7 @@ def arguments(): # Define capabilities capabilities: Option = "" - + return CommonArguments( config, system, @@ -91,7 +91,7 @@ def arguments(): def get_multipack(args = None, run_name=None, overrides={}): if args is None: args = arguments() - + override = [ o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override ] @@ -225,13 +225,14 @@ def _get_multipack( arch = deduce_arch() base_defaults = get_base_defaults( - base=args.base, - arch=arch, + base=args.base, + arch=arch, run_name=run_name ) system_config = build_system_config( args.system, defaults={"system": base_defaults["_defaults"]["system"]}, + gpu=True ) overrides = merge({"*": system_config}, overrides) diff --git a/milabench/config.py b/milabench/config.py index fa3b85f47..b6617688b 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -1,4 +1,5 @@ import contextvars +import os import socket import psutil @@ -174,16 +175,29 @@ def resolve_addresses(nodes): return self -def get_gpu_capacity(): - capacity = float("+inf") +def get_gpu_capacity(strict=False): + try: + capacity = 0 + + for k, v in get_gpu_info()["gpus"].items(): + capacity = min(v["memory"]["total"], capacity) + + return capacity + except: + print("GPU not available, defaulting to 0 MiB") + if strict: + raise + return 0 - for k, v in get_gpu_info()["gpus"].items(): - capacity = min(v["memory"]["total"], capacity) - return capacity +def is_autoscale_enabled(): + return ( + os.getenv("MILABENCH_SIZER_AUTO", False) + or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None + ) -def build_system_config(config_file, defaults=None): +def build_system_config(config_file, defaults=None, gpu=True): """Load the system configuration, verify its validity and resolve ip addresses Notes @@ -204,7 +218,8 @@ def build_system_config(config_file, defaults=None): system = config.get("system", {}) - if "gpu" not in system: + # capacity is only required if batch resizer is enabled + if (gpu or is_autoscale_enabled()) and not "gpu" not in system: system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"} if system.get("sshkey") is not None: diff --git a/milabench/log.py b/milabench/log.py index 5826d309b..a6f7388a9 100644 --- a/milabench/log.py +++ b/milabench/log.py @@ -300,9 +300,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) row["gpu_load"] = f"{load}%" row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB" row["gpu_temp"] = f"{temp}C" @@ -376,9 +376,9 @@ def on_data(self, entry, data, row): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) temp = int(data.get("temperature", 0)) - row[ - f"gpu:{gpuid}" - ] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + row[f"gpu:{gpuid}"] = ( + f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" + ) else: task = data.pop("task", "") units = data.pop("units", "") diff --git a/milabench/merge.py b/milabench/merge.py index e5010c629..a9efa4cec 100644 --- a/milabench/merge.py +++ b/milabench/merge.py @@ -1,6 +1,5 @@ """Utilities to merge dictionaries and other data structures.""" - from collections import deque from functools import reduce from typing import Union diff --git a/milabench/scripts/vcs.py b/milabench/scripts/vcs.py index f1a8c4ddf..0f895f886 100644 --- a/milabench/scripts/vcs.py +++ b/milabench/scripts/vcs.py @@ -1,5 +1,6 @@ """Use to retrieve GIT version info, this file cannot import milabench modules as it is executed as part of the installation process""" + import os import subprocess import warnings diff --git a/milabench/sizer.py b/milabench/sizer.py index a2aa8b87b..4ce2a3f22 100644 --- a/milabench/sizer.py +++ b/milabench/sizer.py @@ -6,7 +6,7 @@ import numpy as np import yaml -from .config import system_global +from .config import is_autoscale_enabled, system_global from .validation.validation import ValidationLayer ROOT = os.path.dirname(__file__) @@ -14,13 +14,6 @@ default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml") -def is_autoscale_enabled(): - return ( - os.getenv("MILABENCH_SIZER_AUTO", False) - or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None - ) - - def getenv(name, type): value = os.getenv(name) @@ -109,6 +102,9 @@ def get_capacity(self, capacity): def auto_size(self, benchmark, capacity): capacity = self.get_capacity(capacity) + if capacity is None: + return None + config = self.benchscaling(benchmark) data = list(sorted(config["model"].items(), key=lambda x: x[0])) @@ -182,7 +178,7 @@ def scale_argv(pack, argv): sizer = sizer_global.get() system = system_global.get() - capacity = system["gpu"]["capacity"] + capacity = system.get("gpu", dict()).get("capacity") return sizer.argv(pack, capacity, argv)