From 36a1b3d1dccc33326001900f5ec7eb3e79564451 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 26 Jan 2024 13:34:41 -0500 Subject: [PATCH 01/10] Add dataset revision (#187) * Add dataset revision * add default rev --- benchmarks/accelerate_opt/main.py | 9 ++++++--- config/base.yaml | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/accelerate_opt/main.py b/benchmarks/accelerate_opt/main.py index 7d1a33e61..bc2ead051 100644 --- a/benchmarks/accelerate_opt/main.py +++ b/benchmarks/accelerate_opt/main.py @@ -35,6 +35,7 @@ def arguments(): parser.add_argument("--validation_split_percentage", required=True, type=int) parser.add_argument("--dataset_name", required=True, type=str) parser.add_argument("--dataset_config_name", required=True, type=str) + parser.add_argument("--dataset_rev", required=True, type=str) parser.add_argument("--cache", required=True, type=str) parser.add_argument("--model_name", required=True, type=str) parser.add_argument("--prepare_only", action="store_true", default=False) @@ -180,17 +181,19 @@ def mblog(data): validation_split_percentage = config["validation_split_percentage"] dataset_name = config["dataset_name"] dataset_config_name = config["dataset_config_name"] - raw_datasets = load_dataset(dataset_name, dataset_config_name) + raw_datasets = load_dataset(dataset_name, dataset_config_name, revision=config["dataset_rev"]) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( dataset_name, dataset_config_name, - split=f"train[:{validation_split_percentage}%]", + split=f"train[:{validation_split_percentage}%]", + revision=config["dataset_rev"] ) raw_datasets["train"] = load_dataset( dataset_name, dataset_config_name, - split=f"train[{validation_split_percentage}%:]", + split=f"train[{validation_split_percentage}%:]", + revision=config["dataset_rev"] ) model_name = config["model_name"] diff --git a/config/base.yaml b/config/base.yaml index e5043e8e4..daa358f77 100644 --- 
a/config/base.yaml +++ b/config/base.yaml @@ -109,6 +109,7 @@ _accelerate_opt: --max_train_steps: 100 --dataset_name: "wikitext" --dataset_config_name: "wikitext-103-v1" + --dataset_rev: "b08601e" --validation_split_percentage: 5 --per_gpu_batch_size: 1 --cpus_per_gpu: 8 From 3047692a404907fdd58c6e66f14f4a88604b24b7 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Fri, 26 Jan 2024 13:38:20 -0500 Subject: [PATCH 02/10] Update README.md (#180) --- README.md | 269 +++++++++++++++++------------------------------------- 1 file changed, 84 insertions(+), 185 deletions(-) diff --git a/README.md b/README.md index 71bad28cc..dbb032b7f 100644 --- a/README.md +++ b/README.md @@ -1,197 +1,96 @@ # Milabench -[Documentation](https://mila-iqia.github.io/milabench) +[Documentation](https://milabench.readthedocs.io/en/stable/) + +Benchmarking framework for Machine learning and Artificial Intelligence, geared toward +evaluating current and future hardware in a research environment. + +* Simple / Hands-off +* Wide selection of models on diverse applications + * Multi GPUs + * Multi node + * nlp / transformer / llm / rl / rnn + * vision / classification / convnet / resnet / transformer + * audio +* Docker Container +* Works on slurm +* Automatic batch resize +* Focussed on training +* Ease of use +* Pytorch focused +* ROCm & NVIDIA +* Independent + +## Getting Started + +The easiest way to run milabench is to run it with one of its docker images. +It will include all of the necessary data + + + # Choose the image you want to use + export MILABENCH_IMAGE=ghcr.io/mila-iqia/milabench:cuda-nightly + + # Pull the image we are going to run + docker pull $MILABENCH_IMAGE + + # Run milabench + docker run -it --rm --ipc=host --gpus=all \ + -v $(pwd)/results:/milabench/envs/runs \ + $MILABENCH_IMAGE \ + milabench run + + ================= + Benchmark results + ================= + perf: items/sec, (sum batchsize) / (elapsed time), higher is better. 
+ peak_memory: MiB + fail n perf sem% std% peak_memory score weight + bert-fp16 0 1 49.82 0.0% 0.2% 23952 49.815508 0.00 + bert-fp32 0 1 20.78 0.0% 0.2% 30922 20.783989 0.00 + bert-tf32 0 1 20.79 0.0% 0.2% 30922 20.787725 0.00 + bert-tf32-fp16 0 1 49.70 0.1% 0.3% 23952 49.697091 3.00 + bf16 0 1 7.91 0.0% 0.1% 1140 7.910341 0.00 + convnext_large-fp16 0 1 123.77 2.5% 13.6% 26632 123.767014 0.00 + convnext_large-fp32 0 1 32.69 0.5% 2.6% 45356 32.687851 0.00 + convnext_large-tf32 0 1 32.64 0.5% 2.6% 45356 32.636185 0.00 + convnext_large-tf32-fp16 0 1 124.93 2.5% 13.4% 26632 124.930007 3.00 + davit_large 0 1 114.54 1.3% 9.8% 32374 114.539282 1.00 + davit_large-multi 0 1 115.18 1.2% 9.3% 32374 115.176873 5.00 + dlrm 0 1 255977.96 0.5% 4.0% 6354 255977.960840 1.00 + focalnet 0 1 151.78 1.6% 12.4% 24098 151.775544 2.00 + fp16 0 1 101.03 0.1% 0.6% 1142 101.025637 0.00 + fp32 0 1 14.42 0.0% 0.2% 1524 14.418942 0.00 + reformer 0 1 10.22 0.0% 0.1% 24756 10.222305 1.00 + regnet_y_128gf 0 1 30.52 0.3% 1.9% 30748 30.518845 2.00 + resnet152 0 1 232.63 1.1% 8.1% 29904 232.629851 1.00 + resnet152-multi 0 1 232.14 1.0% 7.7% 30614 232.144301 5.00 + resnet50 0 1 490.08 2.5% 19.0% 4166 490.076388 1.00 + rwkv 0 1 109.45 0.3% 2.0% 4944 109.449712 1.00 + stargan 0 1 11.40 4.2% 31.9% 35648 11.399463 1.00 + super-slomo 0 1 11.46 0.1% 0.5% 36364 11.463760 1.00 + t5 0 1 13.91 0.6% 4.5% 34794 13.913109 2.00 + tf32 0 1 14.43 0.0% 0.2% 1524 14.430707 0.00 + whisper 0 1 81.71 0.1% 0.6% 35968 81.705971 1.00 + + Scores + ------ + Failure rate: 0.00% (OK) + Score: 10.68 + + +## Details The benchmark suite has been validated on the following configurations: -| Python version | GPU | Configuration file | -| - | - | - | -| 3.9.12 (conda) | 4x NVIDIA A100 80GB | config/standard.yaml | +| Python version | GPU | Configuration file | +| - | - | - | +| 3.9.12 (conda) | 4x NVIDIA A100 80GB | config/standard.yaml | | 3.9.12 (conda) | 4x NVIDIA RTX8000 48GB | config/standard.yaml | -| 3.9.16 (conda) | 2x NVIDIA 
K80 | config/ci.yaml | -| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | +| 3.9.16 (conda) | 2x NVIDIA K80 | config/ci.yaml | +| 3.9.16 (conda) | 2x AMD MI100 | config/ci.yaml | We are working on validating it on more configurations and will update the above table as we do. - From c96be01faeea9490d1b65b932a7cff7e385df918 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 5 Feb 2024 08:52:50 -0500 Subject: [PATCH 03/10] Use node["port"] to ssh to the node (#189) * Use node["port"] to ssh to the node * Update benchfile.py --- benchmarks/accelerate_opt/benchfile.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/accelerate_opt/benchfile.py b/benchmarks/accelerate_opt/benchfile.py index 56bd98bb9..8dfe62c30 100644 --- a/benchmarks/accelerate_opt/benchfile.py +++ b/benchmarks/accelerate_opt/benchfile.py @@ -44,6 +44,7 @@ def build_run_plan(self): for rank, node in enumerate(nodes): host = node["ip"] user = node["user"] + port = node.get("port", 22) options = dict() if rank == 0: @@ -63,6 +64,7 @@ def build_run_plan(self): host=host, user=user, key=key, + port=port, executor=DockerRunCommand( AccelerateLaunchCommand(pack, rank=rank), self.config["system"].get("docker_image"), From 604ad7f40129bc46a1458053adfaad65c7a4b4c1 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Tue, 20 Feb 2024 09:09:10 -0500 Subject: [PATCH 04/10] Update README.md (#195) --- README.md | 62 +++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index dbb032b7f..e71634e1c 100644 --- a/README.md +++ b/README.md @@ -43,40 +43,38 @@ It will include all of the necessary data ================= Benchmark results ================= - perf: items/sec, (sum batchsize) / (elapsed time), higher is better. 
- peak_memory: MiB - fail n perf sem% std% peak_memory score weight - bert-fp16 0 1 49.82 0.0% 0.2% 23952 49.815508 0.00 - bert-fp32 0 1 20.78 0.0% 0.2% 30922 20.783989 0.00 - bert-tf32 0 1 20.79 0.0% 0.2% 30922 20.787725 0.00 - bert-tf32-fp16 0 1 49.70 0.1% 0.3% 23952 49.697091 3.00 - bf16 0 1 7.91 0.0% 0.1% 1140 7.910341 0.00 - convnext_large-fp16 0 1 123.77 2.5% 13.6% 26632 123.767014 0.00 - convnext_large-fp32 0 1 32.69 0.5% 2.6% 45356 32.687851 0.00 - convnext_large-tf32 0 1 32.64 0.5% 2.6% 45356 32.636185 0.00 - convnext_large-tf32-fp16 0 1 124.93 2.5% 13.4% 26632 124.930007 3.00 - davit_large 0 1 114.54 1.3% 9.8% 32374 114.539282 1.00 - davit_large-multi 0 1 115.18 1.2% 9.3% 32374 115.176873 5.00 - dlrm 0 1 255977.96 0.5% 4.0% 6354 255977.960840 1.00 - focalnet 0 1 151.78 1.6% 12.4% 24098 151.775544 2.00 - fp16 0 1 101.03 0.1% 0.6% 1142 101.025637 0.00 - fp32 0 1 14.42 0.0% 0.2% 1524 14.418942 0.00 - reformer 0 1 10.22 0.0% 0.1% 24756 10.222305 1.00 - regnet_y_128gf 0 1 30.52 0.3% 1.9% 30748 30.518845 2.00 - resnet152 0 1 232.63 1.1% 8.1% 29904 232.629851 1.00 - resnet152-multi 0 1 232.14 1.0% 7.7% 30614 232.144301 5.00 - resnet50 0 1 490.08 2.5% 19.0% 4166 490.076388 1.00 - rwkv 0 1 109.45 0.3% 2.0% 4944 109.449712 1.00 - stargan 0 1 11.40 4.2% 31.9% 35648 11.399463 1.00 - super-slomo 0 1 11.46 0.1% 0.5% 36364 11.463760 1.00 - t5 0 1 13.91 0.6% 4.5% 34794 13.913109 2.00 - tf32 0 1 14.43 0.0% 0.2% 1524 14.430707 0.00 - whisper 0 1 81.71 0.1% 0.6% 35968 81.705971 1.00 - + fail n perf sem% std% peak_memory score weight + bert-fp16 0 8 155.08 0.3% 4.3% 24552 1241.260310 0.00 + bert-fp32 0 8 29.52 0.0% 0.5% 31524 236.337218 0.00 + bert-tf32 0 8 120.46 0.4% 6.1% 31524 964.713297 0.00 + bert-tf32-fp16 0 8 154.76 0.3% 4.1% 24552 1238.477257 3.00 + convnext_large-fp16 0 8 337.48 0.9% 14.0% 27658 2741.604444 0.00 + convnext_large-fp32 0 8 44.61 0.8% 12.6% 49786 354.207225 0.00 + convnext_large-tf32 0 8 135.99 0.7% 11.2% 49786 1089.394916 0.00 + 
convnext_large-tf32-fp16 0 8 338.58 0.8% 13.0% 27658 2744.325170 3.00 + davit_large 0 8 312.79 0.3% 6.7% 35058 2515.326450 1.00 + davit_large-multi 0 1 2401.65 1.0% 7.7% 42232 2401.651720 5.00 + dlrm 0 1 188777.20 1.8% 14.0% 3194 188777.203190 1.00 + focalnet 0 8 400.47 0.2% 5.4% 26604 3215.431924 2.00 + opt-1_3b 0 1 26.71 0.1% 0.4% 44116 26.714365 5.00 + opt-1_3b-multinode 0 2 34.62 0.2% 1.0% 43552 34.618292 10.00 + opt-6_7b 0 1 14.32 0.0% 0.1% 55750 14.319587 5.00 + opt-6_7b-multinode 0 2 10.79 0.1% 0.7% 49380 10.792595 10.00 + reformer 0 8 61.70 0.0% 0.9% 25376 494.110834 1.00 + regnet_y_128gf 0 8 99.96 0.2% 5.0% 31840 803.012507 2.00 + resnet152 0 8 710.18 0.3% 6.2% 36732 5710.828608 1.00 + resnet152-multi 0 1 5367.34 1.0% 8.1% 38638 5367.338469 5.00 + resnet50 0 8 984.43 0.9% 19.1% 5026 7927.257351 1.00 + rwkv 0 8 428.65 0.2% 3.8% 5546 3435.097716 1.00 + stargan 0 8 51.32 1.8% 40.8% 37848 413.238870 1.00 + super-slomo 0 8 41.63 0.1% 2.3% 34082 332.395065 1.00 + t5 0 8 48.05 0.2% 3.9% 35466 384.317023 2.00 + whisper 0 8 248.16 0.0% 0.6% 37006 1985.861017 1.00 + Scores ------ - Failure rate: 0.00% (OK) - Score: 10.68 + Failure rate: 0.00% (PASS) + Score: 219.06 ## Details From 2dea49852fc3e153831341a1dc704e43ae999173 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 26 Feb 2024 10:57:58 -0500 Subject: [PATCH 05/10] Build docker container for reporting (#197) --- .github/workflows/report_container.yml | 67 ++++++++++++++++++++++++++ docker/Dockerfile-report | 48 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 .github/workflows/report_container.yml create mode 100644 docker/Dockerfile-report diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml new file mode 100644 index 000000000..fbe9bc378 --- /dev/null +++ b/.github/workflows/report_container.yml @@ -0,0 +1,67 @@ +name: Publish Docker image for reports + +on: + # Allow manual runs + workflow_dispatch: + + # Only run for push on the main branch or 
for tagged version + push: + branches: + - master + tags: + - v*.*.* + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + + +permissions: + packages: write + + +# define build arguments +jobs: + build-image: + strategy: + fail-fast: false + + permissions: + contents: read + packages: write + + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Get Image Tag Name + env: + GITHUB_REF_NAME_ENV: ${{ github.ref_name }} + run: | + IMAGE_TAG="report" + + - name: Log in to the registry + uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for the image + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=report-${{ env.IMAGE_TAG }} + + - name: Build and push the image + uses: docker/build-push-action@v3 + with: + context: . + push: true + file: docker/Dockerfile-report + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + CONFIG=standard.yaml diff --git a/docker/Dockerfile-report b/docker/Dockerfile-report new file mode 100644 index 000000000..629cade0f --- /dev/null +++ b/docker/Dockerfile-report @@ -0,0 +1,48 @@ +FROM ubuntu:22.04 + + +# Arguments +# --------- + +ARG ARCH=cuda +ENV MILABENCH_GPU_ARCH=$ARCH + +ARG CONFIG=standard.yaml +ENV MILABENCH_CONFIG_NAME=$CONFIG +ENV MILABENCH_DOCKER=1 + +# Paths +# ----- + +ENV MILABENCH_CONFIG=/milabench/milabench/config/$MILABENCH_CONFIG_NAME +ENV MILABENCH_BASE=/milabench/envs +ENV MILABENCH_OUTPUT=/milabench/results/ +ENV MILABENCH_ARGS="" + +# Copy milabench +# -------------- + +WORKDIR /milabench +COPY . 
/milabench/milabench/ + +# Install Dependencies +# -------------------- + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y &&\ + apt-get install -y python3 python-is-python3 python3-pip &&\ + apt-get update -y &&\ + apt-get clean &&\ + rm -rf /var/lib/apt/lists/* + +# Install Milabench +# ----------------- + +RUN python -m pip install -U pip &&\ + python -m pip install -U setuptools &&\ + python -m pip install -U poetry &&\ + python -m pip install -e /milabench/milabench/ &&\ + python -m pip cache purge + +CMD milabench report + From 562e493c37af09015829e5bfeb8487a44da349d6 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 26 Feb 2024 11:01:45 -0500 Subject: [PATCH 06/10] Add missing property (#198) --- .github/workflows/report_container.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml index fbe9bc378..88eff866f 100644 --- a/.github/workflows/report_container.yml +++ b/.github/workflows/report_container.yml @@ -19,10 +19,11 @@ env: permissions: packages: write - # define build arguments jobs: build-image: + runs-on: ubuntu-22.04 + strategy: fail-fast: false From 468770bd932d485c814bb1e2e03279141bc692d9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 26 Feb 2024 11:20:39 -0500 Subject: [PATCH 07/10] Add git to docker (#199) --- docker/Dockerfile-report | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile-report b/docker/Dockerfile-report index 629cade0f..a967a1cfe 100644 --- a/docker/Dockerfile-report +++ b/docker/Dockerfile-report @@ -30,7 +30,7 @@ COPY . 
/milabench/milabench/ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y &&\ - apt-get install -y python3 python-is-python3 python3-pip &&\ + apt-get install -y git python3 python-is-python3 python3-pip &&\ apt-get update -y &&\ apt-get clean &&\ rm -rf /var/lib/apt/lists/* From 629bee3d8a4365512d74339e9731d01f2972a8dd Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 26 Feb 2024 11:27:43 -0500 Subject: [PATCH 08/10] Simplify name (#200) --- .github/workflows/report_container.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml index 88eff866f..673b20d83 100644 --- a/.github/workflows/report_container.yml +++ b/.github/workflows/report_container.yml @@ -4,18 +4,10 @@ on: # Allow manual runs workflow_dispatch: - # Only run for push on the main branch or for tagged version - push: - branches: - - master - tags: - - v*.*.* - env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - permissions: packages: write @@ -54,7 +46,7 @@ jobs: with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | - type=raw,value=report-${{ env.IMAGE_TAG }} + type=raw,value=report - name: Build and push the image uses: docker/build-push-action@v3 From 9bcb741f3eee4fa250acd9668c10b44f1d121fda Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 26 Feb 2024 11:46:02 -0500 Subject: [PATCH 09/10] Make sure report works without GPU (#201) Co-authored-by: Pierre Delaunay --- milabench/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milabench/config.py b/milabench/config.py index fa3b85f47..14c8bf46e 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -175,7 +175,7 @@ def resolve_addresses(nodes): def get_gpu_capacity(): - capacity = float("+inf") + capacity = float(0) for k, v in get_gpu_info()["gpus"].items(): capacity = min(v["memory"]["total"], capacity) From 0bf63487d2b99d46c2c205a65b9b5b6c6e298e43 Mon Sep 17 00:00:00 2001 From: 
Setepenre Date: Mon, 26 Feb 2024 12:53:49 -0500 Subject: [PATCH 10/10] Tag report containers (#202) --- .github/workflows/report_container.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/report_container.yml b/.github/workflows/report_container.yml index 673b20d83..1d48daedd 100644 --- a/.github/workflows/report_container.yml +++ b/.github/workflows/report_container.yml @@ -31,7 +31,7 @@ jobs: env: GITHUB_REF_NAME_ENV: ${{ github.ref_name }} run: | - IMAGE_TAG="report" + echo "IMAGE_TAG=$GITHUB_REF_NAME_ENV" >> $GITHUB_ENV - name: Log in to the registry uses: docker/login-action@v2 @@ -46,7 +46,7 @@ jobs: with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | - type=raw,value=report + type=raw,value=report-${{ env.IMAGE_TAG }} - name: Build and push the image uses: docker/build-push-action@v3