Satyaog/feature/covalent #217

Open
wants to merge 3 commits into master
204 changes: 204 additions & 0 deletions .github/workflows/cloud-ci.yml
@@ -0,0 +1,204 @@
name: cloud-tests

on:
# Runs for pull requests
pull_request:
branches:
- master

permissions:
id-token: write
contents: write

jobs:
cloud-tests:
strategy:
fail-fast: true
max-parallel: 1
matrix:
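# system is "<nodes>:<gpus>"; e.g. "2n:4g" = 2 nodes with 4 GPUs each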
system: ["1n:1g", "1n:4g", "2n:4g"]
include:
- arch: cuda
exclude: "no-cuda"
# - arch: rocm
# exclude: "no-rocm"

runs-on: ubuntu-latest
environment: cloud-ci

# Cancel previous jobs if a new version was pushed
concurrency:
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
cancel-in-progress: true

defaults:
run:
shell: bash -el {0}

env:
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
MILABENCH_BASE: "../output"
MILABENCH_ARGS: ""
MILABENCH_DASH: "no"
MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
AZURE_CORE_OUTPUT: none
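# Benchmarks that require multiple GPUs or multiple nodes; used below to build the --select/--exclude lists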
_MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,llava-gpus,resnet152-ddp-gpus,llm-full-mp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus"
_MULTI_NODES: "multinode"

steps:
- uses: actions/checkout@v3
with:
token: ${{ github.token }}

- uses: actions/setup-python@v2
with:
python-version: '3.10'

# Follow
# https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
# to generate a clientId as well as a clientSecret
- name: Azure login
uses: azure/login@v2
with:
creds: |
{
"clientId": "${{ secrets.ARM_CLIENT_ID }}",
"clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
"subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
"tenantId": "${{ secrets.ARM_TENANT_ID }}"
}

- name: dependencies
run: |
python -m pip install -U pip
python -m pip install -U poetry
poetry lock --no-update
poetry install

- name: setup cloud credentials
run: |
mkdir -p ~/.aws
mkdir -p ~/.ssh/covalent
echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
echo "[default]" >~/.aws/credentials
echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials
echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials
chmod -R a-rwx,u+rwX ~/.aws ~/.ssh

- name: start covalent server
run: |
poetry run -- python3 -m milabench.scripts.covalent serve start --develop

- name: setup cloud
run: |
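# Split the matrix entry "<nodes>:<gpus>" into node and GPU counts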
nodes=$(echo "${{ matrix.system }}" | cut -d":" -f1)
gpus=$(echo "${{ matrix.system }}" | cut -d":" -f2)
case "$nodes" in
"1n")
MILABENCH_SYSTEM="config/cloud-system.yaml"
EXCLUDE="$EXCLUDE,$_MULTI_NODES"
;;
"2n")
MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
SELECT="$SELECT,$_MULTI_NODES"
EXCLUDE="$EXCLUDE,$_MULTI_GPUS"
;;
*)
exit 1
;;
esac
case "$gpus" in
"1g")
RUN_ON="azure__a100"
EXCLUDE="$EXCLUDE,$_MULTI_GPUS,$_MULTI_NODES"
;;
"2g")
RUN_ON="azure__a100_x2"
SELECT="$SELECT,$_MULTI_GPUS"
;;
"4g")
RUN_ON="azure__a100_x4"
SELECT="$SELECT,$_MULTI_GPUS"
;;
*)
exit 1
;;
esac

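# SELECT/EXCLUDE were built by appending ",$ITEM"; strip any leading empty field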
if [[ -z "$(echo "$SELECT" | cut -d"," -f1)" ]]
then
SELECT="$(echo "$SELECT" | cut -d"," -f2-)"
fi

if [[ -z "$(echo "$EXCLUDE" | cut -d"," -f1)" ]]
then
EXCLUDE="$(echo "$EXCLUDE" | cut -d"," -f2-)"
fi

if [[ ! -z "$SELECT" ]]
then
SELECT="--select $SELECT"
fi

if [[ ! -z "$EXCLUDE" ]]
then
EXCLUDE="--exclude $EXCLUDE"
fi

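# Persist the computed values for the following steps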
echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV

poetry run milabench cloud \
--setup \
--run-on $RUN_ON \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON

echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV
echo "SELECT=$SELECT" >>$GITHUB_ENV
echo "EXCLUDE=$EXCLUDE" >>$GITHUB_ENV

- name: install benchmarks
run: |
poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDE

- name: prepare benchmarks
run: |
poetry run milabench prepare $SELECT $EXCLUDE

- name: run benchmarks
run: |
poetry run milabench run $SELECT $EXCLUDE

- name: Summary
run: |
git config credential.${{ github.server_url }}.username ${{ github.actor }}
git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
git config --global user.email "[email protected]"
git config --global user.name "GitHub CI"
poetry run milabench report --push
env:
GITHUB_TOKEN: ${{ github.token }}

- name: DEBUG state file
if: always()
run: |
cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate

- name: teardown cloud
if: always()
run: |
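# The setup step suffixed MILABENCH_SYSTEM with ".$RUN_ON"; restore the original path if that file exists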
if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
then
export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
fi
poetry run milabench cloud \
--teardown \
--run-on $RUN_ON \
--all

- name: DEBUG logs
if: always()
run: |
cat ~/.cache/covalent/covalent_ui.log
46 changes: 46 additions & 0 deletions benchmarks/_templates/simple/requirements.cpu.txt

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion benchmarks/llm/configs/llama3_70B_full.yaml
@@ -36,7 +36,7 @@ checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
checkpoint_files: [
model-00001-of-00030.safetensors,
model-00001-of-00030.safetensors,
model-00002-of-00030.safetensors,
model-00003-of-00030.safetensors,
model-00004-of-00030.safetensors,
14 changes: 11 additions & 3 deletions benchmarks/llm/prepare.py
@@ -23,7 +23,6 @@
class Arguments:
recipe: str
config: str = None
no_pretrained: bool = False


@dataclass
@@ -100,12 +99,19 @@ def load_model(recipe, cfg):


def generate_weights(args, config):
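# A marker file lets repeated prepare runs skip the expensive generation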
is_done: Path = args.output_dir / "generated"
if is_done.exists():
print(f"{args.output_dir}/*.safetensors or consolidated.*.pth already generated")
return

if config.get("safetensors", False):
params_path = args.output_dir / "config.json"
model = LlamaForCausalLM(LlamaConfig(**json.loads(params_path.read_text())))
# Avoid saving this as part of the config.
del model.config._name_or_path
model.config.torch_dtype = torch.float16
# Even if the model is loaded with config.torch_dtype == bf16, model.dtype
# seems to be f32. Force model.dtype to match config.torch_dtype.
model.to(model.config.torch_dtype)
model.save_pretrained(str(args.output_dir), safe_serialization=True)

else:
@@ -138,6 +144,8 @@ def generate_weights(args, config):
conn.send(True)
p.join()

is_done.touch()


def main():
parser = ArgumentParser()
@@ -154,7 +162,7 @@ def main():

#
huggingface_format = config.get("safetensors", False)
pretrained = not args.no_pretrained
pretrained = not config.get("no_pretrained", False)

if not pretrained:
# if we will generate the weights, do not download any weights
10 changes: 8 additions & 2 deletions config/base.yaml
@@ -10,6 +10,8 @@ _defaults:
gpu_load_threshold: 0.5
gpu_mem_threshold: 0.5

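# Run on a single machine by default; multi-node benchmarks override this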
num_machines: 1

_torchvision:
inherits: _defaults
definition: ../benchmarks/torchvision
@@ -549,7 +551,7 @@ llm-lora-single:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true

no_pretrained=True: true

llm-lora-ddp-gpus:
inherits: _llm
@@ -569,7 +571,7 @@ llm-lora-ddp-gpus:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true

no_pretrained=True: true

llm-lora-ddp-nodes:
tags:
@@ -592,6 +594,7 @@ llm-lora-ddp-nodes:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true
no_pretrained=True: true

num_machines: 2
requires_capabilities:
@@ -617,6 +620,7 @@ llm-lora-mp-gpus:
repo_id="meta-llama/Meta-Llama-3.1-70B": true
batch_size=8: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

llm-full-mp-gpus:
inherits: _llm
@@ -637,6 +641,7 @@ llm-full-mp-gpus:
safetensors=true: true
batch_size=2: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

llm-full-mp-nodes:
tags:
@@ -660,6 +665,7 @@ llm-full-mp-nodes:
safetensors=true: true
batch_size=2: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

num_machines: 2
requires_capabilities: