Skip to content

Commit

Permalink
ci: Switch to FW templates for build (NVIDIA#11077)
Browse files (browse the repository at this point in the history)
* ci: Switch to FW templates for build

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* fix image tag

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* use labels for building and cleaning

Signed-off-by: Oliver Koenig <[email protected]>

* fix tag

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

* fix

Signed-off-by: Oliver Koenig <[email protected]>

---------

Signed-off-by: Oliver Koenig <[email protected]>
  • Loading branch information
ko3n1g authored Nov 6, 2024
1 parent 30235e0 commit fa3b13b
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 61 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
- name: Docker pull image
run: |
docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }}
docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
- name: Start container
run: |
Expand All @@ -60,7 +60,7 @@ jobs:
ARG=("--runtime=nvidia --gpus all")
fi
docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
- id: main
name: Run main script
Expand Down
73 changes: 16 additions & 57 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,69 +48,29 @@ jobs:
id: all
run: |
echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
gpu-test:
needs: [pre-flight]
runs-on: self-hosted-azure
if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Run nvidia-smi test
run: |
whoami
nvidia-smi

cicd-cluster-clean:
runs-on: self-hosted-azure-builder
needs: [pre-flight]
cicd-test-container-build:
if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }}
steps:
- name: Clean server from old files
run: |
docker system prune --filter "until=24h" --filter "label=nemo.library=nemo-core" --force
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
with:
image-name: nemo_container
dockerfile: Dockerfile.ci
image-label: nemo-core
build-args: |
IMAGE_LABEL=nemo-core
prune-filter-timerange: 24h

cicd-test-container-setup:
needs: [cicd-cluster-clean, pre-flight]
needs: [cicd-test-container-build, pre-flight]
runs-on: self-hosted-azure-builder
if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }}
outputs:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
all: ${{ needs.pre-flight.outputs.all }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
# We use `docker` driver as this speeds things up for
# trivial (non-multi-stage) builds.
driver: docker

- name: Restore cache
run: |
docker pull nemoci.azurecr.io/nemo_container:latest
docker pull nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} || true
- name: Build and push
uses: docker/build-push-action@v5
with:
file: Dockerfile.ci
push: true
cache-from: |
nemoci.azurecr.io/nemo_container:latest
nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }}
cache-to: type=inline
tags: |
nemoci.azurecr.io/nemo_container_${{ github.run_id }}
nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }}
nemoci.azurecr.io/nemo_container:latest
- name: Run some checks
run: |
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
# PyTorch Lightning version
python -c "import pytorch_lightning; print(pytorch_lightning.__version__)"
Expand Down Expand Up @@ -475,7 +435,7 @@ jobs:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
Expand Down Expand Up @@ -527,7 +487,7 @@ jobs:
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
Expand Down Expand Up @@ -3611,7 +3571,7 @@ jobs:
# runs-on: self-hosted-azure
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
Expand Down Expand Up @@ -3676,7 +3636,7 @@ jobs:
# needs: [cicd-test-container-setup]
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# # --user 0:128
# --device=/dev/nvidia0
Expand Down Expand Up @@ -4341,7 +4301,6 @@ jobs:
Nemo_CICD_Test:
needs:
- pre-flight
- gpu-test
- cicd-test-container-setup

- L0_Unit_Tests_GPU_ASR
Expand Down
5 changes: 3 additions & 2 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
# limitations under the License.

ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3

ARG IMAGE_LABEL
FROM ${BASE_IMAGE}
LABEL "nemo.library"="nemo-core"

LABEL "nemo.library"=${IMAGE_LABEL}

ENV TRANSFORMERS_OFFLINE=0
ENV HYDRA_FULL_ERROR=1
Expand Down

0 comments on commit fa3b13b

Please sign in to comment.