Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Sep 12, 2024
1 parent b5d38ba commit 5e152f0
Show file tree
Hide file tree
Showing 18 changed files with 394 additions and 44 deletions.
23 changes: 14 additions & 9 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,36 @@ jobs:
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
run: |
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
for DRIVER_VERSION in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
status=0
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${COMMIT_SHORT_SHA}-${DRIVER_VERSION}"
# add escape character for space
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
Expand All @@ -80,4 +85,4 @@ jobs:
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
retention-days: 15
192 changes: 179 additions & 13 deletions .github/workflows/precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,34 @@ on:
- cron: '00 09 * * *' # scheduled job

jobs:
pre-compiled:
set-driver-version-matrix:
  # Publishes the driver branches and kernel flavors as compact JSON arrays so
  # that downstream jobs can feed them to fromJson() or iterate them with jq.
  runs-on: ubuntu-latest
  outputs:
    driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }}
    kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Read driver versions
      id: extract_driver_branch
      run: |
        # Driver branches to build, emitted as a JSON array of strings.
        branches=("535" "550")
        branches_json=$(jq -cn '$ARGS.positional' --args "${branches[@]}")
        echo "driver_branch=$branches_json" >> $GITHUB_OUTPUT
        # Kernel flavors to build, emitted the same way.
        flavors=("aws" "azure" "generic" "nvidia" "oracle")
        flavors_json=$(jq -cn '$ARGS.positional' --args "${flavors[@]}")
        echo "kernel_flavors=$flavors_json" >> $GITHUB_OUTPUT
precompiled-image:
needs: set-driver-version-matrix
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535
- 550
flavor:
- aws
- azure
- generic
- nvidia
- oracle
driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
steps:
- uses: actions/checkout@v4
name: Check out code
Expand Down Expand Up @@ -64,10 +79,10 @@ jobs:
VERSION: ${COMMIT_SHORT_SHA}
BASE_TARGET: jammy
run: |
make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }}
docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
# try 3 times every 10 seconds to get the file, if success exit the loop
for i in {1..3}; do
docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
Expand All @@ -81,4 +96,155 @@ jobs:
DIST: signed_ubuntu22.04
run: |
source kernel_version.txt && \
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION}
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
determine-e2e-test-matrix:
  # Works out which kernel versions the precompiled e2e tests should run
  # against and exposes them as a JSON array (matrix_values) together with a
  # 0/1 flag (matrix_values_not_empty) that gates the e2e job.
  runs-on: ubuntu-latest
  needs:
    - precompiled-image
    - set-driver-version-matrix
  outputs:
    matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
    matrix_values: ${{ steps.set_kernel_version.outputs.matrix_values }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set kernel version
      id: set_kernel_version
      env:
        BASE_TARGET: "jammy"
        DIST: "ubuntu22.04"
      run: |
        # Initial value; a "1" is appended below once a testable tag is found.
        echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
        kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
        kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
        driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')
        kernel_versions=()
        for kernel_flavor in $kernel_flavors; do
          # FIXME -- remove if condition, once azure kernel upgrade starts working
          if [[ "$kernel_flavor" == "azure" ]]; then
            echo "skipping azure kernel testing"
            continue
          fi
          # The sourced script is expected to set should_continue and
          # KERNEL_VERSION for this flavor/branch -- TODO confirm against
          # tests/scripts/findkernelversion.sh. The first branch that asks to
          # continue is enough to schedule the flavor.
          for DRIVER_BRANCH in $driver_branch; do
            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
            if [[ "$should_continue" == true ]]; then
              echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
              break
            fi
          done
          if [[ "$should_continue" == false ]]; then
            echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          else
            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
            kernel_versions+=("$KERNEL_VERSION")
            echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
          fi
        done
        # Convert the array to JSON. With an empty array, printf '%s\n' still
        # prints one blank line, which jq would turn into [""]; handle that
        # case explicitly so the output is a genuinely empty list.
        if [[ ${#kernel_versions[@]} -eq 0 ]]; then
          echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
        else
          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
        fi
        echo "matrix_values=$(jq -c . $GITHUB_WORKSPACE/matrix_values.json)" >> $GITHUB_OUTPUT
e2e-tests-nvidiadriver:
  # Runs the precompiled-driver e2e validation once per kernel version from
  # determine-e2e-test-matrix: provision an instance, upgrade its kernel,
  # wait for it to come back, then validate every driver branch on it.
  runs-on: ubuntu-latest
  needs:
    - determine-e2e-test-matrix
    - set-driver-version-matrix
  if: ${{ needs.determine-e2e-test-matrix.outputs.matrix_values_not_empty == '1' }}
  strategy:
    matrix:
      kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
  steps:
    - name: Check out code
      uses: actions/checkout@v4
    - name: Set up Holodeck
      # NOTE(review): the version ref below looks mangled by e-mail
      # obfuscation in this view -- confirm the real holodeck action tag.
      uses: NVIDIA/[email protected]
      env:
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
        AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
        holodeck_config: "tests/holodeck.yaml"

    - name: Get public dns name
      id: get_public_dns_name
      uses: mikefarah/yq@master
      with:
        cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
    - name: Set and Calculate test vars
      run: |
        # Values appended to GITHUB_ENV are available to the following steps.
        echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
        echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
        echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
        echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
        echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
        KERNEL_VERSION="${{ matrix.kernel_version }}"
        echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
    - name: Upgrade the kernel for Precompiled e2e test
      env:
        UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
      run: |
        status=0
        ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
        # On the target system, all scripts/test-case exit with code 1 for error handling.
        # However, since reboot-related disconnections break the SSH connection
        # and can cause the entire job to exit, we should ignore all errors except
        # exit code 1. During a reboot, exit code 1 will not be thrown, so handling
        # other errors as code 1 will ensure proper management of reboot scenarios
        if [ $status -eq 1 ]; then
          echo "Kernel version $KERNEL_VERSION upgrade failed"
          exit 1
        fi
        # Start from a clean status: a tolerated non-1 code from the reboot
        # disconnect above must not be reported as a reconnect failure when
        # remote_retry.sh itself succeeds.
        status=0
        ./tests/scripts/remote_retry.sh || status=$?
        if [ $status -ne 0 ]; then
          echo "Failed to connect to remote instance"
          exit $status
        fi
    - name: Precompiled e2e test gpu driver validation
      env:
        TEST_CASE: "./tests/cases/nvidia-driver.sh"
        GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
      run: |
        rc=0
        # for precompiled driver we are setting driver branch as driver version
        driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
        driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
        for DRIVER_VERSION in $driver_versions; do
          echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
          status=0
          OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
          # add escape character for space
          OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS")
          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$?
          # Any non-zero exit is a failure here (no reboot is expected during
          # validation), so connection errors or crashes are not reported as
          # a passing run.
          if [ $status -ne 0 ]; then
            echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
            rc=$status
          fi
        done
        # Collect the remote logs before propagating the overall result.
        ./tests/scripts/pull.sh /tmp/logs logs
        exit $rc
    - name: Archive test logs
      if: ${{ failure() }}
      uses: actions/upload-artifact@v4
      with:
        name: nvidiadriver-Precompiled-e2e-test-logs
        path: ./logs/
        retention-days: 15
8 changes: 8 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
#! /bin/bash
# This test case runs the operator installation / test case with the options passed as the first argument (exported as TEST_CASE_ARGS).

if [[ $# -lt 1 ]]; then
echo "Error: $0 must be called with driver options"
exit 1
fi

# export gpu-operator options
export TEST_CASE_ARGS="$1"

SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
source "${SCRIPTS_DIR}"/.definitions.sh

Expand Down
12 changes: 12 additions & 0 deletions tests/ci-remote-exec.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# CI entry point for running a script on the remote test instance.
#
# Usage: ci-remote-exec.sh REMOTE_EXEC [ARGS...]
# All arguments are forwarded unchanged to tests/remote-exec-local.sh.
# TEST_DIR is derived from the current directory, so this must be started
# from the directory that contains tests/.

set -xe

if [[ $# -lt 1 ]]; then
    echo "Error:$0 must be called with 1(REMOTE_EXEC) or more than 1 args (REMOTE_EXEC, ARGS1 ARGS2 etc)"
    exit 1
fi

TEST_DIR="$(pwd)/tests"

# Quoted so a checkout path containing spaces is not word-split.
"${TEST_DIR}/remote-exec-local.sh" "$@"
8 changes: 2 additions & 6 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@
set -xe

if [[ $# -ne 2 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION are required"
echo "TEST_CASE TEST_CASE_ARGS are required"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}


TEST_DIR="$(pwd)/tests"

${TEST_DIR}/local.sh
${TEST_DIR}/local.sh "$@"
3 changes: 1 addition & 2 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,4 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
# are forwarded to the remote shell.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
${TEST_CASE}
"$@"
21 changes: 21 additions & 0 deletions tests/remote-exec-local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#! /bin/bash
# Syncs the project to the remote test instance and runs REMOTE_EXEC there.
#
# Usage: remote-exec-local.sh REMOTE_EXEC [ARGS...]
#   REMOTE_EXEC  script to run, given as a path relative to the project root
#   ARGS         forwarded unchanged on the remote command line
# Exits with 1 when REMOTE_EXEC is missing or does not exist locally.

# Validate explicitly: the exit status of a bare `test` call would not be
# checked here, so a bad invocation would only surface later on the remote.
if [[ $# -lt 1 || -z "${1}" ]]; then
    echo "Error: $0 must be called with REMOTE_EXEC [ARGS...]" >&2
    exit 1
fi
REMOTE_EXEC=${1}

# Honor PROJECT_DIR when the caller provides it; otherwise use the parent of
# the directory holding this script.
project_root="${PROJECT_DIR:-$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )}"
if [[ ! -f "${project_root}/${REMOTE_EXEC}" ]]; then
    echo "Error: ${REMOTE_EXEC} not found under ${project_root}" >&2
    exit 1
fi

export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"
source "${SCRIPT_DIR}/.local.sh"

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the specified script on the remote instance.
remote \
    PROJECT="${PROJECT}" \
    "$@"
8 changes: 4 additions & 4 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${TEST_NAMESPACE:="test-operator"}

: ${PRIVATE_REGISTRY:="ghcr.io"}

: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}

: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
4 changes: 4 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# Passes the given command line to remote.sh, prefixed with "cd ${PROJECT} && ".
# NOTE(review): "$@" sits between two quoted strings, so it is expanded
# unquoted: the arguments are word-split, the first word is glued onto the
# "cd ... && " prefix and the remaining words reach remote.sh as separate
# arguments. Callers appear to pre-escape their arguments (printf '%q') to
# survive this -- confirm before changing the quoting.
function remote() {
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
}

# Runs remote_retry.sh from ${SCRIPT_DIR} with no arguments and returns its
# exit status.
function remote_retry() {
${SCRIPT_DIR}/remote_retry.sh
}
4 changes: 0 additions & 4 deletions tests/scripts/.rsync-excludes

This file was deleted.

2 changes: 2 additions & 0 deletions tests/scripts/.rsync-includes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
tests/
tests/***
2 changes: 1 addition & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

Expand Down
Loading

0 comments on commit 5e152f0

Please sign in to comment.