Skip to content

Commit

Permalink
Pre end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Aug 22, 2024
1 parent f8c3a2b commit 3fe0f83
Show file tree
Hide file tree
Showing 15 changed files with 195 additions and 21 deletions.
69 changes: 62 additions & 7 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,21 @@ on:
types:
- completed
branches:
- main
- e2etestdriver

pull_request:
types:
- opened
- synchronize
branches:
# - main
# - release-*
- e2etestdriver
push:
branches:
# - main
# - release-*
- e2etestdriver

jobs:
e2e-tests-nvidiadriver:
Expand Down Expand Up @@ -52,33 +66,74 @@ jobs:
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
# SHIVA
# echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=5ba28fea" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
- name: Validate gpu driver
# - name: Validate gpu driver
# env:
# TEST_CASE: "./tests/cases/nvidia-driver.sh"
# USE_PRECOMPILED: "0"
# run: |
# sudo chmod 644 ${{ github.workspace }}/.cache/key
# echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
# rc=0
# for driver_version in ${DRIVER_VERSIONS}; do
# echo "Running e2e for DRIVER_VERSION=$driver_version"
# ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$?
# if [ $status -ne 0 ]; then
# echo "e2e validation failed for driver version $driver_version with status $status"
# rc=$status
# fi
# done
# source ./tests/scripts/.definitions.sh
# ./tests/scripts/pull.sh ${LOG_DIR} logs
# exit $rc

# - name: Archive test logs
# if: ${{ failure() }}
# uses: actions/upload-artifact@v4
# with:
# name: nvidiadriver-e2e-test-logs
# path: ./logs/
# retention-days: 15

- name: Precompiled e2e test- upgrade kernel and Validate gpu driver
env:
TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
export USE_PRECOMPILED="0"
./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
echo "Kernel upgrade failed"
rc=$status
else
# The kernel upgrade reboots the system; USE_PRECOMPILED=1 makes tests/local.sh wait (SSH retry) until it is reachable again
export USE_PRECOMPILED="1"
DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
DRIVER_VERSION="${DRIVER_BRANCH}"
./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
10 changes: 6 additions & 4 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ on:
- opened
- synchronize
branches:
- main
- release-*
# - main
# - release-*
- e2etestdriver_no
push:
branches:
- main
- release-*
# - main
# - release-*
- e2etestdriver_no

jobs:
image:
Expand Down
8 changes: 8 additions & 0 deletions tests/cases/nvidia-kernel-upgrade.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#! /bin/bash
# This test case performs the kernel upgrade step: it loads the shared test
# definitions and then runs nvidia-kernel-upgrade-aws.sh from the scripts
# directory. It does not install or verify the operator itself.

# Resolve the absolute path of ../scripts relative to this file's location.
SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
# Load the shared variable defaults used by the test scripts.
source "${SCRIPTS_DIR}"/.definitions.sh

# Run the kernel upgrade script
"${SCRIPTS_DIR}"/nvidia-kernel-upgrade-aws.sh
6 changes: 3 additions & 3 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

set -xe

if [[ $# -ne 2 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION are required"
if [[ $# -ne 3 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}

export USE_PRECOMPILED=${3}

TEST_DIR="$(pwd)/tests"

Expand Down
5 changes: 5 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
remote_retry
fi

# Sync the project folder to the remote
${SCRIPT_DIR}/push.sh

Expand All @@ -24,4 +28,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
USE_PRECOMPILED="${USE_PRECOMPILED}" \
${TEST_CASE}
6 changes: 6 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,9 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}

: ${USE_PRECOMPILED:="0"}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}

4 changes: 4 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# Hands a command line to remote.sh, prefixed with `cd ${PROJECT} && ` so it
# runs from the project directory (presumably on the remote test system --
# confirm against remote.sh).
# NOTE(review): `$@` sits between two quoted strings rather than inside them,
# so only the first argument is glued to the `cd ... && ` prefix and the
# remaining arguments are passed on as separate, word-split words.
function remote() {
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
}

# Runs remote_retry.sh from the scripts directory with no arguments; the
# function's exit status is that script's exit status.
function remote_retry() {
${SCRIPT_DIR}/remote_retry.sh
}
2 changes: 1 addition & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

Expand Down
18 changes: 18 additions & 0 deletions tests/scripts/findkernelversion.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Fetches /var/kernel_version.txt out of the driver base image
#   ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}
# and stores it as ${LOG_DIR}/kernel_version.txt.
#
# Inputs (environment):
#   SKIP_INSTALL           - when "true", the script does nothing and exits 0.
#   TARGET_DRIVER_VERSION  - driver version; only the part before the first
#                            '.' (the branch) is used in the image tag.
#   BASE_TARGET, LOG_DIR   - defaulted by .definitions.sh when unset.
#
# Exit status: 1 when regctl cannot be downloaded, 0 otherwise. A failed image
# lookup is reported on stderr but deliberately does not fail the script.

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

# Download a pinned regctl release into ./bin and put it first on PATH.
export REGCTL_VERSION=v0.4.7
mkdir -p bin
# -f makes curl fail on HTTP errors instead of saving the error page as the
# "binary", which would then be marked executable and run below.
if ! curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"; then
    echo "Failed to download regctl ${REGCTL_VERSION}" >&2
    exit 1
fi
chmod a+x bin/regctl
export PATH="$(pwd)/bin:${PATH}"

# Driver branch = text before the first '.' of TARGET_DRIVER_VERSION.
DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' -f 1)
# Kernel flavor = third '-'-separated field of `uname -r`.
KERNEL_FLAVOR=$(uname -r | awk -F'-' '{print $3}')
DRIVER_BASE_IMAGE="ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}"

# Best effort, as before: a failed lookup does not change the exit status,
# but it is now visible in the log instead of being silently discarded.
regctl image get-file "${DRIVER_BASE_IMAGE}" /var/kernel_version.txt "${LOG_DIR}/kernel_version.txt" \
    || echo "WARNING: could not fetch /var/kernel_version.txt from ${DRIVER_BASE_IMAGE}" >&2
8 changes: 8 additions & 0 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,17 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
exit 0
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true"
fi

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"

# add helm driver repo
Expand Down
42 changes: 42 additions & 0 deletions tests/scripts/nvidia-kernel-upgrade-aws.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Installs the kernel named by ${LOG_DIR}/kernel_version.txt, makes it the
# default GRUB entry and reboots the machine.
#
# Steps:
#   1. Run findkernelversion.sh and source ${LOG_DIR}/kernel_version.txt,
#      which is expected to define KERNEL_VERSION.
#   2. apt-get install linux-image-${KERNEL_VERSION} (downgrades allowed).
#   3. Point GRUB_DEFAULT in /etc/default/grub at that kernel, run update-grub.
#   4. Issue a reboot in the background and exit 0.
#
# Inputs (environment):
#   SKIP_INSTALL - when "true", the script does nothing and exits 0.
#   LOG_DIR      - defaulted by .definitions.sh when unset.
#
# Exit status: 1 when the kernel version cannot be determined or the package
# installation fails, 0 otherwise.

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

# finding kernel version
"${SCRIPT_DIR}"/findkernelversion.sh

# Fail early with a clear message instead of sourcing a missing file and then
# asking apt for the nonsensical package name "linux-image-".
KERNEL_VERSION_FILE="${LOG_DIR}/kernel_version.txt"
if [[ ! -s "${KERNEL_VERSION_FILE}" ]]; then
    echo "Kernel version file ${KERNEL_VERSION_FILE} is missing or empty." >&2
    exit 1
fi
source "${KERNEL_VERSION_FILE}"
if [[ -z "${KERNEL_VERSION}" ]]; then
    echo "KERNEL_VERSION is not set by ${KERNEL_VERSION_FILE}." >&2
    exit 1
fi

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"

echo ""
echo ""
echo "--------------Starting the Precompiled kernel version ${KERNEL_VERSION} upgrade--------------"

sudo apt-get update -y
if ! sudo apt-get install --allow-downgrades "linux-image-${KERNEL_VERSION}" -y; then
    echo "Kernel upgrade failed."
    exit 1
fi

echo "update grub ..."
# Select the freshly installed kernel from the "Advanced options" submenu.
sudo sed -i "s/^GRUB_DEFAULT=.*/GRUB_DEFAULT=\"Advanced options for Ubuntu>Ubuntu, with Linux ${KERNEL_VERSION}\"/" /etc/default/grub
# Print the resulting setting so it shows up in the test log.
sudo grep "GRUB_DEFAULT" /etc/default/grub
sudo update-grub

echo "Rebooting ..."
# Run the reboot command with nohup to avoid abrupt SSH closure issues
nohup sudo reboot &

echo "--------------Installation of kernel completed --------------"

# The reboot runs in the background and its result is not checked here;
# report success for the installation steps completed above.
exit 0
3 changes: 3 additions & 0 deletions tests/scripts/prerequisites.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ fi
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

echo "Create log dir ${LOG_DIR}"
mkdir -p "${LOG_DIR}"

export DEBIAN_FRONTEND=noninteractive

echo "Load kernel modules i2c_core and ipmi_msghandler"
Expand Down
27 changes: 27 additions & 0 deletions tests/scripts/remote_retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#
# Waits until the remote system accepts SSH connections again after a reboot.
#
# Every 60 seconds an SSH connection is attempted. The script exits 0 on the
# first successful connection and exits 1 once SYSTEM_ONLINE_CHECK_TIMEOUT
# seconds have elapsed without one.
#
# Inputs (environment):
#   private_key                  - path of the SSH identity file.
#   instance_hostname            - SSH destination (user@host).
#   SYSTEM_ONLINE_CHECK_TIMEOUT  - overall timeout in seconds; defaulted by
#                                  .definitions.sh when unset.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

# Attempts one SSH login that immediately exits; returns ssh's exit status.
try_ssh_connection() {
    ssh -o ConnectTimeout=10 -i "${private_key}" "${instance_hostname}" "exit"
}

echo "Waiting for aws system to come back online..."
START_TIME=$(date +%s)
while true; do
    sleep 60 # sleep before the first attempt, as the system was restarted earlier
    if try_ssh_connection; then
        echo "Successfully connected to aws system after reboot."
        break
    fi
    # Both values are in seconds (date +%s), so the message reports seconds.
    ELAPSED_TIME=$(($(date +%s) - START_TIME))
    if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
        echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
        exit 1
    fi
    echo "ssh retry again..."
done
7 changes: 1 addition & 6 deletions vgpu/src/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,4 @@ require (
gopkg.in/yaml.v2 v2.4.0
)

require (
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/sys v0.8.0 // indirect
)
require golang.org/x/sys v0.8.0 // indirect
1 change: 1 addition & 0 deletions vgpu/src/go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down

0 comments on commit 3fe0f83

Please sign in to comment.