Merge pull request #88 from shivakunv/enhancegpuvalidation
End-to-end GPU driver testing enhancement
cdesiniotis committed Aug 20, 2024
2 parents 600b7bf + c6f8865 commit f8c3a2b
Showing 8 changed files with 87 additions and 43 deletions.
31 changes: 23 additions & 8 deletions .github/workflows/ci.yaml
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI
name: End-to-end tests

on:
workflow_run:
@@ -25,11 +25,6 @@ on:
jobs:
e2e-tests-nvidiadriver:
runs-on: ubuntu-latest
strategy:
matrix:
driver:
- 535.183.06
- 550.90.07

steps:
- name: Check out code
@@ -41,7 +36,6 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,11 +53,32 @@ jobs:
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
run: |
sudo chmod 644 ${{ github.workspace }}/.cache/key
echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
fi
done
source ./tests/scripts/.definitions.sh
./tests/scripts/pull.sh ${LOG_DIR} logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15
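For context, the loop above no longer relies on the hard-coded workflow matrix; it derives the driver list from versions.mk at runtime. A minimal sketch of the entry the grep/awk pipeline expects, assuming the same versions that appeared in the old matrix (the actual versions.mk is not part of this diff):

    # versions.mk (hypothetical excerpt parsed by the CI step)
    DRIVER_VERSIONS ?= 535.183.06 550.90.07

The step writes the space-separated list into $GITHUB_ENV, and the validation step then runs ci-run-e2e.sh once per version, keeping the worst exit status so a single failing driver fails the job without skipping the remaining versions.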
5 changes: 5 additions & 0 deletions tests/scripts/.definitions.sh
@@ -19,3 +19,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}
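These new entries use the shell's ${VAR:=default} expansion, so the timeouts and log directory can be overridden from the environment without editing the script; a hypothetical invocation:

    # Allow the driver daemonset 30 minutes and redirect logs for a single run (illustrative values)
    DAEMON_POD_STATUS_TIME_OUT=30m LOG_DIR=/tmp/ci-logs ./tests/scripts/end-to-end-nvidia-driver.sh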
39 changes: 12 additions & 27 deletions tests/scripts/checks.sh
@@ -2,35 +2,20 @@

check_pod_ready() {
local pod_label=$1
local current_time=0
while :; do
echo "Checking $pod_label pod"
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
local pod_status_time_out=$2

echo "Checking $pod_label pod"

kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
echo "Checking $pod_label pod readiness"

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
break;
fi
fi

if [[ "${current_time}" -gt $((60 * 45)) ]]; then
echo "timeout reached"
exit 1;
fi

# Echo useful information on stdout
if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then
return 0
else
# print status of pod
kubectl get pods -n ${TEST_NAMESPACE}
fi

echo "Sleeping 5 seconds"
current_time=$((${current_time} + 5))
sleep 5
done
return 1
}
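With the polling loop replaced by a single kubectl wait, callers now pass the timeout as a second argument, and the function returns 0 when the pod reports Ready and 1 otherwise. A minimal usage sketch, assuming TEST_NAMESPACE and the timeout defaults come from .definitions.sh:

    source ./tests/scripts/.definitions.sh
    source ./tests/scripts/checks.sh
    # Wait up to DAEMON_POD_STATUS_TIME_OUT (default 15m) for the driver pods to report Ready
    check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" || echo "driver pods not Ready in time"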
6 changes: 5 additions & 1 deletion tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,8 +7,12 @@ echo ""
echo ""
echo "--------------Installing the GPU Operator--------------"

# Install the operator with usePrecompiled mode set to true
${SCRIPT_DIR}/install-operator.sh

"${SCRIPT_DIR}"/verify-operator.sh

echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"

${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"

echo "--------------Verification completed for GPU Operator--------------"
12 changes: 12 additions & 0 deletions tests/scripts/pull.sh
@@ -0,0 +1,12 @@
#!/bin/bash

if [[ $# -ne 2 ]]; then
echo "Pull requires a source and destination"
exit 1
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2}
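pull.sh is a thin wrapper over sync.sh for copying files off the remote test instance; the CI workflow above calls it as ./tests/scripts/pull.sh ${LOG_DIR} logs. An equivalent standalone sketch, assuming .local.sh provides instance_hostname:

    # Copy the remote /tmp/logs directory (the default LOG_DIR) into a local ./logs directory
    ./tests/scripts/pull.sh /tmp/logs logs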
3 changes: 2 additions & 1 deletion tests/scripts/remote.sh
@@ -4,4 +4,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

ssh -i ${private_key} ${instance_hostname} "${@}"
# keep alive 60sec and timeout after 30 tries
ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}"
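The added keep-alive options let long-running remote commands survive otherwise idle SSH sessions. A usage sketch (the command string is illustrative):

    # Run an arbitrary command on the test instance over the hardened SSH session
    ./tests/scripts/remote.sh "kubectl get pods -A"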
14 changes: 14 additions & 0 deletions tests/scripts/uninstall-operator.sh
@@ -0,0 +1,14 @@
#!/bin/bash

if [[ "${SKIP_INSTALL}" == "true" ]]; then
echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

namespace=$1
release_name=$2
helm uninstall $release_name --namespace $namespace || true
kubectl delete namespace $namespace || true
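Both helm uninstall and the namespace deletion are suffixed with || true so cleanup never fails the calling script. verify-operator.sh below calls this as ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"; a standalone sketch, assuming TEST_NAMESPACE is set by .definitions.sh:

    # Tear down the gpu-operator release and its namespace, tolerating already-deleted resources
    ./tests/scripts/uninstall-operator.sh "${TEST_NAMESPACE}" "gpu-operator"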
20 changes: 14 additions & 6 deletions tests/scripts/verify-operator.sh
@@ -11,9 +11,17 @@ source ${SCRIPT_DIR}/.definitions.sh
# Import the check definitions
source ${SCRIPT_DIR}/checks.sh

check_pod_ready "nvidia-driver-daemonset"
check_pod_ready "nvidia-container-toolkit-daemonset"
check_pod_ready "nvidia-device-plugin-daemonset"
check_pod_ready "nvidia-dcgm-exporter"
check_pod_ready "gpu-feature-discovery"
check_pod_ready "nvidia-operator-validator"
# wait for the nvidia-driver pod to be ready
# If successful, then wait for the validator pod to be ready (this means that the rest of the pods are healthy)
# collect log in case of failure
check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \
check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT}; exit_status=$?
if [ $exit_status -ne 0 ]; then
curl -o ${SCRIPT_DIR}/must-gather.sh "https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh"
chmod +x ${SCRIPT_DIR}/must-gather.sh
ARTIFACT_DIR="${LOG_DIR}" ${SCRIPT_DIR}/must-gather.sh
${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
exit 1
else
echo "All gpu-operator pods are ready."
fi
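Note that the two check_pod_ready calls are chained with &&, so the validator check only runs once the driver daemonset is Ready, and exit_status captures the status of whichever check ran last; any failure triggers the must-gather collection into ${LOG_DIR} followed by a cleanup uninstall before the script exits non-zero.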
