Skip to content

Commit

Permalink
ci: Restore rosetta-t5x unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Apr 22, 2024
1 parent 246f8b6 commit 99db46d
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 119 deletions.
97 changes: 0 additions & 97 deletions .github/workflows/_test_rosetta.yaml

This file was deleted.

131 changes: 109 additions & 22 deletions .github/workflows/_test_rosetta_t5x.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,33 @@ on:
T5X_IMAGE:
type: string
description: T5X image from ghcr.io/nvidia/t5x
default: 'ghcr.io/nvidia/t5x:latest'
default: "ghcr.io/nvidia/t5x:latest"
required: false
BADGE_FILENAME:
type: string
description: 'Name of the endpoint JSON file for shields.io badge'
description: "Name of the endpoint JSON file for shields.io badge"
required: false
default: 'badge-rosetta-t5x-mgmn-test.json'
default: "badge-rosetta-t5x-mgmn-test.json"
ARTIFACT_NAME:
type: string
description: 'Name of the artifact zip file'
description: "Name of the artifact zip file"
required: false
default: 'artifact-rosetta-t5x-mgmn-test'
default: "artifact-rosetta-t5x-mgmn-test"
FW_NAME:
type: string
description: 'Name of the framework being used'
description: "Name of the framework being used"
required: false
default: 'rosetta-t5x'
default: "rosetta-t5x"
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
description: "Summary pass/fail value indicating if results from tests are acceptable"
value: ${{ jobs.sitrep.outputs.STATUS }}

env:
BATCH_SIZE_PER_GPU: 32
VIT_BATCH_SIZE_PER_GPU: 256

jobs:

single-process-multi-device:
strategy:
matrix:
Expand Down Expand Up @@ -63,10 +62,10 @@ jobs:
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

- name: Check out the repository under ${GITHUB_WORKSPACE}
uses: actions/checkout@v4

- name: Setup SSH known hosts
id: ssh-known-hosts
run: |
Expand Down Expand Up @@ -182,7 +181,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF

- name: Generate sitrep
if: success() || failure()
shell: bash -x -e {0}
Expand All @@ -196,7 +195,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand Down Expand Up @@ -402,7 +401,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand All @@ -429,7 +428,7 @@ jobs:
color="${badge_color}" \
to_json schemaVersion label message color \
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json

- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
Expand Down Expand Up @@ -571,7 +570,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand Down Expand Up @@ -744,7 +743,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand All @@ -771,15 +770,21 @@ jobs:
color="${badge_color}" \
to_json schemaVersion label message color \
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json

- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ steps.meta.outputs.JOB_NAME }}
path: output/*

metrics:
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
needs:
[
multi-gpu-multi-node,
single-process-multi-device,
vit-single-process-multi-device,
vit-multi-gpu-multi-node,
]
runs-on: ubuntu-22.04

steps:
Expand Down Expand Up @@ -810,7 +815,7 @@ jobs:
path: |
report.jsonl
*_metrics.json

sitrep:
needs: metrics
if: "!cancelled()"
Expand All @@ -820,10 +825,16 @@ jobs:
BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
FW_NAME: ${{ inputs.FW_NAME }}

summary:
runs-on: ubuntu-22.04
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
needs:
[
multi-gpu-multi-node,
single-process-multi-device,
vit-single-process-multi-device,
vit-multi-gpu-multi-node,
]
if: "!cancelled()"
steps:
- name: Generate TensorBoard query URL
Expand All @@ -848,3 +859,79 @@ jobs:
if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
exit 1
fi

# Runs the Rosetta unit tests inside the T5X container image on a self-hosted
# V100 runner and uploads the pytest report log (JSON lines) as an artifact.
unit-tests:
  runs-on: [self-hosted, V100]
  env:
    TEST_ARTIFACT_NAME: rosetta-test-logs
    TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
  # Expose the artifact name so that `publish-test` can locate the report log
  # through `needs.unit-tests.outputs.TEST_ARTIFACT_NAME`.
  outputs:
    TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
  steps:
    - name: Print environment variables
      run: |
        env

    - name: Print GPU information
      run: nvidia-smi

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Pull Rosetta image
      shell: bash -x -e {0}
      run: |
        docker pull ${{ inputs.T5X_IMAGE }}
        docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest

    # Custom shell: the generated step script ({0}) is bind-mounted into the
    # container as /cmd.sh and executed there; /log is shared with the host so
    # the report written by pytest is visible to the upload step below.
    - name: Run Rosetta tests w/ docker
      shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
      run: |
        ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
        pip install "${ROSETTA_PATH}[test]" pytest-reportlog
        # `|| true` is deliberate: test failures must not abort the job before
        # the report is uploaded; pass/fail is derived from the report in
        # the `publish-test` job.
        pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true

    - name: Upload unit test json logs
      uses: actions/upload-artifact@v4
      with:
        name: ${{ env.TEST_ARTIFACT_NAME }}
        path: ${{ env.TEST_LOG_LOCAL_PATH }}

# Summarises the pytest report produced by `unit-tests` into badge fields
# (STATUS, LABEL, MESSAGE, COLOR) via the reusable badge workflow.
publish-test:
  needs: unit-tests
  uses: ./.github/workflows/_publish_badge.yaml
  if: ( always() )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: "rosetta-unit-test-status.json"
    PUBLISH: false
    # NOTE(review): assumes the badge workflow downloads each artifact into a
    # directory named after the artifact - confirm against _publish_badge.yaml.
    # The job id in `needs.<job>` must match the `unit-tests` job above;
    # referencing a non-existent job silently evaluates to an empty string.
    SCRIPT: |
      ARTIFACTS="${{ needs.unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
      all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
        echo "STATUS=success" >> $GITHUB_OUTPUT
      else
        echo "STATUS=failure" >> $GITHUB_OUTPUT
        if [[ $PASSED_TESTS -eq 0 ]]; then
          BADGE_COLOR=red
        else
          BADGE_COLOR=yellow
        fi
      fi
      echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT

0 comments on commit 99db46d

Please sign in to comment.