From 99db46d550b54bfddbeb419acc499ee5f449e498 Mon Sep 17 00:00:00 2001
From: Oliver Koenig
Date: Mon, 22 Apr 2024 15:27:48 +0200
Subject: [PATCH] ci: Restore rosetta-t5x unit tests

---
 .github/workflows/_test_rosetta.yaml     |  97 -----------------
 .github/workflows/_test_rosetta_t5x.yaml | 135 +++++++++++++++++++----
 2 files changed, 113 insertions(+), 119 deletions(-)
 delete mode 100644 .github/workflows/_test_rosetta.yaml

diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml
deleted file mode 100644
index 017662ea3..000000000
--- a/.github/workflows/_test_rosetta.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-name: ~test Rosetta
-
-on:
-  workflow_call:
-    inputs:
-      ROSETTA_IMAGE:
-        type: string
-        description: 'Rosetta image build by NVIDIA/JAX-Toolbox'
-        required: true
-        default: 'ghcr.io/nvidia/t5x:latest'
-    outputs:
-      TEST_ARTIFACT_NAME:
-        description: 'Name of the unit test artifact for downstream workflows'
-        value: ${{ jobs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }}
-      TEST_STATUS:
-        description: 'Summary pass/fail value indicating if results from tests are acceptable'
-        value: ${{ jobs.publish-test.outputs.STATUS }}
-
-env:
-  TEST_ARTIFACT_NAME: rosetta-test-logs
-  TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
-
-jobs:
-  rosetta-unit-tests:
-    runs-on: [self-hosted, V100]
-    outputs:
-      TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
-    steps:
-      - name: Print environment variables
-        run: |
-          env
-
-      - name: Print GPU information
-        run: nvidia-smi
-
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Pull Rosetta image
-        shell: bash -x -e {0}
-        run: |
-          docker pull ${{ inputs.ROSETTA_IMAGE }}
-          docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest
-
-      - name: Run Rosetta tests w/ docker
-        shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
-        run: |
-          ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
-          pip install "${ROSETTA_PATH}[test]" pytest-reportlog
-          pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true
-
-      - name: Upload unit test json logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.TEST_ARTIFACT_NAME }}
-          path: ${{ env.TEST_LOG_LOCAL_PATH }}
-
-  publish-test:
-    needs: rosetta-unit-tests
-    uses: ./.github/workflows/_publish_badge.yaml
-    if: ( always() )
-    secrets: inherit
-    with:
-      ENDPOINT_FILENAME: 'rosetta-unit-test-status.json'
-      PUBLISH: false
-      SCRIPT: |
-        ARTIFACTS="${{ needs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
-        all_outcomes() {
-          cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
-        }
-        cnt_type() {
-          cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
-        }
-        SKIPPED_TESTS=$(cnt_type skipped)
-        FAILED_TESTS=$(cnt_type failed)
-        PASSED_TESTS=$(cnt_type passed)
-        TOTAL_TESTS=$(all_outcomes | wc -l)
-        echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
-        all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
-        if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
-          BADGE_COLOR=brightgreen
-          echo "STATUS=success" >> $GITHUB_OUTPUT
-        else
-          echo "STATUS=failure" >> $GITHUB_OUTPUT
-          if [[ $PASSED_TESTS -eq 0 ]]; then
-            BADGE_COLOR=red
-          else
-            BADGE_COLOR=yellow
-          fi
-        fi
-        echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
-        echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
-        echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
diff --git a/.github/workflows/_test_rosetta_t5x.yaml b/.github/workflows/_test_rosetta_t5x.yaml
index 7bf6cc150..f6f43d8d2 100644
--- a/.github/workflows/_test_rosetta_t5x.yaml
+++ b/.github/workflows/_test_rosetta_t5x.yaml
@@ -6,26 +6,26 @@ on:
       T5X_IMAGE:
         type: string
         description: T5X image from ghcr.io/nvidia/t5x
-        default: 'ghcr.io/nvidia/t5x:latest'
+        default: "ghcr.io/nvidia/t5x:latest"
         required: false
       BADGE_FILENAME:
         type: string
-        description: 'Name of the endpoint JSON file for shields.io badge'
+        description: "Name of the endpoint JSON file for shields.io badge"
         required: false
-        default: 'badge-rosetta-t5x-mgmn-test.json'
+        default: "badge-rosetta-t5x-mgmn-test.json"
       ARTIFACT_NAME:
         type: string
-        description: 'Name of the artifact zip file'
+        description: "Name of the artifact zip file"
        required: false
-        default: 'artifact-rosetta-t5x-mgmn-test'
+        default: "artifact-rosetta-t5x-mgmn-test"
      FW_NAME:
        type: string
-        description: 'Name of the framework being used'
+        description: "Name of the framework being used"
        required: false
-        default: 'rosetta-t5x'
+        default: "rosetta-t5x"
     outputs:
       TEST_STATUS:
-        description: 'Summary pass/fail value indicating if results from tests are acceptable'
+        description: "Summary pass/fail value indicating if results from tests are acceptable"
         value: ${{ jobs.sitrep.outputs.STATUS }}
 
 env:
@@ -33,7 +33,6 @@ env:
   VIT_BATCH_SIZE_PER_GPU: 256
 
 jobs:
-
   single-process-multi-device:
     strategy:
       matrix:
@@ -63,10 +62,10 @@
         uses: webfactory/ssh-agent@v0.9.0
         with:
           ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-      
+
       - name: Check out the repository under ${GITHUB_WORKSPACE}
         uses: actions/checkout@v4
-      
+
       - name: Setup SSH known hosts
         id: ssh-known-hosts
         run: |
@@ -182,7 +181,7 @@
           dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
          json.dump(dump, f)
          EOF
-          
+
       - name: Generate sitrep
         if: success() || failure()
         shell: bash -x -e {0}
@@ -196,7 +195,7 @@
          passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
          if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
@@ -402,7 +401,7 @@
          passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
          if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
@@ -429,7 +428,7 @@
            color="${badge_color}" \
            to_json schemaVersion label message color \
            > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
-          
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v4
         with:
@@ -571,7 +570,7 @@
          passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
          if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
@@ -744,7 +743,7 @@
          passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
          failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
          total_tests=$(ls $EXIT_STATUSES | wc -l)
-          
+
          if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
@@ -771,7 +770,7 @@
            color="${badge_color}" \
            to_json schemaVersion label message color \
            > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
-          
+
       - name: Upload training logs as artifacts
         uses: actions/upload-artifact@v4
         with:
@@ -779,7 +778,13 @@
         path: output/*
 
   metrics:
-    needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
+    needs:
+      [
+        multi-gpu-multi-node,
+        single-process-multi-device,
+        vit-single-process-multi-device,
+        vit-multi-gpu-multi-node,
+      ]
     runs-on: ubuntu-22.04
 
     steps:
@@ -810,7 +815,7 @@
           path: |
             report.jsonl
             *_metrics.json
-    
+
   sitrep:
     needs: metrics
     if: "!cancelled()"
@@ -820,10 +825,16 @@
       BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
       ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
       FW_NAME: ${{ inputs.FW_NAME }}
-    
+
   summary:
     runs-on: ubuntu-22.04
-    needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
+    needs:
+      [
+        multi-gpu-multi-node,
+        single-process-multi-device,
+        vit-single-process-multi-device,
+        vit-multi-gpu-multi-node,
+      ]
     if: "!cancelled()"
     steps:
       - name: Generate TensorBoard query URL
@@ -848,3 +859,83 @@
           if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
             exit 1
           fi
+
+  unit-tests:
+    runs-on: [self-hosted, V100]
+    env:
+      TEST_ARTIFACT_NAME: rosetta-test-logs
+      TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
+    outputs:
+      TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
+    steps:
+      - name: Print environment variables
+        run: |
+          env
+
+      - name: Print GPU information
+        run: nvidia-smi
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Pull Rosetta image
+        shell: bash -x -e {0}
+        run: |
+          docker pull ${{ inputs.T5X_IMAGE }}
+          docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest
+
+      - name: Run Rosetta tests w/ docker
+        # Custom shell: the generated step script ({0}) is mounted into the container as /cmd.sh and executed there
+        shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
+        run: |
+          ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
+          pip install "${ROSETTA_PATH}[test]" pytest-reportlog
+          # Mask pytest's exit code; publish-test derives pass/fail from the JSONL report instead
+          pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true
+
+      - name: Upload unit test json logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.TEST_ARTIFACT_NAME }}
+          path: ${{ env.TEST_LOG_LOCAL_PATH }}
+
+  publish-test:
+    needs: unit-tests
+    uses: ./.github/workflows/_publish_badge.yaml
+    if: ( always() )
+    secrets: inherit
+    with:
+      ENDPOINT_FILENAME: "rosetta-unit-test-status.json"
+      PUBLISH: false
+      SCRIPT: |
+        ARTIFACTS="${{ needs.unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
+        all_outcomes() {
+          cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
+        }
+        cnt_type() {
+          cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
+        }
+        SKIPPED_TESTS=$(cnt_type skipped)
+        FAILED_TESTS=$(cnt_type failed)
+        PASSED_TESTS=$(cnt_type passed)
+        TOTAL_TESTS=$(all_outcomes | wc -l)
+        echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
+        all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
+        if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
+          BADGE_COLOR=brightgreen
+          echo "STATUS=success" >> $GITHUB_OUTPUT
+        else
+          echo "STATUS=failure" >> $GITHUB_OUTPUT
+          if [[ $PASSED_TESTS -eq 0 ]]; then
+            BADGE_COLOR=red
+          else
+            BADGE_COLOR=yellow
+          fi
+        fi
+        echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
+        echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
+        echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
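-- 
A note on the report format consumed by publish-test's SCRIPT (a sketch; the
nodeid shown is hypothetical): pytest-reportlog writes one JSON object per
line, and a test's "call" phase produces a record of the form

  {"$report_type": "TestReport", "nodeid": "tests/test_example.py::test_ok", "when": "call", "outcome": "passed"}

all_outcomes prints the outcome of every such record, while cnt_type counts
the subset whose outcome contains a given string, so `cnt_type failed` tallies
failures and the badge message reports pass/skip/fail totals.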