diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
deleted file mode 100644
index c0aedbc1524ef..0000000000000
--- a/.github/workflows/_test_template.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-name: ~test template
-
-on:
-  workflow_call:
-    inputs:
-      RUNNER:
-        type: string
-        description: Runner to use for test
-        required: true
-      TIMEOUT:
-        type: number
-        description: Max runtime of test in minutes
-        required: false
-        default: 10
-      SCRIPT:
-        type: string
-        description: Test script to execute
-        required: true
-      AFTER_SCRIPT:
-        type: string
-        description: Script to run after main test
-        required: false
-        default: ":"
-      IS_OPTIONAL:
-        type: boolean
-        description: Failure will cancel all other tests if set to true
-        required: false
-        default: false
-    outputs:
-      conclusion:
-        description: Conclusion of main test step
-        value: ${{ jobs.main.outputs.conclusion }}
-      log:
-        description: Last 2000 characters of the test step's log
-        value: ${{ jobs.main.outputs.log }}
-jobs:
-
-  main:
-    runs-on: ${{ inputs.RUNNER }}
-    outputs:
-      conclusion: ${{ steps.main.conclusion }}
-      log: ${{ steps.main.outputs.log }}
-    steps:
-      - name: Docker system cleanup
-        run: |
-          docker system prune -a --filter "until=48h" --force || true
-
-      - name: Docker pull image
-        run: |
-          docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-
-      - name: Start container
-        run: |
-          docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
-
-      - id: main
-        name: Run main script
-        timeout-minutes: ${{ inputs.TIMEOUT }}
-        run: |
-          mkdir -p ${{ github.run_id }}
-          cd ${{ github.run_id }}/
-          set +e
-          (
-            set -e
-
-            docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
-          ) 2> >(tee err.log)
-
-          EXIT_CODE=$?
-
-          echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT"
-
-          exit $EXIT_CODE
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: failure() && inputs.IS_OPTIONAL == false
-      - name: after_script
-        if: always() && inputs.AFTER_SCRIPT != ':'
-        run: |
-          docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
-
-      - name: Container shutdown
-        if: always()
-        run: |
-          docker container stop nemo_container_${{ github.run_id }} || true
-          docker container rm nemo_container_${{ github.run_id }} || true
\ No newline at end of file
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
deleted file mode 100644
index 871f58ed7bf28..0000000000000
--- a/.github/workflows/cicd-main.yml
+++ /dev/null
@@ -1,5465 +0,0 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
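Every test job in the cicd-main.yml hunks below invokes the deleted template above via uses:. As an orientation aid (not part of the diff), a minimal caller would look like the following sketch; the job name and pytest path are hypothetical, while the input names are exactly the workflow_call inputs declared in the template:

  Example_Unit_Test:                      # hypothetical job name
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure           # forwarded to runs-on in the template
      TIMEOUT: 15                         # minutes; also sizes the container's sleep window
      SCRIPT: |
        pytest tests/some_suite -m "not pleasefixme"
      AFTER_SCRIPT: |
        rm -rf /tmp/some_suite_results
      IS_OPTIONAL: false                  # when false, a failure triggers the cancel-workflow action

The log output exposed by the template carries the last 2000 characters of the main step's stderr, base64-encoded, so a caller can surface it in downstream reporting without re-running the test.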
-name: "CICD NeMo" -on: - pull_request: - branches: - - 'main' - - 'r**' - types: [ labeled ] - - workflow_dispatch: - inputs: - test_to_run: - required: false - default: all - type: string - description: Comma-separated list of tests to run. Use "all" to run the full test suite. - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - pre-flight: - runs-on: ubuntu-latest - outputs: - test_to_run: ${{ steps.test_to_run.outputs.main }} - all: ${{ steps.all.outputs.main }} - steps: - - name: Parse test_to_run - id: test_to_run - run: | - parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")') - echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT" - - name: Parse all - id: all - run: | - echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" - - gpu-test: - needs: [pre-flight] - runs-on: self-hosted-azure - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - steps: - - name: Run nvidia-smi test - run: | - whoami - nvidia-smi - - - cicd-cluster-clean: - runs-on: self-hosted-azure-builder - needs: [pre-flight] - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - steps: - - name: Clean server from old files - run: | - docker system prune --filter "until=24h" --filter "label=nemo.library=nemo-core" --force - - cicd-test-container-setup: - needs: [cicd-cluster-clean, pre-flight] - runs-on: self-hosted-azure-builder - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - outputs: - test_to_run: ${{ needs.pre-flight.outputs.test_to_run }} - all: ${{ needs.pre-flight.outputs.all }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - path: ${{ github.run_id }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - # We use `docker` driver as this speeds things up for - # trivial (non-multi-stage) builds. - driver: docker - - - name: Restore cache - run: | - docker pull nemoci.azurecr.io/nemo_container:latest - docker pull nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} || true - - - name: Build and push - uses: docker/build-push-action@v5 - with: - file: Dockerfile.ci - push: true - cache-from: | - nemoci.azurecr.io/nemo_container:latest - nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} - cache-to: type=inline - tags: | - nemoci.azurecr.io/nemo_container_${{ github.run_id }} - nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} - nemoci.azurecr.io/nemo_container:latest - - - name: Run some checks - run: | - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\ - # PyTorch Lightning version - python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" - - # PyTorch Lightning DDP Checks - CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" - - # Basic Import Checks - python -c "import nemo.collections.asr as nemo_asr" - python -c "import nemo.collections.nlp as nemo_nlp" - python -c "import nemo.collections.nlp as nemo_nlp; nemo_nlp.modules.get_tokenizer_list()" - python -c "import nemo.collections.tts as nemo_tts" - - python setup.py style - python tests/check_copyright_header.py --dir . 
- - # These checks are not crucial - exit 0 - ' - ### \'\' - - # L0: GPU unit tests - OPTIONAL_L0_Unit_Tests_GPU_ASR: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Audio: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_Common: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_LLM: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_Multimodal: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_NLP: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads - - OPTIONAL_L0_Unit_Tests_GPU_Core: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Hydra: - needs: 
[cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads - - OPTIONAL_L0_Unit_Tests_GPU_Lightning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Others: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \ - --ignore=tests/collections/asr \ - --ignore=tests/collections/audio \ - --ignore=tests/collections/common \ - --ignore=tests/collections/llm \ - --ignore=tests/collections/multimodal \ - --ignore=tests/collections/nlp \ - --ignore=tests/collections/tts \ - --ignore=tests/core \ - --ignore=tests/core_ptl \ - --ignore=tests/hydra \ - --ignore=tests/lightning \ - --ignore=tests/utils - - # L0: CPU unit tests - L0_Unit_Tests_CPU_ASR: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Audio: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Common: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_LLM: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - 
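All of the unit-test jobs in this block (and the L2 jobs that follow) are gated the same way: the pre-flight job turns the comma-separated test_to_run input into a JSON array with jq, and each job's if: condition checks membership with contains(fromJSON(...)), falling back to the "all" flag. A condensed sketch of that round trip, with an illustrative job name:

  # pre-flight step, restated from earlier in this file:
  - name: Parse test_to_run
    id: test_to_run
    run: |
      # "A,B" -> ["A","B"]; defaults to ["all"] when the input is empty
      parsed_string=$(echo "${{ inputs.test_to_run || 'all' }}" | jq -c --raw-input 'split(",")')
      echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

  # downstream selection used by every test job (Some_Test_Job is a placeholder):
  # if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Some_Test_Job') || needs.cicd-test-container-setup.outputs.all == 'true'

The CPU variants additionally export CUDA_VISIBLE_DEVICES="" inside SCRIPT so the same container image runs the suites without using the GPUs.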
L0_Unit_Tests_CPU_Multimodal: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_NLP: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Core: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Hydra: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Lightning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Others: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat \ - --ignore=tests/collections/asr \ - --ignore=tests/collections/audio \ - --ignore=tests/collections/common \ - --ignore=tests/collections/llm \ - --ignore=tests/collections/multimodal \ - --ignore=tests/collections/nlp \ - --ignore=tests/collections/tts 
\ - --ignore=tests/core \ - --ignore=tests/core_ptl \ - --ignore=tests/hydra \ - --ignore=tests/lightning \ - --ignore=tests/utils - - - L0_Setup_Test_Data_And_Models: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Setup_Test_Data_And_Models') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python -m tests.setup --save_dir /home/TestData/nlp - - # - name: L2: Multimodal Imagen Train - - # L2: Community LLM Checkpoints tests - L2_Community_LLM_Checkpoints_tests_Bert: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Bert') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_bert_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_ir/sbert/hf_model/bert-base-uncased \ - --output_path /tmp/nlp_megatron_ir_sbert/sbert.nemo - - L2_Community_LLM_Checkpoints_tests_Mamba2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Mamba2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \ - --output_path /tmp/nlp_megatron_mamba/converted_mamba.nemo \ - --precision=bf16 \ - --mamba_ssm_ngroups 1 - - L2_Community_LLM_Checkpoints_tests_Llama: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \ - --output_path=/tmp/nlp_megatron_llama/llama_ci.nemo \ - --precision=16 - - L2_Community_LLM_Checkpoints_tests_Llama3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \ - --output_path=/tmp/nlp_megatron_llama_llama3-ci-hf/llama3_ci.nemo \ - --precision=16 - - L2_Community_LLM_Checkpoints_tests_StarCoder: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_StarCoder') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - mkdir -p /tmp/nlp_megatron_gpt_starcoder-ci-hf/ - python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ 
- --output_path /tmp/nlp_megatron_gpt_starcoder-ci-hf/ - - L2_Community_LLM_Checkpoints_tests_Falcon: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Falcon') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /tmp/nlp_megatron_gpt_falcon-ci-hf/falcon_ci.nemo - - # L2: Community llava multimodal Checkpoints tests - L2_Community_vita_Checkpoints_tests_Llama3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_vita_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - mkdir /tmp/${{ github.run_id }} - export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH - CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \ - --in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \ - --mm-projector-ckpt-dir /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/mm_projector \ - --mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \ - --tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \ - --config-file vita_config.yaml \ - --out-file=/tmp/${{ github.run_id }}/llama3_ci.nemo \ - --model-type VITA \ - --conv-template llama_3 - - # this test is using a 7B model which is too large for GitHub CI - # replace the model in this test with a toy model or move the test - # to the nightly CI - # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - # --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - # --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - # rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_PTQ_Llama2_Export_Only: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_Export_Only') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.algorithm=null \ - export.save_path=/tmp/nlp_megatron_llama_export_only/ci_baseline - - L2_PTQ_Llama2_FP8: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=fp8 \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.inference_tensor_parallel=2 \ - export.sample_output=False \ - export.save_path=/tmp/nlp_megatron_llama_eo/ci_fp8.qnemo - - L2_PTQ_Llama2_INT8_SQ: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_INT8_SQ') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 15 - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int8_sq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.sample_output=False \ - export.save_path=/tmp/nlp_megatron_llama_eo/ci_int8_sq.qnemo - - # TODO: investigate int4_awq stuck issues and restore the test - #L2_PTQ_Llama2_INT4_AWQ: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - # model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - # model.tensor_model_parallel_size=1 \ - # trainer.devices=1 \ - # quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - # quantization.algorithm=int4_awq \ - # quantization.num_calib_size=8 \ - # inference.batch_size=2 \ - # export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo - # - # rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo - #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - # OPTIONAL_L2_QAT_Llama2_INT4: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \ - # quantization.algorithm=int4 \ - # quantization.num_calib_size=8 \ - # trainer.devices=1 \ - # trainer.num_nodes=1 \ - # trainer.max_steps=4 \ - # trainer.val_check_interval=4 \ - # +trainer.limit_val_batches=2 \ - # exp_manager.explicit_log_dir=llama2_qat_results \ - # model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - # model.tensor_model_parallel_size=1 \ - # model.pipeline_model_parallel_size=1 \ - # model.global_batch_size=2 \ - # 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - # model.data.train_ds.concat_sampling_probabilities=[1.0] \ - # model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] - - # rm -rf llama2_qat_results - - L2_Distill_Llama2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Distill_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_distillation.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - trainer.max_steps=5 \ - trainer.log_every_n_steps=5 \ - trainer.val_check_interval=5 \ - trainer.limit_val_batches=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=2 \ - model.pipeline_model_parallel_size=1 \ - model.micro_batch_size=1 \ - model.global_batch_size=4 \ - model.optim.name=distributed_fused_adam \ - model.optim.sched.warmup_steps=1 \ - model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - exp_manager.exp_dir=/tmp/megatron_llama_distill - - L2_Prune_Width_Llama2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Prune_Width_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_prune.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - prune.num_calib_size=8 \ - prune.ffn_hidden_size=192 \ - prune.num_attention_heads=2 \ - prune.num_query_groups=2 \ - prune.hidden_size=null \ - export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/ci_prune_width.nemo - - # L2: ASR dev run - ASR_dev_run_Speech_to_Text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" 
--config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_results - - ASR_dev_run_Speech_Pre-training_-_CitriNet: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_Pre-training_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_pre_training_results - - ASR_dev_run_Speech_To_Text_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_finetuning_results - - ASR_dev_run_Speech_To_Text_HF_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: |- - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - +model.train_ds.hf_data_cfg.trust_remote_code=True \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - +model.validation_ds.hf_data_cfg.trust_remote_code=True \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - 
model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_finetuning_results - - ASR_dev_run_Speech_to_Text_WPE_-_Conformer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_Conformer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_conformer_results - - # L2: ASR dev run - part two - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_squeezeformer_results - - L2_Speech_to_Text_EMA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_EMA') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - L2_Speech_to_Text_AED: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_AED') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_multitask/speech_to_text_aed.py \ - model.prompt_format=canary \ - model.model_defaults.asr_enc_hidden=256 \ - model.model_defaults.lm_dec_hidden=256 \ - 
model.encoder.n_layers=12 \ - model.transf_encoder.num_layers=0 \ - model.transf_decoder.config_dict.num_layers=12 \ - model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - model.train_ds.batch_duration=60 \ - model.train_ds.use_bucketing=false \ - model.train_ds.shuffle_buffer_size=100 \ - model.train_ds.num_workers=0 \ - +model.train_ds.text_field="answer" \ - +model.train_ds.lang_field="target_lang" \ - model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.validation_ds.text_field="answer" \ - +model.validation_ds.lang_field="target_lang" \ - model.validation_ds.num_workers=0 \ - model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.test_ds.text_field="answer" \ - +model.test_ds.lang_field="target_lang" \ - model.test_ds.num_workers=0 \ - spl_tokens.model_dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - model.tokenizer.langs.en.type=bpe \ - ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - ++model.tokenizer.langs.es.type=bpe \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_aed_results - - # L2: Speaker dev run - L2_Speaker_dev_run_Speaker_Recognition: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Recognition') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speaker_recognition_results - - L2_Speaker_dev_run_Speaker_Diarization: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speaker_diarization_results - - L2_Speaker_dev_run_Speech_to_Label: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speech_to_Label') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=/tmp/speech_to_label_results - - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=/tmp/speaker_diarization_asr_results - - L2_Speaker_dev_run_Clustering_Diarizer_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Clustering_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=/tmp/clustering_diarizer_results - - L2_Speaker_dev_run_Neural_Diarizer_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Neural_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - 
diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=/tmp/neural_diarizer_results - - L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=/tmp/test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60 - - # L2: ASR Multi-dataloader dev run - L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=/tmp/speech_to_label_results - - # L2: ASR Adapters - L2_ASR_Adapters_Linear_Adapters: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_Linear_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: 
self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_adapters_results - - L2_ASR_Adapters_RelPos_MHA_Adapters: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_RelPos_MHA_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_adapters_mha_results - - # L2: OOMptimizer - L2_Speech_Estimate_Duration_Bins: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Estimate_Duration_Bins') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - set -x - # 1D buckets [SSL, CTC] - python scripts/speech_recognition/estimate_duration_bins.py \ - /home/TestData/an4_dataset/an4_train.json \ - --buckets 5 - # 2D buckets [CTC, RNNT, TDT] / with tokenizer - python scripts/speech_recognition/estimate_duration_bins_2d.py \ - /home/TestData/an4_dataset/an4_train_lang.json \ - --tokenizer /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - --buckets 5 \ - --sub-buckets 2 - # TODO(pzelasko): Figure out how to quote the value in the test properly for CI to accept it... 
- # 2D buckets with prompt [AED/Canary, SpeechLM] / with aggregate tokenizer + prompt format - # python scripts/speech_recognition/estimate_duration_bins_2d.py \ - # /home/TestData/an4_dataset/an4_train_lang.json \ - # --tokenizer /home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32/tokenizer.model \ - # /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - # /home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - # --langs spl_tokens en es \ - # --prompt-format canary \ - # --prompt '[{"role":"user","slots":{"source_lang":"en","target_lang":"en","task":"asr","pnc":"yes"}}]' \ - # --buckets 5 \ - # --sub-buckets 2 - - # L2: OOMptimizer - L2_Speech_Batch_Size_OOMptimizer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - # 1D bucketing - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ - -m nemo.collections.asr.models.EncDecCTCModelBPE \ - -b "[5.0,10.0]" - # 2D bucketing - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ - -m nemo.collections.asr.models.EncDecCTCModelBPE \ - -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" - - # L2: OOMptimizer Canary (has a different batch schema) - L2_Speech_Batch_Size_OOMptimizer_Canary: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer_Canary') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_aed.yaml \ - -m nemo.collections.asr.models.EncDecMultiTaskModel \ - -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" - - # L2: Speech Transcription - L2_Speech_Transcription_Speech_to_Text_Transcribe: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Speech_to_Text_Transcribe') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="/tmp/stt_test_res.json" \ - amp=true - - # L2: Speech Transcription - L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Full_Manifest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10-canary-fields.json \ - output_filename=/tmp/preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium - AFTER_SCRIPT: | - rm -rf /tmp/preds.json transcribe.log - - 
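Many of the SCRIPT blocks in this file, including the transcription and finetuning jobs around here, lean on Hydra's command-line override grammar: key=value changes an existing config entry, +key=value adds one that the base config lacks, ++key=value adds or overrides it regardless, and ~key deletes it. A small, purely illustrative sketch (the script name and keys are placeholders, not taken from TestData):

  SCRIPT: |
    #   key=value    override an existing entry
    #   +key=value   add a key missing from the base config
    #   ++key=value  add or override, whichever applies
    #   ~key         delete an entry
    python some_script.py \
      trainer.devices=1 \
      +trainer.fast_dev_run=True \
      ++model.tokenizer.type=bpe \
      ~model.test_ds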
L2_Speech_Transcription_Canary_Transcribe_With_Prompt: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_With_Prompt') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10.json \ - output_filename=preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium \ - +prompt.source_lang="en" \ - +prompt.target_lang="en" \ - +prompt.task="asr" \ - +prompt.pnc="no" - AFTER_SCRIPT: | - rm -rf preds.json transcribe.log - - L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Audio_Dir') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - audio_dir=/home/TestData/asr/canary/dev-other-wav \ - output_filename=preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium - AFTER_SCRIPT: | - rm -rf preds.json - - - # L2: Transducer alignment - OPTIONAL_L2_Transducer_alignment_Running_pytest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads - IS_OPTIONAL: true - - # L2: Segmentation Tool - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/tmp/ctc_seg_en/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /tmp/ctc_seg_en/output${TIME}/verified_segments/nv_test_segments.txt; - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - 
--MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ - --DATA_DIR=/home/TestData/ctc_segmentation/ru \ - --OUTPUT_DIR=/tmp/ctc_seg_ru/output${TIME} \ - --LANGUAGE=ru \ - --ADDITIONAL_SPLIT_SYMBOLS=";" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ - -g /tmp/ctc_seg_ru/output${TIME}/verified_segments/ru_segments.txt; - - # L2: G2P Models - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=1 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text - - # TODO: pleasefixme @redoctopus - # - name: ByT5G2P training, evaluation and inference - # run: | - # cd examples/tts/g2p && \ - # TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ - # python g2p_train_and_evaluate.py \ - # train_manifest=/home/TestData/g2p/g2p.json \ - # validation_manifest=/home/TestData/g2p/g2p.json \ - # model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - # trainer.max_epochs=1 \ - # model.max_source_len=64 \ - # trainer.devices=1 \ - # do_training=True \ - # do_testing=True \ - # exp_manager.exp_dir=${OUTPUT_DIR_T5} \ - # +exp_manager.use_datetime_version=False\ - # +exp_manager.version=test && \ - # python g2p_inference.py \ - # pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ - # manifest_filepath=/home/TestData/g2p/g2p.json \ - # phoneme_field=text - # } - # } - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - 
exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json - - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd 
examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python 
token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - 
+do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed - L2_Pretraining_BERT_pretraining_from_Text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_pretraining_from_Text') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=/tmp/PretrainingBERTFromText; -# AFTER_SCRIPT: | -# rm -f /home/TestData/nlp/wikitext-2/*.pkl - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText - - L2_Pretraining_BERT_from_Preprocessed: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_from_Preprocessed') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - - - # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 - # is in the release container - # L2: NMT Attention is All You Need Training - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/nmt_results - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - 
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - # L2: NMT Attention is All You Need Inference - L2_NMT_Attention_is_All_You_Need_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de - - # L2: NMT Attention is All You Need Finetuning - L2_NMT_Attention_is_All_You_Need_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python 
enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=1 \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/nmt_finetune - - # L2: NMT Tarred Dataset Creation - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 - - L2_Megatron_NMT_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_NMT_Training_TP2') 
|| needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python 
examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # 
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # 
} - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - 
trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method="block" \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method="block" \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - 
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # 
--shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - 
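The commented-out muTransfer job above ends with an inline `python -c` snippet that compares the logged training loss curve against a stored reference CSV. Inline it is hard to read (and the nested double quotes around "A100" would likely not survive the surrounding shell quoting as written), so here is a minimal standalone sketch of the same check, using the experiment directory, the `reduced_train_loss` scalar tag, and the expected-curve CSV taken from that snippet; it is illustrative only and not part of the workflow file.

# check_retro_loss_curve.py -- standalone sketch of the commented-out regression check above.
import pathlib
import sys

import pandas as pd
import torch
from pandas.testing import assert_frame_equal
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

EXP_DIR = "examples/nlp/language_modeling/retro_results/megatron_retro/smalltest"
EXPECTED_CSV = "/home/TestData/nlp/megatron_retro/expected_learning_curve.csv"

def main() -> None:
    # The reference curve was recorded on A100; skip the comparison on any other GPU.
    if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()):
        sys.exit(0)

    # Pick up the TensorBoard event file written by exp_manager for this run.
    event_file = list(pathlib.Path(EXP_DIR).glob("events.out.tfevents*"))[0]
    ea = EventAccumulator(str(event_file)).Reload()

    # Collect the reduced training loss at every logged step into a one-column frame.
    vals = [scalar.value for scalar in ea.Scalars("reduced_train_loss")]
    training_curve = pd.DataFrame({"loss": vals})

    # Compare against the stored reference curve within a small tolerance.
    gt_curve = pd.read_csv(EXPECTED_CSV)
    assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)

if __name__ == "__main__":
    main()

The rtol/atol values mirror the original assert_frame_equal call, so small numerical drift between runs does not fail the comparison.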
L2_RAG_Pipeline_Indexing: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Indexing') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/rag/rag_indexing.py \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.precision="bf16-mixed" \ - indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ - indexing.embedder.embed_batch_size=128 \ - indexing.data.data_path="/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data" \ - indexing.data.chunk_size=256 \ - indexing.data.chunk_overlap=10 \ - indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" - - L2_RAG_Pipeline_Generating: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Generating') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/rag/rag_generating.py \ - trainer.devices=1 \ - trainer.precision="bf16-mixed" \ - indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ - indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ - generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ - generating.inference.tokens_to_generate=50 \ - generating.inference.greedy=False \ - generating.inference.temperature=1.0 \ - generating.query="Which art schools did I apply to?"
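Every L2_* job in this file is gated by the same `if:` expression: the job runs when its id appears in the parsed `test_to_run` output, or when the `all` output is 'true'. As a quick mental model only (this Python is not part of the workflow, and the function name and example values below are illustrative), the selection logic amounts to:

import json

def job_selected(job_id: str, test_to_run_json: str, all_flag: str) -> bool:
    # Mirrors: contains(fromJSON(outputs.test_to_run), '<job_id>') || outputs.all == 'true'
    requested = json.loads(test_to_run_json)
    return job_id in requested or all_flag == "true"

# Example: a run that requested only the two RAG pipeline tests.
requested = '["L2_RAG_Pipeline_Indexing", "L2_RAG_Pipeline_Generating"]'
assert job_selected("L2_RAG_Pipeline_Generating", requested, "false")
assert not job_selected("L2_BioMegatron_Bert_NER_Task", requested, "false")
assert job_selected("L2_BioMegatron_Bert_NER_Task", requested, "true")  # the 'all' flag forces every job to run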
- - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - # This is to improve p2p overlap on H100 - export NVTE_FWD_LAYERNORM_SM_MARGIN=8 - export NVTE_BWD_LAYERNORM_SM_MARGIN=8 - export TORCH_NCCL_AVOID_RECORD_STREAMS=1 - export NCCL_MIN_NCHANNELS=4 - # TP overlap is not supported in docker environment - #NVTE_UB_SPLIT_RS: 0 - #NVTE_UB_ATOMIC_GEMM_RS: 1 - #NVTE_RS_STRIDED_ATOMIC: 1 - #NVTE_UB_FP8_RS: 1 - # Increase p2p chunksize to 2MB - export NCCL_P2P_NET_CHUNKSIZE=2097152 - # Disable gc when switching to/from validation steps - export NEMO_MANUAL_GC_IN_VALIDATION=0 - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=2 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping - - python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \ - exp_manager.resume_if_exists=True \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=2 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping - - - L2_Megatron_GPT_Skip_Train: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.skip_train=True \ - model.tensor_model_parallel_size=2 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[] \ - model.data.data_impl=mock \ - model.dist_ckpt_format=torch_dist - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - # This test requires Ampere but some of the test GPUs are Volta - # Need to add a check for compute capability before uncommenting this test - # - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2 - # when { - # anyOf { - # branch 
main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=2 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=3 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=1 \ - # model.optim.sched.constant_steps=1 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # model.use_flash_attention=True " - # # commented out to save time on github ci @adithyare - # # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # # trainer.devices=2 \ - # # trainer.accelerator=gpu \ - # # trainer.log_every_n_steps=1 \ - # # trainer.val_check_interval=2 \ - # # trainer.limit_val_batches=1 \ - # # trainer.accumulate_grad_batches=1 \ - # # trainer.max_steps=6 \ - # # trainer.precision=16 \ - # # trainer.gradient_clip_val=1.0 \ - # # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # # exp_manager.resume_if_exists=True \ - # # model.tensor_model_parallel_size=2 \ - # # model.optim.name=fused_adam \ - # # model.optim.lr=2e-4 \ - # # model.optim.sched.warmup_steps=2 \ - # # model.optim.sched.constant_steps=2 \ - # # model.optim.sched.min_lr=8e-5 \ - # # model.max_position_embeddings=128 \ - # # model.encoder_seq_length=128 \ - # # model.data.seq_length=128 \ - # # model.position_embedding_type=rope \ - # # model.rotary_percentage=0.5 \ - # # model.normalization=rmsnorm \ - # # model.bias=False \ - # # model.bias_activation_fusion=False \ - # # model.bias_dropout_add_fusion=False \ - # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # # model.num_layers=8 \ - # # model.hidden_size=256 \ - # # model.num_attention_heads=8 \ - # # model.activations_checkpoint_method=block \ - # # model.activations_checkpoint_granularity=full \ - # # model.activations_checkpoint_num_layers=1 \ - # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # # 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # # model.use_flash_attention=True" - # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - # rm -rf examples/nlp/language_modeling/gpt_index_mappings" - # } - # } - - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.reset_lr=True \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.checkpoint_callback_params.save_last_n_optim_states=1 \ - model.dist_ckpt_format="torch_dist" \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - 
model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - mkdir examples/llm/auto_configurator/auto_conf_logs - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=1 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=2 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=3 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --get_results - AFTER_SCRIPT: | - rm -rf examples/llm/auto_configurator/auto_conf_logs - IS_OPTIONAL: true - - L2_Megatron_GPT_Finetuning_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/gpt_finetuning_pp2_megatron \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/gpt_finetuning_pp2_megatron \ - model.pipeline_model_parallel_size=2 \ - 
model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - L2_Megatron_GPT_Finetuning_StarCoder_PP1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_StarCoder_PP1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=/tmp/gpt_sft_results_starcoder_pp1 \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - L2_Megatron_GPT_Reranker: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Reranker') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir="/tmp/gpt_reranker_workdir/" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ - 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - L2_Megatron_GPT_Embedding: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Embedding') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir="/tmp/gpt_embedding_workdir/" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix="/tmp/gpt_embedding_workdir/val_embs/" \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - - python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.restore_from_path="/tmp/gpt_embedding_workdir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/gpt_embedding_workdir/test_embs" \ - model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] - - L2_Megatron_GPT_PEFT_Lora_PP2_O2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_PP2_O2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_peft_lora_tuning_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.megatron_amp_O2=True \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - 
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - trainer.devices=2 \ - model.megatron_amp_O2=True \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=["quarel4"] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/nlp_peft_lora_tuning_pp2/out" \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path="/tmp/nlp_peft_lora_tuning_pp2/out.jsonl" - - L2_Megatron_GPT_PEFT_Lora_TP2_O1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2_O1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_peft_lora_tuning_pp2_o1 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme="lora" \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2_o1/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=["quarel4"] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/nlp_peft_lora_tuning_pp2_o1/out" \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path="/tmp/nlp_peft_lora_tuning_pp2_o1/out.jsonl" - - L2_Megatron_GPT_PEFT_Lora_TP2SP1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2SP1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - 
trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_lora_tuning_tp2_sp1 \ - +model.mcore_gpt=True \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.sequence_parallel=True \ - model.megatron_amp_O2=True \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - +model.fp8=True \ - +model.fp8_params=True \ - +model.fp8_hybrid=True \ - +model.fp8_e4m3=False \ - +model.fp8_interval=1 \ - +model.fp8_margin=0 \ - +model.fp8_amax_history_len=32 \ - +model.fp8_amax_compute_algo=max \ - +model.reduce_amax=False \ - +model.ub_tp_comm_overlap=False \ - +model.tp_comm_overlap_ag=False \ - +model.tp_comm_overlap_rs=False \ - +model.tp_comm_overlap_disable_qkv=True \ - model.peft.peft_scheme="lora" \ - model.peft.lora_tuning.adapter_dim=16 \ - model.peft.lora_tuning.alpha=32 \ - model.peft.lora_tuning.column_init_method="kaiming" \ - +model.peft.lora_tuning.dropout_position="pre" \ - model.peft.lora_tuning.target_modules=["attention"] \ - model.peft.lora_tuning.adapter_dropout=0.1 \ - +model.peft.lora_tuning.a2a_experimental=1 \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - L2_Megatron_GPT_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=["How to fix GPU memory? 
A:"] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32 - - L2_Megatron_GPT_Eval_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32 - - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ - model.data.test_ds.names=[test] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=examples/nlp/language_modeling/out.jsonl - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/out.jsonl - - # TODO: Add this test back. Test was failing on CI machines due to HW error - # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval - # when { - # anyOf { - # branch main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python -m torch.distributed.launch --nproc_per_node=2 \ - # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - # --checkpoint_name=model_optim_rng.pt \ - # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - # --model_type=gpt \ - # --pipeline_model_parallel_size=1 \ - # --gpus_per_node=2 \ - # --tensor_model_parallel_size=2" - # python examples/nlp/language_modeling/megatron_gpt_eval.py \ - # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - # --tokens_to_generate=32 \ - # --tensor_model_parallel_size=2 \ - # --prompt=This is a test. 
- # rm examples/nlp/language_modeling/small_gpt.nemo - - # L2_Megatron_Change_Partitions - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 2 - AFTER_SCRIPT: | - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 4 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 1 - AFTER_SCRIPT: | - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=null \ - trainer.max_steps=10 \ - trainer.val_check_interval=10 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - 
model.mcore_t5=True \ - model.transformer_engine=True \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.global_batch_size=4 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type="pre_ln" \ - model.decoder.transformer_block_type="pre_ln" \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=null \ - trainer.max_steps=10 \ - trainer.val_check_interval=10 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_t5=True \ - model.transformer_engine=True \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.global_batch_size=4 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type="pre_ln" \ - model.decoder.transformer_block_type="pre_ln" \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - 
model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: 
- RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - 
+model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - 
L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_w_Mixture_of_Expert_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=0 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - - L2_Megatron_Core_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ - --tensor_model_parallel_size 1 - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - 
++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - - L2_Megatron_Core_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - 
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=1 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_mcore_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/out.jsonl - - # L2: Megatron Mock Data Generation - L2_Megatron_Mock_Data_Generation_MockGPTDataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockGPTDataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.mcore_gpt=True \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - - L2_Megatron_Mock_Data_Generation_MockT5Dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockT5Dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - - # L2: TTS Fast dev runs 1 - L2_TTS_Fast_dev_runs_1_Tacotron_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Tacotron_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch - - L2_TTS_Fast_dev_runs_1_WaveGlow: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_WaveGlow') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/waveglow.py \ - 
train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch - - L2_TTS_Fast_dev_runs_1_FastPitch: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_FastPitch') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - # OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/tts/radtts.py \ - # train_dataset=/home/TestData/an4_dataset/an4_train.json \ - # validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - # sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - # trainer.devices="[0]" \ - # +trainer.limit_train_batches=1 \ - # +trainer.limit_val_batches=1 \ - # trainer.max_epochs=1 \ - # trainer.strategy=auto \ - # model.pitch_mean=212.35873413085938 \ - # model.pitch_std=68.52806091308594 \ - # model.train_ds.dataloader_params.batch_size=4 \ - # model.train_ds.dataloader_params.num_workers=0 \ - # model.validation_ds.dataloader_params.batch_size=4 \ - # model.validation_ds.dataloader_params.num_workers=0 \ - # export_dir=/home/TestData/radtts_test \ - # model.optim.lr=0.0001 \ - # model.modelConfig.decoder_use_partial_padding=True \ - # ~trainer.check_val_every_n_epoch \ - # ~model.text_normalizer \ - # ~model.text_normalizer_call_kwargs - # #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # # if: "failure()" - - L2_TTS_Fast_dev_runs_1_Mixer-TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Mixer-TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - L2_TTS_Fast_dev_runs_1_Hifigan: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Hifigan') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch - - # L2: NeRF - # L2_NeRF_DreamFusion: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/multimodal/text_to_image/nerf/main.py \ - # trainer.num_nodes=1 \ - # trainer.devices="[0]" \ - # trainer.max_steps=1000 \ - # model.prompt="a DSLR photo of a delicious hamburger" \ - # exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results - # - # rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - Speech_Checkpoints_tests: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Speech_Checkpoints_tests') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ - batch_size=64 \ - tolerance=0.1012 - AFTER_SCRIPT: | - rm -f examples/asr/evaluation_transcripts.json - - L2_Stable_Diffusion_Training: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Stable_Diffusion_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - rm -rf examples/multimodal/text_to_image/sd_train_results - - python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.devices=1 \ - trainer.max_steps=3 \ - +trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.gradient_clip_val=0 \ - exp_manager.exp_dir=examples/multimodal/text_to_image/sd_train_results \ - exp_manager.create_checkpoint_callback=False \ - exp_manager.resume_if_exists=False \ - model.resume_from_checkpoint=null \ - model.precision=16 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.first_stage_key=moments \ - model.cond_stage_key=encoded \ - +model.load_vae=False \ - +model.load_unet=False \ - +model.load_encoder=False \ - model.parameterization=v \ - model.load_only_unet=False \ - model.text_embedding_dropout_rate=0.0 \ - model.inductor=True \ - model.inductor_cudagraphs=False \ - model.capture_cudagraph_iters=15 \ - +model.unet_config.num_head_channels=64 \ - +model.unet_config.use_linear_in_transformer=True \ - model.unet_config.context_dim=1024 \ - model.unet_config.use_flash_attention=null \ - model.unet_config.resblock_gn_groups=16 \ - model.unet_config.unet_precision=fp16 \ - +model.unet_config.timesteps=1000 \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - +model.optim.master_weights=True \ - model.optim.weight_decay=0.01 \ - model.first_stage_config.from_pretrained=null \ - model.data.num_workers=16 \ - model.data.synthetic_data=True - AFTER_SCRIPT: | - rm -rf examples/multimodal/text_to_image/sd_train_results - - L2_NeMo_2_GPT_Pretraining_no_transformer_engine: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_Pretraining_no_transformer_engine') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pip uninstall -y apex ## TODO: remove when apex is no longer a dependency - pip uninstall -y transformer_engine - - python tests/collections/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=3 \ - --experiment-dir=tests/collections/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=tests/collections/llm/gpt_index_mappings \ - --no-masked-softmax-fusion - - python tests/collections/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=6 \ - --experiment-dir=tests/collections/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=tests/collections/llm/gpt_index_mappings \ - --no-masked-softmax-fusion - AFTER_SCRIPT: | - rm -rf tests/collections/llm/gpt_pretrain_results - rm -rf tests/collections/llm/gpt_index_mappings - - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/lightning/test_ddp_parity_checker.py \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document - - AFTER_SCRIPT: | - rm -rf tests/collections/llm/gpt_pretrain_results - rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - - L2_NeMo_2_SSM_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \ - --devices 1 \ - --max-steps 10 \ - --experiment-dir /tmp/nlp_megatron_mamba_nemo-ux-mamba_cicd_test_pretrain/${{ github.run_id }} \ - --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document - - L2_NeMo_2_SSM_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \ - --devices 1 \ - --max-steps 10 \ - --experiment-dir /tmp/nlp_megatron_mamba_nemo-ux-mamba_cicd_test_sft/${{ github.run_id }} \ - --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt - - L2_NeMo_2_HF_MODEL_IMPORT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_HF_MODEL_IMPORT') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/test_model_import.py - - AFTER_SCRIPT: | - rm -rf ~/.cache/nemo/models - - L2_NeMo_2_T5_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ - --devices=2 \ - --max-steps=3 \ - --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ - --index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }} - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ - --devices=2 \ - --max-steps=6 \ - --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ - --index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }} - AFTER_SCRIPT: | - rm -rf tests/collections/llm/t5_pretrain_results/${{ github.run_id }} - rm -rf 
tests/collections/llm/t5_index_mappings/${{ github.run_id }} - - L2_NeMo_2_Mixtral_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \ - --experiment-dir=/tmp/mixtral_pretrain_results \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document - - Nemo_CICD_Test: - needs: - - pre-flight - - gpu-test - - cicd-test-container-setup - - #- OPTIONAL_L0_Unit_Tests_GPU_ASR - - L0_Unit_Tests_GPU_Audio - - L0_Unit_Tests_GPU_Common - - L0_Unit_Tests_GPU_LLM - - L0_Unit_Tests_GPU_Multimodal - - L0_Unit_Tests_GPU_NLP - - L0_Unit_Tests_GPU_TTS - #- OPTIONAL_L0_Unit_Tests_GPU_Core - - L0_Unit_Tests_GPU_Hydra - #- OPTIONAL_L0_Unit_Tests_GPU_Lightning - - L0_Unit_Tests_GPU_Others - - - L0_Unit_Tests_CPU_ASR - - L0_Unit_Tests_CPU_Audio - - L0_Unit_Tests_CPU_Common - - L0_Unit_Tests_CPU_LLM - - L0_Unit_Tests_CPU_Multimodal - - L0_Unit_Tests_CPU_NLP - - L0_Unit_Tests_CPU_TTS - - L0_Unit_Tests_CPU_Core - - L0_Unit_Tests_CPU_Hydra - - L0_Unit_Tests_CPU_Lightning - - L0_Unit_Tests_CPU_Others - - - L2_Community_LLM_Checkpoints_tests_Bert - - L2_Community_LLM_Checkpoints_tests_Mamba2 - - L2_Community_LLM_Checkpoints_tests_Llama - - L2_Community_LLM_Checkpoints_tests_StarCoder - - L2_Community_LLM_Checkpoints_tests_Falcon - - L2_Community_vita_Checkpoints_tests_Llama3 - #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2 - - ASR_dev_run_Speech_to_Text - - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet - - ASR_dev_run_Speech_Pre-training_-_CitriNet - - ASR_dev_run_Speech_To_Text_Finetuning - - ASR_dev_run_Speech_To_Text_HF_Finetuning - - ASR_dev_run_Speech_to_Text_WPE_-_Conformer - - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer - - L2_Speech_to_Text_EMA - - L2_Speaker_dev_run_Speaker_Recognition - - L2_Speaker_dev_run_Speaker_Diarization - - L2_Speaker_dev_run_Speech_to_Label - - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference - - L2_Speaker_dev_run_Clustering_Diarizer_Inference - - L2_Speaker_dev_run_Neural_Diarizer_Inference - - L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader - - L2_ASR_Adapters_Linear_Adapters - - L2_ASR_Adapters_RelPos_MHA_Adapters - - L2_Speech_Transcription_Speech_to_Text_Transcribe - #- OPTIONAL_L2_Transducer_alignment_Running_pytest - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - 
L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Pretraining_BERT_pretraining_from_Text - - L2_Pretraining_BERT_from_Preprocessed - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation - - L2_NMT_Attention_is_All_You_Need_Inference - - L2_NMT_Attention_is_All_You_Need_Finetuning - - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_RAG_Pipeline_Indexing - - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Skip_Train - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 - #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124 - - L2_Megatron_GPT_Finetuning_PP2 - - L2_Megatron_GPT_Finetuning_StarCoder_PP1 - - L2_Megatron_GPT_Embedding - - L2_Megatron_GPT_PEFT_Lora_PP2_O2 - - L2_Megatron_GPT_PEFT_Lora_TP2_O1 - - L2_Megatron_GPT_PEFT_Lora_TP2SP1 - - L2_Megatron_GPT_Eval - - L2_Megatron_GPT_Eval_PP2 - - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - - L2_Megatron_Core_T5_PEFT_Lora_TP2 - - L2_Megatron_Mock_Data_Generation_MockGPTDataset - - L2_Megatron_Mock_Data_Generation_MockT5Dataset - - L2_TTS_Fast_dev_runs_1_Tacotron_2 - - L2_TTS_Fast_dev_runs_1_WaveGlow - - L2_TTS_Fast_dev_runs_1_FastPitch - #- OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS - - L2_TTS_Fast_dev_runs_1_Mixer-TTS - - L2_TTS_Fast_dev_runs_1_Hifigan - - Speech_Checkpoints_tests - - L2_Stable_Diffusion_Training - - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check - - L2_NeMo_2_HF_MODEL_IMPORT - - L2_NeMo_2_SSM_Pretraining - - L2_NeMo_2_SSM_Finetuning - - L2_NeMo_2_T5_Pretraining - - L2_NeMo_2_Mixtral_Pretraining - - L2_PTQ_Llama2_INT8_SQ - - L2_PTQ_Llama2_FP8 - - 
L2_Community_LLM_Checkpoints_tests_Llama3 - - L2_PTQ_Llama2_Export_Only - - L2_Distill_Llama2 - - L2_Prune_Width_Llama2 - - L2_Speech_to_Text_AED - - L2_Speech_Estimate_Duration_Bins - - L2_Speech_Batch_Size_OOMptimizer - - L2_Speech_Batch_Size_OOMptimizer_Canary - - L2_Speech_Transcription_Canary_Transcribe_Full_Manifest - - L2_Speech_Transcription_Canary_Transcribe_With_Prompt - - L2_Speech_Transcription_Canary_Transcribe_Audio_Dir - - L2_Megatron_GPT_Reranker - if: always() - runs-on: ubuntu-latest - steps: - - name: Evaluate conclusion - if: ${{ always() }} - id: pipeline-conclusion - run: | - # Slack notifications are sent only on test failure (not cancelled): - FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }} - echo "FAILED=$FAILED" >> $GITHUB_OUTPUT - - # Mark as successful only if no job failed, was cancelled, or was skipped: - SUCCESS=${{ !contains(needs.*.outputs.conclusion, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }} - echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT - - # This should depend on all the tests so we block/unblock based on all tests passing - - name: Pipeline successful, set exit code to 0 - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }} - run: exit 0 - - - name: Pipeline successful, add PR comment - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }} - uses: peter-evans/create-or-update-comment@v4 - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - with: - issue-number: ${{ github.event.number }} - body: | - [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋, - - We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully - - So it might be time to merge this PR or get some approvals - - I'm just a bot so I'll leave it to you what to do next. - - //cc @pablo-garay @ko3n1g - - - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary" - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }} - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - PR_NUMBER: ${{ github.event.number }} - SERVER_URL: ${{ github.server_url }} - run: | - set -x - - PR_INFO=$(curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/$REPOSITORY/pulls/$PR_NUMBER - ) - PR_URL=$(echo -E $PR_INFO | jq '.html_url' | tr -d '"') - PR_TITLE=$(echo -E $PR_INFO | jq '.title' | tr -d '"') - - PIPELINE_URL=$SERVER_URL/$REPOSITORY/actions/runs/$RUN_ID - BASE_MESSAGE=' - { - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "🚨 *CI/CD failure at <'$PIPELINE_URL'|NeMo CI>*."
- } - } - ] - } - ' - - # Since this workflow contains more than 100 jobs, we need to iterate over job pages - JOBS='[]' - PAGE=1 - while : ; do - JOBS_URL="https://api.github.com/repos/$REPOSITORY/actions/runs/$RUN_ID/jobs?page=$PAGE&per_page=100" - RESPONSE=$(curl -s -H "Authorization: token $GITHUB_TOKEN" $JOBS_URL | jq '.jobs') - JOBS=$(echo -e "$JOBS\n$RESPONSE" | jq -cs 'add') - if [[ $(echo $RESPONSE | jq 'length') -lt 100 ]]; then - break - else - PAGE=$(( PAGE + 1)) - fi - done - - SUMMARY="[]" - echo "Failed jobs: " | tee -a $GITHUB_STEP_SUMMARY - while IFS= read -r JOB; do - JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" - JOB_ID=$(echo $JOBS | jq --arg job_name "$JOB_NAME" '.[] | select(.name == $job_name) | .id') - JOB_URL="https://github.com/$REPOSITORY/actions/runs/$RUN_ID/job/$JOB_ID" - - echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY - - LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"') - - SUMMARY=$(echo "$SUMMARY" | jq \ - --arg pr "<$PR_URL|$PR_TITLE>" \ - --arg job "<$JOB_URL|$JOB_NAME>" \ - --arg logs "$LOGS" \ - --arg author "" \ - --arg branch ""\ - '. += [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": ( - "PR: " + $pr - + "\nJob: " + $job - + "\nAuthor: " + $author - + "\nBranch: " + $branch - + "\nLogs:" - + "```\n" + $logs + "\n```" - ) - } - } - ]') - done <<<$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")') - - MESSAGE=$(echo $BASE_MESSAGE | jq -c --argjson summary "$SUMMARY" '.blocks += $summary') - - curl -X POST -H "Content-type: application/json" --data "$MESSAGE" $SLACK_WEBHOOK - - - name: "Pipeline not successful, set exit code to 1" - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1
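For reference, the Slack-alert step removed above can be exercised outside of GitHub Actions with a small standalone script. The sketch below is an illustration under stated assumptions, not part of the deleted workflow: it assumes `GITHUB_TOKEN`, `REPOSITORY`, `RUN_ID`, and `SLACK_WEBHOOK` are exported, and it simplifies by reading job conclusions straight from the jobs API rather than from the workflow's `needs` context. Like the original step, it pages through the jobs endpoint 100 at a time (the API caps `per_page` at 100 and this workflow fans out into well over 100 jobs), collects the failed jobs, and posts a Slack Block Kit message.

```bash
#!/usr/bin/env bash
# Hypothetical sketch, not the deleted workflow step itself.
# Assumes GITHUB_TOKEN, REPOSITORY (owner/repo), RUN_ID, and SLACK_WEBHOOK are exported.
set -euo pipefail

# Page through the run's jobs; the endpoint returns at most 100 jobs per page.
JOBS='[]'
PAGE=1
while : ; do
  RESPONSE=$(curl -s \
    -H "Authorization: token $GITHUB_TOKEN" \
    "https://api.github.com/repos/$REPOSITORY/actions/runs/$RUN_ID/jobs?page=$PAGE&per_page=100" \
    | jq '.jobs')
  JOBS=$(echo -e "$JOBS\n$RESPONSE" | jq -cs 'add')
  # A page with fewer than 100 jobs is the last page.
  if [[ $(echo "$RESPONSE" | jq 'length') -lt 100 ]]; then
    break
  fi
  PAGE=$((PAGE + 1))
done

# One Slack "section" block per failed job, linking to the job's log page.
BLOCKS=$(echo "$JOBS" | jq -c '[ .[]
  | select(.conclusion == "failure")
  | { type: "section",
      text: { type: "mrkdwn",
              text: ("Job: <" + .html_url + "|" + .name + ">") } } ]')

# Wrap the blocks in a payload and post it to the incoming webhook.
MESSAGE=$(jq -nc --argjson blocks "$BLOCKS" '{blocks: $blocks}')
curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"
```

The deleted step additionally enriched each block with the PR link, base64-decoded log tails from the `needs` outputs, and a step-summary entry; the sketch keeps only the pagination and failure-selection logic that the step's comments describe.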