diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
deleted file mode 100644
index c0aedbc1524ef..0000000000000
--- a/.github/workflows/_test_template.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-name: ~test template
-
-on:
-  workflow_call:
-    inputs:
-      RUNNER:
-        type: string
-        description: Runner to use for test
-        required: true
-      TIMEOUT:
-        type: number
-        description: Max runtime of test in minutes
-        required: false
-        default: 10
-      SCRIPT:
-        type: string
-        description: Test script to execute
-        required: true
-      AFTER_SCRIPT:
-        type: string
-        description: Script to run after main test
-        required: false
-        default: ":"
-      IS_OPTIONAL:
-        type: boolean
-        description: Failure will cancel all other tests if set to true
-        required: false
-        default: false
-    outputs:
-      conclusion:
-        description: Conclusion of main test step
-        value: ${{ jobs.main.outputs.conclusion }}
-      log:
-        description: Last 2000 characters of the test step's log
-        value: ${{ jobs.main.outputs.log }}
-jobs:
-
-  main:
-    runs-on: ${{ inputs.RUNNER }}
-    outputs:
-      conclusion: ${{ steps.main.conclusion }}
-      log: ${{ steps.main.outputs.log }}
-    steps:
-      - name: Docker system cleanup
-        run: |
-          docker system prune -a --filter "until=48h" --force || true
-
-      - name: Docker pull image
-        run: |
-          docker pull nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-
-      - name: Start container
-        run: |
-          docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
-
-      - id: main
-        name: Run main script
-        timeout-minutes: ${{ inputs.TIMEOUT }}
-        run: |
-          mkdir -p ${{ github.run_id }}
-          cd ${{ github.run_id }}/
-          set +e
-          (
-            set -e
-
-            docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
-          ) 2> >(tee err.log)
-
-          EXIT_CODE=$?
-
-          echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT"
-
-          exit $EXIT_CODE
-
-      - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-        if: failure() && inputs.IS_OPTIONAL == false
-      - name: after_script
-        if: always() && inputs.AFTER_SCRIPT != ':'
-        run: |
-          docker exec nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
-
-      - name: Container shutdown
-        if: always()
-        run: |
-          docker container stop nemo_container_${{ github.run_id }} || true
-          docker container rm nemo_container_${{ github.run_id }} || true
\ No newline at end of file
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
deleted file mode 100644
index 871f58ed7bf28..0000000000000
--- a/.github/workflows/cicd-main.yml
+++ /dev/null
@@ -1,5465 +0,0 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
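Every test job in the cicd-main.yml hunks below invokes the deleted template above via uses:. As an orientation aid (not part of the diff), a minimal caller would look like the following sketch; the job name and pytest path are hypothetical, while the input names are exactly the workflow_call inputs declared in the template:

  Example_Unit_Test:                      # hypothetical job name
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure           # forwarded to runs-on in the template
      TIMEOUT: 15                         # minutes; also sizes the container's sleep window
      SCRIPT: |
        pytest tests/some_suite -m "not pleasefixme"
      AFTER_SCRIPT: |
        rm -rf /tmp/some_suite_results
      IS_OPTIONAL: false                  # when false, a failure triggers the cancel-workflow action

The log output exposed by the template carries the last 2000 characters of the main step's stderr, base64-encoded, so a caller can surface it in downstream reporting without re-running the test.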
-name: "CICD NeMo" -on: - pull_request: - branches: - - 'main' - - 'r**' - types: [ labeled ] - - workflow_dispatch: - inputs: - test_to_run: - required: false - default: all - type: string - description: Comma-separated list of tests to run. Use "all" to run the full test suite. - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - pre-flight: - runs-on: ubuntu-latest - outputs: - test_to_run: ${{ steps.test_to_run.outputs.main }} - all: ${{ steps.all.outputs.main }} - steps: - - name: Parse test_to_run - id: test_to_run - run: | - parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")') - echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT" - - name: Parse all - id: all - run: | - echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" - - gpu-test: - needs: [pre-flight] - runs-on: self-hosted-azure - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - steps: - - name: Run nvidia-smi test - run: | - whoami - nvidia-smi - - - cicd-cluster-clean: - runs-on: self-hosted-azure-builder - needs: [pre-flight] - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - steps: - - name: Clean server from old files - run: | - docker system prune --filter "until=24h" --filter "label=nemo.library=nemo-core" --force - - cicd-test-container-setup: - needs: [cicd-cluster-clean, pre-flight] - runs-on: self-hosted-azure-builder - if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }} - outputs: - test_to_run: ${{ needs.pre-flight.outputs.test_to_run }} - all: ${{ needs.pre-flight.outputs.all }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - path: ${{ github.run_id }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - # We use `docker` driver as this speeds things up for - # trivial (non-multi-stage) builds. - driver: docker - - - name: Restore cache - run: | - docker pull nemoci.azurecr.io/nemo_container:latest - docker pull nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} || true - - - name: Build and push - uses: docker/build-push-action@v5 - with: - file: Dockerfile.ci - push: true - cache-from: | - nemoci.azurecr.io/nemo_container:latest - nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} - cache-to: type=inline - tags: | - nemoci.azurecr.io/nemo_container_${{ github.run_id }} - nemoci.azurecr.io/nemo_container_${{ github.event.number || 'noop' }} - nemoci.azurecr.io/nemo_container:latest - - - name: Run some checks - run: | - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '\ - # PyTorch Lightning version - python -c "import pytorch_lightning; print(pytorch_lightning.__version__)" - - # PyTorch Lightning DDP Checks - CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py" - - # Basic Import Checks - python -c "import nemo.collections.asr as nemo_asr" - python -c "import nemo.collections.nlp as nemo_nlp" - python -c "import nemo.collections.nlp as nemo_nlp; nemo_nlp.modules.get_tokenizer_list()" - python -c "import nemo.collections.tts as nemo_tts" - - python setup.py style - python tests/check_copyright_header.py --dir . 
- - # These checks are not crucial - exit 0 - ' - ### \'\' - - # L0: GPU unit tests - OPTIONAL_L0_Unit_Tests_GPU_ASR: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Audio: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_Common: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_LLM: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_Multimodal: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_NLP: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --with_downloads - - L0_Unit_Tests_GPU_TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --with_downloads - - OPTIONAL_L0_Unit_Tests_GPU_Core: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/core -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Hydra: - needs: 
[cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --with_downloads - - OPTIONAL_L0_Unit_Tests_GPU_Lightning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true - - L0_Unit_Tests_GPU_Others: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads \ - --ignore=tests/collections/asr \ - --ignore=tests/collections/audio \ - --ignore=tests/collections/common \ - --ignore=tests/collections/llm \ - --ignore=tests/collections/multimodal \ - --ignore=tests/collections/nlp \ - --ignore=tests/collections/tts \ - --ignore=tests/core \ - --ignore=tests/core_ptl \ - --ignore=tests/hydra \ - --ignore=tests/lightning \ - --ignore=tests/utils - - # L0: CPU unit tests - L0_Unit_Tests_CPU_ASR: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Audio: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Audio') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/audio -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Common: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Common') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_LLM: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - 
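All of the unit-test jobs in this block (and the L2 jobs that follow) are gated the same way: the pre-flight job turns the comma-separated test_to_run input into a JSON array with jq, and each job's if: condition checks membership with contains(fromJSON(...)), falling back to the "all" flag. A condensed sketch of that round trip, with an illustrative job name:

  # pre-flight step, restated from earlier in this file:
  - name: Parse test_to_run
    id: test_to_run
    run: |
      # "A,B" -> ["A","B"]; defaults to ["all"] when the input is empty
      parsed_string=$(echo "${{ inputs.test_to_run || 'all' }}" | jq -c --raw-input 'split(",")')
      echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

  # downstream selection used by every test job (Some_Test_Job is a placeholder):
  # if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Some_Test_Job') || needs.cicd-test-container-setup.outputs.all == 'true'

The CPU variants additionally export CUDA_VISIBLE_DEVICES="" inside SCRIPT so the same container image runs the suites without using the GPUs.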
L0_Unit_Tests_CPU_Multimodal: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Multimodal') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/multimodal -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_NLP: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_NLP') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/nlp -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/collections/tts -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Core: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Core') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/core tests/core_ptl -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Hydra: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Hydra') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/hydra -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Lightning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Lightning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest tests/lightning -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat - - L0_Unit_Tests_CPU_Others: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_CPU_Others') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-cpu - SCRIPT: | - CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat \ - --ignore=tests/collections/asr \ - --ignore=tests/collections/audio \ - --ignore=tests/collections/common \ - --ignore=tests/collections/llm \ - --ignore=tests/collections/multimodal \ - --ignore=tests/collections/nlp \ - --ignore=tests/collections/tts 
\ - --ignore=tests/core \ - --ignore=tests/core_ptl \ - --ignore=tests/hydra \ - --ignore=tests/lightning \ - --ignore=tests/utils - - - L0_Setup_Test_Data_And_Models: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Setup_Test_Data_And_Models') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python -m tests.setup --save_dir /home/TestData/nlp - - # - name: L2: Multimodal Imagen Train - - # L2: Community LLM Checkpoints tests - L2_Community_LLM_Checkpoints_tests_Bert: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Bert') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_bert_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_ir/sbert/hf_model/bert-base-uncased \ - --output_path /tmp/nlp_megatron_ir_sbert/sbert.nemo - - L2_Community_LLM_Checkpoints_tests_Mamba2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Mamba2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \ - --output_path /tmp/nlp_megatron_mamba/converted_mamba.nemo \ - --precision=bf16 \ - --mamba_ssm_ngroups 1 - - L2_Community_LLM_Checkpoints_tests_Llama: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \ - --output_path=/tmp/nlp_megatron_llama/llama_ci.nemo \ - --precision=16 - - L2_Community_LLM_Checkpoints_tests_Llama3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ - --input_name_or_path=/home/TestData/nlp/megatron_llama/llama3-ci-hf \ - --output_path=/tmp/nlp_megatron_llama_llama3-ci-hf/llama3_ci.nemo \ - --precision=16 - - L2_Community_LLM_Checkpoints_tests_StarCoder: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_StarCoder') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - mkdir -p /tmp/nlp_megatron_gpt_starcoder-ci-hf/ - python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ 
- --output_path /tmp/nlp_megatron_gpt_starcoder-ci-hf/ - - L2_Community_LLM_Checkpoints_tests_Falcon: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_LLM_Checkpoints_tests_Falcon') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ - --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ - --output_path /tmp/nlp_megatron_gpt_falcon-ci-hf/falcon_ci.nemo - - # L2: Community llava multimodal Checkpoints tests - L2_Community_vita_Checkpoints_tests_Llama3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Community_vita_Checkpoints_tests_Llama3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - mkdir /tmp/${{ github.run_id }} - export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH - CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \ - --in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \ - --mm-projector-ckpt-dir /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/mm_projector \ - --mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \ - --tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \ - --config-file vita_config.yaml \ - --out-file=/tmp/${{ github.run_id }}/llama3_ci.nemo \ - --model-type VITA \ - --conv-template llama_3 - - # this test is using a 7B model which is too large for GitHub CI - # replace the model in this test with a toy model or move the test - # to the nightly CI - # OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ - # --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ - # --output_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - # rm -f /home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base/ci.nemo - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_PTQ_Llama2_Export_Only: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_Export_Only') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.algorithm=null \ - export.save_path=/tmp/nlp_megatron_llama_export_only/ci_baseline - - L2_PTQ_Llama2_FP8: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=fp8 \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.inference_tensor_parallel=2 \ - export.sample_output=False \ - export.save_path=/tmp/nlp_megatron_llama_eo/ci_fp8.qnemo - - L2_PTQ_Llama2_INT8_SQ: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_INT8_SQ') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 15 - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - quantization.algorithm=int8_sq \ - quantization.num_calib_size=8 \ - inference.batch_size=2 \ - export.sample_output=False \ - export.save_path=/tmp/nlp_megatron_llama_eo/ci_int8_sq.qnemo - - # TODO: investigate int4_awq stuck issues and restore the test - #L2_PTQ_Llama2_INT4_AWQ: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_gpt_ptq.py \ - # model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - # model.tensor_model_parallel_size=1 \ - # trainer.devices=1 \ - # quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \ - # quantization.algorithm=int4_awq \ - # quantization.num_calib_size=8 \ - # inference.batch_size=2 \ - # export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo - # - # rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo - #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - # OPTIONAL_L2_QAT_Llama2_INT4: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \ - # quantization.algorithm=int4 \ - # quantization.num_calib_size=8 \ - # trainer.devices=1 \ - # trainer.num_nodes=1 \ - # trainer.max_steps=4 \ - # trainer.val_check_interval=4 \ - # +trainer.limit_val_batches=2 \ - # exp_manager.explicit_log_dir=llama2_qat_results \ - # model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - # model.tensor_model_parallel_size=1 \ - # model.pipeline_model_parallel_size=1 \ - # model.global_batch_size=2 \ - # 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - # model.data.train_ds.concat_sampling_probabilities=[1.0] \ - # model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] - - # rm -rf llama2_qat_results - - L2_Distill_Llama2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Distill_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_distillation.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - trainer.max_steps=5 \ - trainer.log_every_n_steps=5 \ - trainer.val_check_interval=5 \ - trainer.limit_val_batches=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.kd_teacher_restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=2 \ - model.pipeline_model_parallel_size=1 \ - model.micro_batch_size=1 \ - model.global_batch_size=4 \ - model.optim.name=distributed_fused_adam \ - model.optim.sched.warmup_steps=1 \ - model.data.data_prefix=[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - exp_manager.exp_dir=/tmp/megatron_llama_distill - - L2_Prune_Width_Llama2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Prune_Width_Llama2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_prune.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=2 \ - prune.num_calib_size=8 \ - prune.ffn_hidden_size=192 \ - prune.num_attention_heads=2 \ - prune.num_query_groups=2 \ - prune.hidden_size=null \ - export.save_path=examples/nlp/language_modeling/ci_prune_width.nemo - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/ci_prune_width.nemo - - # L2: ASR dev run - ASR_dev_run_Speech_to_Text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/citrinet/" 
--config-name="config_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_results - - ASR_dev_run_Speech_Pre-training_-_CitriNet: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_Pre-training_-_CitriNet') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_pretraining/speech_pre_training.py \ - --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_pre_training_results - - ASR_dev_run_Speech_To_Text_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - model.tokenizer.update_tokenizer=False \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_finetuning_results - - ASR_dev_run_Speech_To_Text_HF_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: |- - python examples/asr/speech_to_text_finetune.py \ - --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ - ~model.train_ds.hf_data_cfg \ - model.train_ds.num_workers=1 \ - model.train_ds.batch_size=2 model.validation_ds.batch_size=2 \ - model.train_ds.streaming=true \ - +model.train_ds.hf_data_cfg.path="librispeech_asr" \ - +model.train_ds.hf_data_cfg.name=null \ - +model.train_ds.hf_data_cfg.split="test.clean" \ - +model.train_ds.hf_data_cfg.streaming=true \ - +model.train_ds.hf_data_cfg.trust_remote_code=True \ - ~model.validation_ds.hf_data_cfg \ - model.validation_ds.streaming=true \ - +model.validation_ds.hf_data_cfg.path="librispeech_asr" \ - +model.validation_ds.hf_data_cfg.name=null \ - +model.validation_ds.hf_data_cfg.split="test.clean" \ - +model.validation_ds.hf_data_cfg.streaming=true \ - +model.validation_ds.hf_data_cfg.trust_remote_code=True \ - ~model.test_ds \ - init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ - 
model.tokenizer.update_tokenizer=False \ - model.optim.sched.warmup_steps=0 \ - +model.optim.sched.max_steps=3 \ - trainer.max_epochs=null \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_finetuning_results - - ASR_dev_run_Speech_to_Text_WPE_-_Conformer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run_Speech_to_Text_WPE_-_Conformer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_conformer_results - - # L2: ASR dev run - part two - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ - --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ - model.tokenizer.type="wpe" \ - model.encoder.d_model=144 \ - model.train_ds.batch_size=4 \ - model.validation_ds.batch_size=4 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_wpe_squeezeformer_results - - L2_Speech_to_Text_EMA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_EMA') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=2 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - +exp_manager.ema.enable=True \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - L2_Speech_to_Text_AED: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_to_Text_AED') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_multitask/speech_to_text_aed.py \ - model.prompt_format=canary \ - model.model_defaults.asr_enc_hidden=256 \ - model.model_defaults.lm_dec_hidden=256 \ - 
model.encoder.n_layers=12 \ - model.transf_encoder.num_layers=0 \ - model.transf_decoder.config_dict.num_layers=12 \ - model.train_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_train.json \ - model.train_ds.batch_duration=60 \ - model.train_ds.use_bucketing=false \ - model.train_ds.shuffle_buffer_size=100 \ - model.train_ds.num_workers=0 \ - +model.train_ds.text_field="answer" \ - +model.train_ds.lang_field="target_lang" \ - model.validation_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.validation_ds.text_field="answer" \ - +model.validation_ds.lang_field="target_lang" \ - model.validation_ds.num_workers=0 \ - model.test_ds.manifest_filepath=/home/TestData/asr/manifests/canary/an4_canary_val.json \ - +model.test_ds.text_field="answer" \ - +model.test_ds.lang_field="target_lang" \ - model.test_ds.num_workers=0 \ - spl_tokens.model_dir=/home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32 \ - model.tokenizer.langs.en.dir=/home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4 \ - model.tokenizer.langs.en.type=bpe \ - ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ - ++model.tokenizer.langs.es.type=bpe \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_aed_results - - # L2: Speaker dev run - L2_Speaker_dev_run_Speaker_Recognition: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Recognition') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/speaker_tasks/recognition/speaker_reco.py \ - model.train_ds.batch_size=10 \ - model.validation_ds.batch_size=2 \ - model.train_ds.manifest_filepath=/home/TestData/an4_speaker/train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ - model.decoder.num_classes=2 \ - trainer.max_epochs=10 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speaker_recognition_results - - L2_Speaker_dev_run_Speaker_Diarization: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ - model.diarizer.speaker_embeddings.model_path=titanet_large \ - model.train_ds.batch_size=5 \ - model.validation_ds.batch_size=5 \ - model.train_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ - model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speaker_diarization_results - - L2_Speaker_dev_run_Speech_to_Label: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speech_to_Label') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=/tmp/speech_to_label_results - - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=[1.5] \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=[0.75] \ - diarizer.speaker_embeddings.parameters.multiscale_weights=[1.0] \ - diarizer.asr.model_path=QuartzNet15x5Base-En \ - diarizer.asr.parameters.asr_based_vad=True \ - diarizer.out_dir=/tmp/speaker_diarization_asr_results - - L2_Speaker_dev_run_Clustering_Diarizer_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Clustering_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ - diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.speaker_embeddings.model_path=/home/TestData/an4_diarizer/spkr.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.speaker_embeddings.parameters.window_length_in_sec=1.5 \ - diarizer.speaker_embeddings.parameters.shift_length_in_sec=0.75 \ - diarizer.speaker_embeddings.parameters.multiscale_weights=null \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=/tmp/clustering_diarizer_results - - L2_Speaker_dev_run_Neural_Diarizer_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Neural_Diarizer_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ - 
diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ - diarizer.msdd_model.model_path=/home/TestData/an4_diarizer/diar_msdd_telephonic.nemo \ - diarizer.speaker_embeddings.parameters.save_embeddings=True \ - diarizer.vad.model_path=/home/TestData/an4_diarizer/MatchboxNet_VAD_3x2.nemo \ - diarizer.out_dir=/tmp/neural_diarizer_results - - L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python tools/speech_data_simulator/multispeaker_simulator.py \ - --config-path=conf --config-name=data_simulator.yaml \ - data_simulator.random_seed=42 \ - data_simulator.manifest_filepath=/home/TestData/LibriSpeechShort/dev-clean-align-short.json \ - data_simulator.outputs.output_dir=/tmp/test_simulator \ - data_simulator.session_config.num_sessions=2 \ - data_simulator.session_config.session_length=60 - - # L2: ASR Multi-dataloader dev run - L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_ctc/speech_to_text_ctc.py \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - exp_manager.exp_dir=/tmp/speech_to_text_results - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/speech_classification/speech_to_label.py \ - model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ - model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - +trainer.num_sanity_val_steps=1 \ - model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ - ~model.preprocessor.window_size \ - ~model.preprocessor.window_stride \ - ~model.preprocessor.window \ - ~model.preprocessor.n_mels \ - ~model.preprocessor.n_mfcc \ - ~model.preprocessor.n_fft \ - exp_manager.exp_dir=/tmp/speech_to_label_results - - # L2: ASR Adapters - L2_ASR_Adapters_Linear_Adapters: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_Linear_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: 
self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="an4" \ - model.adapter.linear.in_features=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_adapters_results - - L2_ASR_Adapters_RelPos_MHA_Adapters: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_ASR_Adapters_RelPos_MHA_Adapters') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/asr/asr_adapters/train_asr_adapter.py \ - model.pretrained_model="stt_en_conformer_ctc_small" \ - model.adapter.adapter_name="encoder:an4" \ - model.adapter.adapter_type="tiny_attn" \ - model.adapter.tiny_attn.n_feat=176 \ - model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ - model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.max_steps=5 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=True \ - exp_manager.exp_dir=/tmp/speech_to_text_adapters_mha_results - - # L2: OOMptimizer - L2_Speech_Estimate_Duration_Bins: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Estimate_Duration_Bins') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - set -x - # 1D buckets [SSL, CTC] - python scripts/speech_recognition/estimate_duration_bins.py \ - /home/TestData/an4_dataset/an4_train.json \ - --buckets 5 - # 2D buckets [CTC, RNNT, TDT] / with tokenizer - python scripts/speech_recognition/estimate_duration_bins_2d.py \ - /home/TestData/an4_dataset/an4_train_lang.json \ - --tokenizer /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - --buckets 5 \ - --sub-buckets 2 - # TODO(pzelasko): Figure out how to quote the value in the test properly for CI to accept it... 
- # 2D buckets with prompt [AED/Canary, SpeechLM] / with aggregate tokenizer + prompt format - # python scripts/speech_recognition/estimate_duration_bins_2d.py \ - # /home/TestData/an4_dataset/an4_train_lang.json \ - # --tokenizer /home/TestData/asr_tokenizers/canary/canary_spl_tokenizer_v32/tokenizer.model \ - # /home/TestData/asr_tokenizers/canary/en/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - # /home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4/tokenizer.model \ - # --langs spl_tokens en es \ - # --prompt-format canary \ - # --prompt '[{"role":"user","slots":{"source_lang":"en","target_lang":"en","task":"asr","pnc":"yes"}}]' \ - # --buckets 5 \ - # --sub-buckets 2 - - # L2: OOMptimizer - L2_Speech_Batch_Size_OOMptimizer: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - # 1D bucketing - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ - -m nemo.collections.asr.models.EncDecCTCModelBPE \ - -b "[5.0,10.0]" - # 2D bucketing - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_ctc_bpe.yaml \ - -m nemo.collections.asr.models.EncDecCTCModelBPE \ - -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" - - # L2: OOMptimizer Canary (has a different batch schema) - L2_Speech_Batch_Size_OOMptimizer_Canary: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Batch_Size_OOMptimizer_Canary') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python scripts/speech_recognition/oomptimizer.py \ - -c /home/TestData/oomptimizer/fast-conformer_aed.yaml \ - -m nemo.collections.asr.models.EncDecMultiTaskModel \ - -b "[[5.0,30],[5.0,45],[10.0,57],[10.0,71]]" - - # L2: Speech Transcription - L2_Speech_Transcription_Speech_to_Text_Transcribe: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Speech_to_Text_Transcribe') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - pretrained_name="QuartzNet15x5Base-En" \ - audio_dir="/home/TestData/an4_transcribe/test_subset/" \ - output_filename="/tmp/stt_test_res.json" \ - amp=true - - # L2: Speech Transcription - L2_Speech_Transcription_Canary_Transcribe_Full_Manifest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Full_Manifest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10-canary-fields.json \ - output_filename=/tmp/preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium - AFTER_SCRIPT: | - rm -rf /tmp/preds.json transcribe.log - - 
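Many of the SCRIPT blocks in this file, including the transcription and finetuning jobs around here, lean on Hydra's command-line override grammar: key=value changes an existing config entry, +key=value adds one that the base config lacks, ++key=value adds or overrides it regardless, and ~key deletes it. A small, purely illustrative sketch (the script name and keys are placeholders, not taken from TestData):

  SCRIPT: |
    #   key=value    override an existing entry
    #   +key=value   add a key missing from the base config
    #   ++key=value  add or override, whichever applies
    #   ~key         delete an entry
    python some_script.py \
      trainer.devices=1 \
      +trainer.fast_dev_run=True \
      ++model.tokenizer.type=bpe \
      ~model.test_ds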
L2_Speech_Transcription_Canary_Transcribe_With_Prompt: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_With_Prompt') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - dataset_manifest=/home/TestData/asr/canary/dev-other-wav-10.json \ - output_filename=preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium \ - +prompt.source_lang="en" \ - +prompt.target_lang="en" \ - +prompt.task="asr" \ - +prompt.pnc="no" - AFTER_SCRIPT: | - rm -rf preds.json transcribe.log - - L2_Speech_Transcription_Canary_Transcribe_Audio_Dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Speech_Transcription_Canary_Transcribe_Audio_Dir') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/asr/transcribe_speech.py \ - audio_dir=/home/TestData/asr/canary/dev-other-wav \ - output_filename=preds.json \ - batch_size=10 \ - pretrained_name=nvidia/canary-1b \ - num_workers=0 \ - amp=false \ - compute_dtype=bfloat16 \ - matmul_precision=medium - AFTER_SCRIPT: | - rm -rf preds.json - - - # L2: Transducer alignment - OPTIONAL_L2_Transducer_alignment_Running_pytest: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads - IS_OPTIONAL: true - - # L2: Segmentation Tool - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - --MODEL_NAME_OR_PATH="stt_en_citrinet_512_gamma_0_25" \ - --DATA_DIR=/home/TestData/ctc_segmentation/eng \ - --OUTPUT_DIR=/tmp/ctc_seg_en/output${TIME} \ - --LANGUAGE=en \ - --USE_NEMO_NORMALIZATION="TRUE" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/eng/eng_valid_segments_1.7.txt \ - -g /tmp/ctc_seg_en/output${TIME}/verified_segments/nv_test_segments.txt; - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd tools/ctc_segmentation && \ - TIME=`date +"%Y-%m-%d-%T"` && \ - /bin/bash run_segmentation.sh \ - 
--MODEL_NAME_OR_PATH=/home/TestData/ctc_segmentation/QuartzNet15x5-Ru-e512-wer14.45.nemo \ - --DATA_DIR=/home/TestData/ctc_segmentation/ru \ - --OUTPUT_DIR=/tmp/ctc_seg_ru/output${TIME} \ - --LANGUAGE=ru \ - --ADDITIONAL_SPLIT_SYMBOLS=";" && \ - python /home/TestData/ctc_segmentation/verify_alignment.py \ - -r /home/TestData/ctc_segmentation/ru/valid_ru_segments_1.7.txt \ - -g /tmp/ctc_seg_ru/output${TIME}/verified_segments/ru_segments.txt; - - # L2: G2P Models - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ - python g2p_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/g2p.json \ - validation_manifest=/home/TestData/g2p/g2p.json \ - model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ - trainer.max_epochs=1 \ - model.max_source_len=64 \ - trainer.devices=1 \ - do_training=True \ - do_testing=True \ - exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test \ - --config-name=g2p_conformer_ctc && \ - python g2p_inference.py \ - pretrained_model=${OUTPUT_DIR_CONFORMER}/G2P-Conformer-CTC/test/checkpoints/G2P-Conformer-CTC.nemo \ - manifest_filepath=/home/TestData/g2p/g2p.json \ - phoneme_field=text - - # TODO: pleasefixme @redoctopus - # - name: ByT5G2P training, evaluation and inference - # run: | - # cd examples/tts/g2p && \ - # TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \ - # python g2p_train_and_evaluate.py \ - # train_manifest=/home/TestData/g2p/g2p.json \ - # validation_manifest=/home/TestData/g2p/g2p.json \ - # model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ - # trainer.max_epochs=1 \ - # model.max_source_len=64 \ - # trainer.devices=1 \ - # do_training=True \ - # do_testing=True \ - # exp_manager.exp_dir=${OUTPUT_DIR_T5} \ - # +exp_manager.use_datetime_version=False\ - # +exp_manager.version=test && \ - # python g2p_inference.py \ - # pretrained_model=${OUTPUT_DIR_T5}/T5G2P/test/checkpoints/T5G2P.nemo \ - # manifest_filepath=/home/TestData/g2p/g2p.json \ - # phoneme_field=text - # } - # } - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/tts/g2p && \ - TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ - python g2p_heteronym_classification_train_and_evaluate.py \ - train_manifest=/home/TestData/g2p/manifest.json \ - validation_manifest=/home/TestData/g2p/manifest.json \ - test_manifest=/home/TestData/g2p/manifest.json \ - model.wordids=/home/TestData/g2p/wordids.tsv \ - trainer.max_epochs=1 \ - model.max_seq_length=64 \ - do_training=True \ - do_testing=True \ - 
exp_manager.exp_dir=${OUTPUT_DIR} \ - +exp_manager.use_datetime_version=False\ - +exp_manager.version=test && \ - python g2p_heteronym_classification_inference.py \ - manifest=/home/TestData/g2p/manifest.json \ - pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ - output_manifest=preds.json - - # L2: Duplex Text Normalization - L2_Duplex_Text_Normalization_with_Tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Duplex_Text_Normalization_with_Tarred_dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/duplex_text_normalization && \ - python duplex_text_normalization_train.py \ - data.validation_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv \ - mode=tn \ - lang=en \ - tagger_model.do_training=false \ - decoder_model.transformer=t5-small \ - data.validation_ds.batch_size=2 \ - data.train_ds.use_cache=false \ - data.validation_ds.use_cache=false \ - data.test_ds.batch_size=2 \ - data.train_ds.decoder_data_augmentation=false \ - data.train_ds.num_workers=2 \ - decoder_trainer.devices=[0,1] \ - decoder_trainer.accelerator="gpu" \ - data.train_ds.use_tarred_dataset=true \ - +decoder_trainer.fast_dev_run=true \ - decoder_exp_manager.create_checkpoint_callback=false \ - data.train_ds.tar_metadata_file=/home/TestData/nlp/duplex_text_norm/tarred_small/metadata.json \ - data.test_ds.use_cache=false \ - data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv - - # L2: Intent and Slot Classification Tasks - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/retail \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints - AFTER_SCRIPT: | - rm -rf checkpoints - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/intent_slot_classification && \ - python multi_label_intent_slot_classification.py \ - model.data_dir=/home/TestData/nlp/new_multiatis \ - model.validation_ds.prefix=dev \ - model.test_ds.prefix=dev \ - trainer.devices=1 \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=checkpoints2 - AFTER_SCRIPT: | - rm -rf checkpoints2 - - # TODO: add when megatron-bert is supported again - # stage("L2: Model Parallel Size 2 Megatron Text Classification") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd 
examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Autoresume") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python text_classification_with_bert.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # trainer.max_epochs=1 \ - # +trainer.fast_dev_run=true \ - # model.dataset.num_classes=6 \ - # model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - # model.train_ds.batch_size=4 \ - # model.language_model.pretrained_model_name=megatron-bert-uncased \ - # model.language_model.config_file=/home/TestData/nlp/mp_2_bert_toy/config.json \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \ - # model.nemo_path=null \ - # ~model.infer_samples \ - # +exp_manager.explicit_log_dir=/home/TestData/nlp/mp_autoresume \ - # +exp_manager.resume_if_exists=true - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Evaluation from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/text_classification && \ - # python model_parallel_text_classification_evaluation.py \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # trainer.num_nodes=1 \ - # model.dataset.num_classes=6 \ - # model.test_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - # model.nemo_path=/home/TestData/nlp/mp_2_nemo/retail_text_class_350M.nemo \ - # exp_manager=null - # } - # } - - # stage("L2: Model Parallel Size 2 Megatron Train from .nemo") { - # when { - # anyOf{ - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # steps{ - # cd examples/nlp/token_classification && \ - # python token_classification_train.py \ - # pretrained_model=/home/TestData/nlp/mp_2_nemo/ner_350M.nemo \ - # model.dataset.data_dir=/home/TestData/nlp/ner/ \ - # model.train_ds.batch_size=2 \ - # model.dataset.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # model.dataset.class_balancing="weighted_loss" \ - # exp_manager=null - # } - # } - - - # L2: Parallel NLP Examples 2 - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python 
token_classification_train.py \ - pretrained_model=ner_en_bert \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.train_ds.batch_size=2 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.class_balancing="weighted_loss" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - pretrained_model=punctuation_en_bert \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager.exp_dir=null; - - rm -rf "${data_dir}" - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/token_classification && \ - python token_classification_train.py \ - model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name="TurkuNLP/bert-base-finnish-cased-v1" \ - exp_manager.exp_dir=null - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_evaluate.py \ - model.dataset.data_dir=/home/TestData/nlp/ner/ \ - model.dataset.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/NER_Model_with_BERT_base_uncased.nemo - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - 
+do_training=false \ - +do_testing=true \ - model.test_ds.ds_item="${data_dir}" \ - ~model.train_ds \ - ~model.validation_ds \ - +model.test_ds.use_cache=false \ - pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; - - rm -rf "${data_dir}" - - - # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed - L2_Pretraining_BERT_pretraining_from_Text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_pretraining_from_Text') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=true \ - model.train_ds.data_file=/home/TestData/nlp/wikitext-2/train.txt \ - model.train_ds.batch_size=32 \ - model.validation_ds.data_file=/home/TestData/nlp/wikitext-2/valid.txt \ - model.validation_ds.batch_size=32 \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/bert_3200.json \ - model.optim.lr=0.01 \ - model.optim.sched.warmup_ratio=0.1 \ - model.tokenizer.tokenizer_name=sentencepiece \ - model.tokenizer.tokenizer_model=/home/TestData/nlp/wikitext-2/tokenizer_bpe_v3193/tokenizer.model \ - model.mask_prob=0.15 \ - model.short_seq_prob=0.1 \ - exp_manager.exp_dir=/tmp/PretrainingBERTFromText; -# AFTER_SCRIPT: | -# rm -f /home/TestData/nlp/wikitext-2/*.pkl - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromText - - L2_Pretraining_BERT_from_Preprocessed: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Pretraining_BERT_from_Preprocessed') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/language_modeling && \ - python bert_pretraining.py \ - --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - trainer.precision=16 \ - +trainer.fast_dev_run=false \ - +trainer.max_epochs=1 \ - +trainer.limit_val_batches=0 \ - +trainer.limit_train_batches=1 \ - model.train_ds.data_file=/home/TestData/nlp/wiki_book_mini/training \ - model.train_ds.batch_size=8 \ - model.language_model.lm_checkpoint=/home/TestData/nlp/bert_ckpts/nemo1.0/bert_base_uncased_mlm_final_1074591_nemo1.0.pt \ - model.language_model.config_file=/home/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json \ - model.optim.lr=0.875e-4 \ - model.optim.weight_decay=0.01 \ - model.optim.sched.warmup_ratio=0.01 \ - exp_manager.exp_dir=PretrainingBERTFromPreprocessed \ - exp_manager.create_checkpoint_callback=False \ - - #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - - - # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 - # is in the release container - # L2: NMT Attention is All You Need Training - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=false \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=2 \ - +trainer.limit_val_batches=1 \ - +trainer.max_steps=2 \ - trainer.precision=16 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true - - python examples/nlp/machine_translation/enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.encoder.inner_size=256 \ - model.decoder.num_layers=1 \ - model.decoder.hidden_size=64 \ - model.decoder.inner_size=256 \ - +model.optim.capturable=True \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.explicit_log_dir=examples/nlp/machine_translation/nmt_results \ - +exp_manager.create_checkpoint_callback=true \ - +exp_manager.resume_if_exists=True - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/nmt_results - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - 
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.encoder.pre_ln=true \ - model.decoder.pre_ln=true \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_testing=true \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.test_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - +trainer.limit_test_batches=2 \ - exp_manager=null - - # L2: NMT Attention is All You Need Inference - L2_NMT_Attention_is_All_You_Need_Inference: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Inference') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/machine_translation && \ - python nmt_transformer_infer.py \ - --model=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - --srctext=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.test.src \ - --tgtout=/home/TestData/nlp/nmt/toy_data/out.txt \ - --target_lang en \ - --source_lang de - - # L2: NMT Attention is All You Need Finetuning - L2_NMT_Attention_is_All_You_Need_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Attention_is_All_You_Need_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python 
enc_dec_nmt_finetune.py \ - model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=1 \ - ~trainer.max_epochs \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - +trainer.val_check_interval=10 \ - +trainer.limit_val_batches=1 \ - +trainer.limit_test_batches=1 \ - +trainer.max_steps=10 \ - +exp_manager.exp_dir=examples/nlp/machine_translation/nmt_finetune \ - +exp_manager.create_checkpoint_callback=True \ - +exp_manager.checkpoint_callback_params.monitor=val_sacreBLEU \ - +exp_manager.checkpoint_callback_params.mode=max \ - +exp_manager.checkpoint_callback_params.save_best_model=true - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/nmt_finetune - - # L2: NMT Tarred Dataset Creation - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/machine_translation && \ - python enc_dec_nmt.py \ - --config-path=conf \ - --config-name=aayn_base \ - do_training=false \ - model.preproc_out_dir=$PWD/preproc_out_dir \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.n_preproc_jobs=2 \ - model.train_ds.lines_per_dataset_fragment=500 \ - model.train_ds.num_batches_per_tarfile=10 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.encoder_tokenizer.vocab_size=2000 \ - model.decoder_tokenizer.vocab_size=2000 \ - ~model.test_ds \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/machine_translation && \ - python create_tarred_parallel_dataset.py \ - --src_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - --tgt_fname /home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - --out_dir $PWD/out_dir \ - --encoder_tokenizer_vocab_size=2000 \ - --decoder_tokenizer_vocab_size=2000 \ - --tokens_in_batch=1000 \ - --lines_per_dataset_fragment=500 \ - --num_batches_per_tarfile=10 \ - --n_preproc_jobs=2 - - L2_Megatron_NMT_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_NMT_Training_TP2') 
|| needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python examples/nlp/machine_translation/megatron_nmt_training.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/machine_translation/megatron_nmt_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - model.train_ds.num_workers=1 \ - model.validation_ds.num_workers=1 \ - ~model.test_ds \ - model.train_ds.dataset_type=text_memmap \ - model.encoder_tokenizer.library=sentencepiece \ - model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - model.decoder_tokenizer.library=sentencepiece \ - model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model - AFTER_SCRIPT: | - rm -rf examples/nlp/machine_translation/megatron_nmt_results - - L2_Megatron_BART_Perceiver_MIM_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Perceiver_MIM_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - # Change val_check_interval to 1 for resume as the len(dataloder) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error - # if val_check_interval > len(dataloder: https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run() - python 
examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/megatron_mim_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.arch=perceiver \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="swiglu" \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="swiglu" \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.micro_batch_size=2 \ - model.global_batch_size=4 \ - model.data.data_impl=text_mmap \ - model.data.data_prefix=[1.0,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src] \ - model.data.splits_string="\"800,100,100\"" \ - model.data.whole_word_masking=False \ - model.tokenizer.library=sentencepiece \ - model.tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \ - ++model.hiddens.enc_output_name=z \ - ++model.hiddens.transform.q_z_given_x.cls_name=cond_gaussian \ - ++model.hiddens.transform.q_z_given_x.hidden_size=64 \ - ++model.hiddens.loss.mim.cls_name=a_mim \ - ++model.hiddens.loss.mim.loss_weight=0.5 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/megatron_mim_results - - # stage("L2: NMT Bottleneck Fallback") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("L2: seq2seq (no bottleneck)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=seq2seq \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref \ - # model.validation_ds.src_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.src] \ - # model.validation_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # 
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null \ - # } - # } - # } - # } - # stage("L2: NMT Bottleneck Architecture") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("Bridge Encoder (identity)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=bridge \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=identity \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("Perceiver Encoder (params)") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=nll \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # 
} - # } - # } - # stage("L2: NMT Bottleneck LVM") { - # when { - # anyOf { - # branch "main" - # changeRequest target: "main" - # } - # } - # failFast true - # parallel { - # stage("VAE") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=vae \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # stage("MIM") { - # steps { - # cd examples/nlp/machine_translation && \ - # enc_dec_nmt-bottleneck.py \ - # --config-path=conf \ - # --config-name=aayn_bottleneck \ - # do_testing=true \ - # model.model_type=mim \ - # model.encoder.arch=perceiver \ - # model.encoder.hidden_steps=1 \ - # model.encoder.hidden_blocks=1 \ - # model.encoder.hidden_init_method=params \ - # model.encoder.hidden_size=64 \ - # model.encoder.inner_size=128 \ - # model.encoder.num_attention_heads=2 \ - # model.encoder.num_layers=2 \ - # model.decoder.hidden_size=64 \ - # model.decoder.inner_size=128 \ - # model.decoder.num_attention_heads=2 \ - # model.decoder.num_layers=2 \ - # model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ - # model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ - # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=1 \ - # trainer.accelerator="gpu" \ - # +trainer.fast_dev_run=true \ - # +trainer.limit_test_batches=2 \ - # exp_manager=null - # } - # } - # } - # } - - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - 
trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_Bert_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - 
model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method="block" \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method="block" \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings - - L2_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=10 - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.num_nodes=1 \ - trainer.devices=2 \ - trainer.precision=bf16 \ - trainer.accelerator=gpu \ - model.data.data_prefix=["none"] \ - 
exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ - model.mcore_gpt=True \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ - model.data.num_workers=4 \ - model.micro_batch_size=1 \ - model.data.shuffle_documents=False \ - trainer.val_check_interval=30 \ - +trainer.num_sanity_val_steps=0 \ - model.init_method_std=0.023 \ - model.optim.lr=6.0e-4 \ - model.megatron_amp_O2=True \ - model.data.splits_string="\"98,2,0\"" \ - model.data.dataloader_type=cyclic \ - trainer.max_steps=20 - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/mcore_retro_results - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - - python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.accumulate_grad_batches=1 \ - trainer.limit_val_batches=2 \ - exp_manager.resume_if_exists=True \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ - model.data.data_prefix= \ - model.data.knn_index= \ - model.data.retrieval_prefix= \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.chunk_size=32 \ - model.enc_num_layers=2 \ - model.dec_num_layers=2 \ - model.enc_cross_attention=[1] \ - model.dec_cross_attention=[1] \ - +model.data.mock=True - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/retro_legacy_results - - # L2_Megatron_RETRO_muTransfer_Pretraining_Performance: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # 
--shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - # trainer.devices=2 \ - # trainer.num_nodes=1 \ - # trainer.accelerator=gpu \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=100 \ - # trainer.log_every_n_steps=1 \ - # trainer.precision=16 \ - # trainer.val_check_interval=100 \ - # trainer.limit_val_batches=0 \ - # trainer.gradient_clip_val=1.0 \ - # +trainer.num_sanity_val_steps=0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ - # +exp_manager.version=smalltest \ - # model.data.neighbors=2 \ - # model.megatron_amp_O2=False \ - # model.apply_query_key_layer_scaling=False \ - # model.tensor_model_parallel_size=1 \ - # model.optim.name=muadamw \ - # model.optim.weight_decay=0.1 \ - # model.optim.betas=[0.9,0.95] \ - # model.optim.lr=6e-4 \ - # model.optim.sched.warmup_steps=1000 \ - # model.optim.sched.constant_steps=0 \ - # model.optim.sched.min_lr=6e-5 \ - # model.add_position_embedding=False \ - # model.enc_num_layers=2 \ - # model.dec_num_layers=6 \ - # model.enc_cross_attention=[0] \ - # model.dec_cross_attention=[3,5] \ - # model.hidden_size=96 \ - # model.ffn_hidden_size=384 \ - # model.init_method_std=0.023 \ - # model.num_attention_heads=12 \ - # model.max_position_embeddings=1024 \ - # model.encoder_seq_length=1024 \ - # model.tokenizer.library=megatron \ - # model.tokenizer.type=GPT2BPETokenizer \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_retro/gpt2-merges.txt \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_retro/gpt2-vocab.json \ - # model.data.data_prefix=[/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document] \ - # model.data.knn_index=[/home/TestData/nlp/megatron_retro/knn2_map_wiki_test.idx] \ - # model.data.retrieval_prefix=/home/TestData/nlp/megatron_retro/retro_wiki_test_text_document \ - # model.data.index_mapping_dir=/home/TestData/nlp/megatron_retro \ - # model.data.num_workers=8 \ - # model.micro_batch_size=8 \ - # model.normalization=rmsnorm \ - # model.transformer_block_type=pre_ln \ - # model.bias_activation_fusion=True \ - # model.bias_dropout_add_fusion=False \ - # model.masked_softmax_fusion=True \ - # model.hidden_dropout=0 \ - # model.attention_dropout=0 \ - # model.fp32_residual_connection=True \ - # model.shape_file=/home/TestData/nlp/megatron_retro/o1_rel_shape_info_tiny.yaml - - # python -c "import pandas as pd - # import pathlib - # from pandas.testing import assert_frame_equal - # from tensorboard.backend.event_processing.event_accumulator import EventAccumulator - # import torch - # if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()): - # import sys - # sys.exit(0) - # event_file = list(pathlib.Path("examples/nlp/language_modeling/retro_results/megatron_retro/smalltest").glob("events.out.tfevents*"))[0] - # ea = EventAccumulator(str(event_file)).Reload() - # vals = [] - # for i in ea.Scalars("reduced_train_loss"): - # vals.append(i.value) - # training_curve = pd.DataFrame({"loss": vals}) - # gt_curve = pd.read_csv("/home/TestData/nlp/megatron_retro/expected_learning_curve.csv") - # assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)" - - # rm -rf examples/nlp/language_modeling/retro_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - 
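The commented-out muTransfer job above ends with an inline `python -c` snippet that compares the logged training loss curve against a stored reference CSV. Inline it is hard to read (and the nested double quotes around "A100" would likely not survive the surrounding shell quoting as written), so here is a minimal standalone sketch of the same check, using the experiment directory, the `reduced_train_loss` scalar tag, and the expected-curve CSV taken from that snippet; it is illustrative only and not part of the workflow file.

# check_retro_loss_curve.py -- standalone sketch of the commented-out regression check above.
import pathlib
import sys

import pandas as pd
import torch
from pandas.testing import assert_frame_equal
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

EXP_DIR = "examples/nlp/language_modeling/retro_results/megatron_retro/smalltest"
EXPECTED_CSV = "/home/TestData/nlp/megatron_retro/expected_learning_curve.csv"

def main() -> None:
    # The reference curve was recorded on A100; skip the comparison on any other GPU.
    if not (torch.cuda.is_available() and "A100" in torch.cuda.get_device_name()):
        sys.exit(0)

    # Pick up the TensorBoard event file written by exp_manager for this run.
    event_file = list(pathlib.Path(EXP_DIR).glob("events.out.tfevents*"))[0]
    ea = EventAccumulator(str(event_file)).Reload()

    # Collect the reduced training loss at every logged step into a one-column frame.
    vals = [scalar.value for scalar in ea.Scalars("reduced_train_loss")]
    training_curve = pd.DataFrame({"loss": vals})

    # Compare against the stored reference curve within a small tolerance.
    gt_curve = pd.read_csv(EXPECTED_CSV)
    assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)

if __name__ == "__main__":
    main()

The rtol/atol values mirror the original assert_frame_equal call, so small numerical drift between runs does not fail the comparison.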
L2_RAG_Pipeline_Indexing: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Indexing') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/rag/rag_indexing.py \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - trainer.precision="bf16-mixed" \ - indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ - indexing.embedder.embed_batch_size=128 \ - indexing.data.data_path="/home/TestData/nlp/rag_pipeline/testing_data/corpus_data/sample_data" \ - indexing.data.chunk_size=256 \ - indexing.data.chunk_overlap=10 \ - indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" - - L2_RAG_Pipeline_Generating: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_RAG_Pipeline_Generating') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/rag/rag_generating.py \ - trainer.devices=1 \ - trainer.precision="bf16-mixed" \ - indexing.embedder.model_path="/home/TestData/nlp/rag_pipeline/testing_models/embedders/sbert_nemo.nemo" \ - indexing.index_path="/home/TestData/nlp/rag_pipeline/testing_data/saved_index/sample_index" \ - generating.llm.model_path="/home/TestData/nlp/rag_pipeline/testing_models/llms/megatron_gpt_125m.nemo" \ - generating.inference.tokens_to_generate=50 \ - generating.inference.greedy=False \ - generating.inference.temperature=1.0 \ - generating.query="Which art schools did I apply to?"
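Every L2_* job in this file is gated by the same `if:` expression: the job runs when its id appears in the parsed `test_to_run` output, or when the `all` output is 'true'. As a quick mental model only (this Python is not part of the workflow, and the function name and example values below are illustrative), the selection logic amounts to:

import json

def job_selected(job_id: str, test_to_run_json: str, all_flag: str) -> bool:
    # Mirrors: contains(fromJSON(outputs.test_to_run), '<job_id>') || outputs.all == 'true'
    requested = json.loads(test_to_run_json)
    return job_id in requested or all_flag == "true"

# Example: a run that requested only the two RAG pipeline tests.
requested = '["L2_RAG_Pipeline_Indexing", "L2_RAG_Pipeline_Generating"]'
assert job_selected("L2_RAG_Pipeline_Generating", requested, "false")
assert not job_selected("L2_BioMegatron_Bert_NER_Task", requested, "false")
assert job_selected("L2_BioMegatron_Bert_NER_Task", requested, "true")  # the 'all' flag forces every job to run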
- - L2_BioMegatron_Bert_NER_Task: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_BioMegatron_Bert_NER_Task') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/token_classification/token_classification_train.py \ - exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ - trainer.max_epochs=1 \ - model.dataset.data_dir=/home/TestData/nlp/ner \ - model.language_model.pretrained_model_name=biomegatron345m_biovocab_30k_cased \ - model.tokenizer.tokenizer_name=null - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/token_classification_results - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - # This is to improve p2p overlap on H100 - export NVTE_FWD_LAYERNORM_SM_MARGIN=8 - export NVTE_BWD_LAYERNORM_SM_MARGIN=8 - export TORCH_NCCL_AVOID_RECORD_STREAMS=1 - export NCCL_MIN_NCHANNELS=4 - # TP overlap is not supported in docker environment - #NVTE_UB_SPLIT_RS: 0 - #NVTE_UB_ATOMIC_GEMM_RS: 1 - #NVTE_RS_STRIDED_ATOMIC: 1 - #NVTE_UB_FP8_RS: 1 - # Increase p2p chunksize to 2MB - export NCCL_P2P_NET_CHUNKSIZE=2097152 - # Disable gc when switching to/from validation steps - export NEMO_MANUAL_GC_IN_VALIDATION=0 - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=2 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping - - python 
examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/examples_gpt_pretrain_results_te_autocast \ - exp_manager.resume_if_exists=True \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.tensor_model_parallel_size=2 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=2 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=/tmp/examples_gpt_pretrain_results_te_autocast/gpt_mapping - - - L2_Megatron_GPT_Skip_Train: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.skip_train=True \ - model.tensor_model_parallel_size=2 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[] \ - model.data.data_impl=mock \ - model.dist_ckpt_format=torch_dist - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - 
trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - # This test requires Ampere but some of the test GPUs are Volta - # Need to add a check for compute capability before uncommenting this test - # - name: L2: Megatron GPT with Rope Pretraining using Flash Attention and Resume Training TP=2 - # when { - # anyOf { - # branch 
main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=2 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=3 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=1 \ - # model.optim.sched.constant_steps=1 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # model.use_flash_attention=True " - # # commented out to save time on github ci @adithyare - # # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # # trainer.devices=2 \ - # # trainer.accelerator=gpu \ - # # trainer.log_every_n_steps=1 \ - # # trainer.val_check_interval=2 \ - # # trainer.limit_val_batches=1 \ - # # trainer.accumulate_grad_batches=1 \ - # # trainer.max_steps=6 \ - # # trainer.precision=16 \ - # # trainer.gradient_clip_val=1.0 \ - # # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # # exp_manager.resume_if_exists=True \ - # # model.tensor_model_parallel_size=2 \ - # # model.optim.name=fused_adam \ - # # model.optim.lr=2e-4 \ - # # model.optim.sched.warmup_steps=2 \ - # # model.optim.sched.constant_steps=2 \ - # # model.optim.sched.min_lr=8e-5 \ - # # model.max_position_embeddings=128 \ - # # model.encoder_seq_length=128 \ - # # model.data.seq_length=128 \ - # # model.position_embedding_type=rope \ - # # model.rotary_percentage=0.5 \ - # # model.normalization=rmsnorm \ - # # model.bias=False \ - # # model.bias_activation_fusion=False \ - # # model.bias_dropout_add_fusion=False \ - # # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # # model.num_layers=8 \ - # # model.hidden_size=256 \ - # # model.num_attention_heads=8 \ - # # model.activations_checkpoint_method=block \ - # # model.activations_checkpoint_granularity=full \ - # # model.activations_checkpoint_num_layers=1 \ - # # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # # 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings \ - # # model.use_flash_attention=True" - # rm -rf examples/nlp/language_modeling/gpt_pretrain_results" - # rm -rf examples/nlp/language_modeling/gpt_index_mappings" - # } - # } - - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=3 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.reset_lr=True \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.checkpoint_callback_params.save_last_n_optim_states=1 \ - model.dist_ckpt_format="torch_dist" \ - model.tensor_model_parallel_size=2 \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - 
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - 
model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=bf16 \ - trainer.gradient_clip_val=1.0 \ - model.mcore_gpt=True \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - ++model.transformer_engine=True \ - ++model.fp8=True \ - ++model.fp8_hybrid=True \ - ++model.fp8_amax_history_len=1024 \ - ++model.fp8_amax_compute_algo=max \ - ++model.reduce_amax=True \ - ++model.use_te_rng_tracker=True \ - ++model.name=megatron_gpt_full_te_layer_autocast \ - model.ub_tp_comm_overlap=False \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.optim.name=distributed_fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.activation=fast-swiglu \ - model.bias_activation_fusion=False \ - model.hidden_dropout=0.0 \ - model.attention_dropout=0.0 \ - model.transformer_block_type=normformer \ - model.headscale=True \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.validation_drop_last=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124') || 
needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - mkdir examples/llm/auto_configurator/auto_conf_logs - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=1 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=2 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --run_number=3 - - python examples/llm/auto_configurator/auto_config.py \ - --logs_dir=/workspace/examples/llm/auto_configurator/auto_conf_logs \ - --data_path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --tokenizer_path=/home/TestData/nlp/gpt2_tokenizer \ - --get_results - AFTER_SCRIPT: | - rm -rf examples/llm/auto_configurator/auto_conf_logs - IS_OPTIONAL: true - - L2_Megatron_GPT_Finetuning_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/gpt_finetuning_pp2_megatron \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=/tmp/gpt_finetuning_pp2_megatron \ - model.pipeline_model_parallel_size=2 \ - 
model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.peft.peft_scheme=null \ - model.data.train_ds.micro_batch_size=1 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl,/home/TestData/nlp/megatron_sft/trec.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[0.3,0.7] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.names=[quarel] \ - model.data.validation_ds.micro_batch_size=1 \ - model.data.validation_ds.global_batch_size=1 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - L2_Megatron_GPT_Finetuning_StarCoder_PP1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Finetuning_StarCoder_PP1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=bf16 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=/tmp/gpt_sft_results_starcoder_pp1 \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - - L2_Megatron_GPT_Reranker: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Reranker') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \ - exp_manager.exp_dir="/tmp/gpt_reranker_workdir/" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix="/home/TestData/nlp/megatron_ir/working_dir/val_embs" \ - 
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - L2_Megatron_GPT_Embedding: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Embedding') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ - exp_manager.exp_dir="/tmp/gpt_embedding_workdir/" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.max_epochs=null \ - trainer.max_steps=20 \ - trainer.val_check_interval=10 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ - model.data.validation_ds.write_embeddings_to_file=True \ - model.data.validation_ds.output_file_path_prefix="/tmp/gpt_embedding_workdir/val_embs/" \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] - - - python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - model.restore_from_path="/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo" \ - model.peft.restore_from_path="/tmp/gpt_embedding_workdir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo" \ - model.global_batch_size=4 \ - model.micro_batch_size=4 \ - model.peft.lora_tuning.adapter_dim=8 \ - model.data.test_ds.write_embeddings_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/gpt_embedding_workdir/test_embs" \ - model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ - model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] - - L2_Megatron_GPT_PEFT_Lora_PP2_O2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_PP2_O2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_peft_lora_tuning_pp2 \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.megatron_amp_O2=True \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - 
model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.pipeline_model_parallel_size=2 \ - model.tensor_model_parallel_size=1 \ - trainer.devices=2 \ - model.megatron_amp_O2=True \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=["quarel4"] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/nlp_peft_lora_tuning_pp2/out" \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path="/tmp/nlp_peft_lora_tuning_pp2/out.jsonl" - - L2_Megatron_GPT_PEFT_Lora_TP2_O1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2_O1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_peft_lora_tuning_pp2_o1 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.peft_scheme="lora" \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - model.peft.restore_from_path=/tmp/nlp_peft_lora_tuning_pp2_o1/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=["quarel4"] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix="/tmp/nlp_peft_lora_tuning_pp2_o1/out" \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path="/tmp/nlp_peft_lora_tuning_pp2_o1/out.jsonl" - - L2_Megatron_GPT_PEFT_Lora_TP2SP1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_PEFT_Lora_TP2SP1') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-2-h100 - SCRIPT: | - CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - 
trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=bf16 \ - exp_manager.exp_dir=/tmp/nlp_lora_tuning_tp2_sp1 \ - +model.mcore_gpt=True \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.sequence_parallel=True \ - model.megatron_amp_O2=True \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ - +model.fp8=True \ - +model.fp8_params=True \ - +model.fp8_hybrid=True \ - +model.fp8_e4m3=False \ - +model.fp8_interval=1 \ - +model.fp8_margin=0 \ - +model.fp8_amax_history_len=32 \ - +model.fp8_amax_compute_algo=max \ - +model.reduce_amax=False \ - +model.ub_tp_comm_overlap=False \ - +model.tp_comm_overlap_ag=False \ - +model.tp_comm_overlap_rs=False \ - +model.tp_comm_overlap_disable_qkv=True \ - model.peft.peft_scheme="lora" \ - model.peft.lora_tuning.adapter_dim=16 \ - model.peft.lora_tuning.alpha=32 \ - model.peft.lora_tuning.column_init_method="kaiming" \ - +model.peft.lora_tuning.dropout_position="pre" \ - model.peft.lora_tuning.target_modules=["attention"] \ - model.peft.lora_tuning.adapter_dropout=0.1 \ - +model.peft.lora_tuning.a2a_experimental=1 \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - L2_Megatron_GPT_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ - prompts=["How to fix GPU memory? 
A:"] \ - tensor_model_parallel_size=1 \ - inference.tokens_to_generate=32 \ - trainer.precision=32 - - L2_Megatron_GPT_Eval_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Eval_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_eval.py \ - gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ - server=False \ - tensor_model_parallel_size=1 \ - pipeline_model_parallel_size=2 \ - trainer.devices=2 \ - trainer.num_nodes=1 \ - trainer.precision=32 - - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ - model.peft.restore_from_path=null \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_gpt_sft/sample.jsonl] \ - model.data.test_ds.names=[test] \ - model.data.test_ds.global_batch_size=1 \ - model.data.test_ds.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=30 \ - model.data.test_ds.max_seq_length=6000 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=examples/nlp/language_modeling/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=examples/nlp/language_modeling/out.jsonl - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/out.jsonl - - # TODO: Add this test back. Test was failing on CI machines due to HW error - # - name: L2: Megatron GPT Convert from Megatron-LM checkpoing and Eval - # when { - # anyOf { - # branch main - # changeRequest target: main - # } - # } - # failFast true - # - run: | - # python -m torch.distributed.launch --nproc_per_node=2 \ - # examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - # --checkpoint_folder=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700 \ - # --checkpoint_name=model_optim_rng.pt \ - # --hparams_file=/home/TestData/nlp/megatron_gpt/data/gpt/iter_0008700/hparams.yaml \ - # --nemo_file_path=examples/nlp/language_modeling/small_gpt.nemo \ - # --model_type=gpt \ - # --pipeline_model_parallel_size=1 \ - # --gpus_per_node=2 \ - # --tensor_model_parallel_size=2" - # python examples/nlp/language_modeling/megatron_gpt_eval.py \ - # --gpt_model_file=examples/nlp/language_modeling/small_gpt.nemo \ - # --tokens_to_generate=32 \ - # --tensor_model_parallel_size=2 \ - # --prompt=This is a test. 
- # rm examples/nlp/language_modeling/small_gpt.nemo - - # L2_Megatron_Change_Partitions - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 1 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 2 - AFTER_SCRIPT: | - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-reduce.nemo - - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_change_num_partitions.py \ - --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ - --target_file /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo \ - --tensor_model_parallel_size 2 \ - --target_tensor_model_parallel_size 4 \ - --pipeline_model_parallel_size 1 \ - --target_pipeline_model_parallel_size 1 - AFTER_SCRIPT: | - rm /home/TestData/nlp/megatron_gpt/TP2-Temp/test-increase.nemo - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - 
model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=relative \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=fast-swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=null \ - trainer.max_steps=10 \ - trainer.val_check_interval=10 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - 
model.mcore_t5=True \ - model.transformer_engine=True \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.global_batch_size=4 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type="pre_ln" \ - model.decoder.transformer_block_type="pre_ln" \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=null \ - trainer.max_steps=10 \ - trainer.val_check_interval=10 \ - trainer.accumulate_grad_batches=1 \ - trainer.precision=bf16 \ - model.megatron_amp_O2=True \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_t5=True \ - model.transformer_engine=True \ - model.tensor_model_parallel_size=2 \ - model.micro_batch_size=4 \ - model.global_batch_size=4 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.encoder.transformer_block_type="pre_ln" \ - model.decoder.transformer_block_type="pre_ln" \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - 
model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=alibi \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: 
- RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - +model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.masked_softmax_fusion=False \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.position_embedding_type=kerple \ - model.decoder.num_layers=2 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=swiglu \ - model.decoder.masked_softmax_fusion=False \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=pre_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src,.5,/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings \ - model.data.data_impl=text_mmap \ - +model.data.data_impl_kwargs.newline_int=10 \ - 
+model.data.data_impl_kwargs.header_lines=0 \ - +model.data.data_impl_kwargs.workers=null \ - +model.data.data_impl_kwargs.sort_dataset_paths=False \ - model.share_token_embeddings=False \ - model.share_decoder_tokens_head_embeddings=False - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - 
L2_Megatron_T5_w_Mixture_of_Expert_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_w_Mixture_of_Expert_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.pipeline_model_parallel_split_rank=0 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.decoder.num_layers=1 \ - model.encoder.num_moe_experts=4 \ - model.decoder.num_moe_experts=4 \ - model.encoder.moe_frequency=3 \ - model.decoder.moe_frequency=1 \ - model.encoder.hidden_size=64 \ - model.decoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.decoder.num_attention_heads=8 \ - model.decoder.ffn_hidden_size=2048 \ - model.encoder.activation=gelu \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=pre_ln \ - model.decoder.transformer_block_type=post_ln \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - 
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=swiglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.encoder.transformer_block_type=normformer \ - model.encoder.headscale=True \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.decoder.transformer_block_type=normformer \ - model.decoder.headscale=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/t5_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - rm -rf examples/nlp/language_modeling/t5_index_mappings - - L2_Megatron_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." \ - --tensor_model_parallel_size 1 - - L2_Megatron_Core_T5_Eval: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_Eval') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \ - --model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - --prompt "How do I fix my GPU memory issue? I am seeing out of memory." 
\ - --tensor_model_parallel_size 1 - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_BART_Pretraining_and_Resume_Training_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=5 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.seq_length=128 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation="reglu" \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method="block" \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation="reglu" \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method="block" \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.data_prefix="{train:[1.0,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document],test:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document], validation:[/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]}" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 
'L2_Megatron_BART_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - - python examples/nlp/language_modeling/megatron_bart_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bart_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.pipeline_model_parallel_split_rank=1 \ - model.seq_length=256 \ - model.encoder.num_layers=4 \ - model.encoder.hidden_size=64 \ - model.encoder.num_attention_heads=8 \ - model.encoder.activation=geglu \ - model.encoder.bias_activation_fusion=False \ - model.encoder.activations_checkpoint_method=block \ - model.encoder.activations_checkpoint_num_layers=1 \ - model.decoder.num_layers=4 \ - model.decoder.hidden_size=64 \ - model.decoder.num_attention_heads=8 \ - model.decoder.activation=geglu \ - model.decoder.bias_activation_fusion=False \ - model.decoder.activations_checkpoint_method=block \ - model.decoder.activations_checkpoint_num_layers=1 \ - model.data.respect_document_boundaries=False \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bart_pretrain_results - - - L2_Megatron_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - 
++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo \ - model.peft.restore_from_path=/tmp/nlp_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=2 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_t5_lora_tuning_tp2/out.jsonl - - L2_Megatron_Core_T5_PEFT_Lora_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Core_T5_PEFT_Lora_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \ - trainer.devices=2 \ - trainer.log_every_n_steps=1 \ - trainer.max_epochs=9999 \ - trainer.max_steps=3 \ - trainer.val_check_interval=3 \ - ++trainer.limit_val_batches=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \ - model.pipeline_model_parallel_size=1 \ - model.tensor_model_parallel_size=2 \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - model.peft.peft_scheme=lora \ - model.answer_only_loss=True \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.concat_sampling_probabilities=[1.0] \ - model.data.train_ds.num_workers=0 \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.names=[quarel] - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \ - model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \ - model.peft.restore_from_ckpt_name=null \ - model.peft.restore_from_hparams_path=null \ - model.tensor_model_parallel_size=2 \ - trainer.devices=2 \ - 
model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel_4.jsonl] \ - model.data.test_ds.names=[quarel4] \ - model.global_batch_size=1 \ - model.micro_batch_size=1 \ - model.data.test_ds.tokens_to_generate=10 \ - model.data.test_ds.write_predictions_to_file=True \ - model.data.test_ds.output_file_path_prefix=/tmp/nlp_mcore_t5_lora_tuning_tp2/out \ - inference.greedy=True \ - inference.repetition_penalty=1.0 \ - inference.outfile_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/out.jsonl - - # L2: Megatron Mock Data Generation - L2_Megatron_Mock_Data_Generation_MockGPTDataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockGPTDataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=7 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.mcore_gpt=True \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - - L2_Megatron_Mock_Data_Generation_MockT5Dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_Mock_Data_Generation_MockT5Dataset') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_pretraining.py \ - trainer.max_steps=10 \ - trainer.limit_val_batches=3 \ - trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \ - model.data.data_impl=mock \ - model.data.data_prefix=[] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_pretrain_results - - # L2: TTS Fast dev runs 1 - L2_TTS_Fast_dev_runs_1_Tacotron_2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Tacotron_2') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/tts/tacotron2.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.decoder.decoder_rnn_dim=256 \ - model.decoder.attention_rnn_dim=1024 \ - model.decoder.prenet_dim=128 \ - model.postnet.postnet_n_convolutions=3 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs \ - ~trainer.check_val_every_n_epoch - - L2_TTS_Fast_dev_runs_1_WaveGlow: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_WaveGlow') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/waveglow.py \ - 
train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.waveglow.n_flows=4 \ - model.waveglow.n_wn_layers=2 \ - model.waveglow.n_wn_channels=32 \ - ~trainer.check_val_every_n_epoch - - L2_TTS_Fast_dev_runs_1_FastPitch: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_FastPitch') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/fastpitch.py \ - --config-name fastpitch_align_v1.05 \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/beta_priors \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.symbols_embedding_dim=64 \ - model.input_fft.d_inner=384 \ - model.input_fft.n_layer=2 \ - model.output_fft.d_inner=384 \ - model.output_fft.n_layer=2 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - # OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # timeout-minutes: 10 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/tts/radtts.py \ - # train_dataset=/home/TestData/an4_dataset/an4_train.json \ - # validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - # sup_data_path=/home/TestData/an4_dataset/radtts_beta_priors \ - # trainer.devices="[0]" \ - # +trainer.limit_train_batches=1 \ - # +trainer.limit_val_batches=1 \ - # trainer.max_epochs=1 \ - # trainer.strategy=auto \ - # model.pitch_mean=212.35873413085938 \ - # model.pitch_std=68.52806091308594 \ - # model.train_ds.dataloader_params.batch_size=4 \ - # model.train_ds.dataloader_params.num_workers=0 \ - # model.validation_ds.dataloader_params.batch_size=4 \ - # model.validation_ds.dataloader_params.num_workers=0 \ - # export_dir=/home/TestData/radtts_test \ - # model.optim.lr=0.0001 \ - # model.modelConfig.decoder_use_partial_padding=True \ - # ~trainer.check_val_every_n_epoch \ - # ~model.text_normalizer \ - # ~model.text_normalizer_call_kwargs - # #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # # if: "failure()" - - L2_TTS_Fast_dev_runs_1_Mixer-TTS: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Mixer-TTS') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/mixer_tts.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - sup_data_path=/home/TestData/an4_dataset/sup_data \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.pitch_mean=212.35873413085938 \ - model.pitch_std=68.52806091308594 \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - ~trainer.check_val_every_n_epoch \ - ~model.text_normalizer \ - ~model.text_normalizer_call_kwargs - - L2_TTS_Fast_dev_runs_1_Hifigan: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_TTS_Fast_dev_runs_1_Hifigan') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/tts/hifigan.py \ - train_dataset=/home/TestData/an4_dataset/an4_train.json \ - validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices="[0]" \ - +trainer.limit_train_batches=1 \ - +trainer.limit_val_batches=1 \ - +trainer.max_epochs=1 \ - trainer.strategy=auto \ - model.train_ds.dataloader_params.batch_size=4 \ - model.train_ds.dataloader_params.num_workers=0 \ - model.validation_ds.dataloader_params.batch_size=4 \ - model.validation_ds.dataloader_params.num_workers=0 \ - model.generator.upsample_initial_channel=64 \ - +model.debug=true \ - ~trainer.check_val_every_n_epoch - - # L2: NeRF - # L2_NeRF_DreamFusion: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # python examples/multimodal/text_to_image/nerf/main.py \ - # trainer.num_nodes=1 \ - # trainer.devices="[0]" \ - # trainer.max_steps=1000 \ - # model.prompt="a DSLR photo of a delicious hamburger" \ - # exp_manager.exp_dir=examples/multimodal/text_to_image/nerf/dreamfusion_results - # - # rm -rf examples/multimodal/text_to_image/nerf/dreamfusion_results - # - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" - # if: "failure()" - - Speech_Checkpoints_tests: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Speech_Checkpoints_tests') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - TIMEOUT: 20 - SCRIPT: | - CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ - pretrained_name=QuartzNet15x5Base-En \ - dataset_manifest=/home/TestData/librispeech/librivox-dev-other.json \ - batch_size=64 \ - tolerance=0.1012 - AFTER_SCRIPT: | - rm -f examples/asr/evaluation_transcripts.json - - L2_Stable_Diffusion_Training: - needs: [cicd-test-container-setup] - uses: 
./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Stable_Diffusion_Training') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - rm -rf examples/multimodal/text_to_image/sd_train_results - - python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.devices=1 \ - trainer.max_steps=3 \ - +trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.gradient_clip_val=0 \ - exp_manager.exp_dir=examples/multimodal/text_to_image/sd_train_results \ - exp_manager.create_checkpoint_callback=False \ - exp_manager.resume_if_exists=False \ - model.resume_from_checkpoint=null \ - model.precision=16 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.first_stage_key=moments \ - model.cond_stage_key=encoded \ - +model.load_vae=False \ - +model.load_unet=False \ - +model.load_encoder=False \ - model.parameterization=v \ - model.load_only_unet=False \ - model.text_embedding_dropout_rate=0.0 \ - model.inductor=True \ - model.inductor_cudagraphs=False \ - model.capture_cudagraph_iters=15 \ - +model.unet_config.num_head_channels=64 \ - +model.unet_config.use_linear_in_transformer=True \ - model.unet_config.context_dim=1024 \ - model.unet_config.use_flash_attention=null \ - model.unet_config.resblock_gn_groups=16 \ - model.unet_config.unet_precision=fp16 \ - +model.unet_config.timesteps=1000 \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - +model.optim.master_weights=True \ - model.optim.weight_decay=0.01 \ - model.first_stage_config.from_pretrained=null \ - model.data.num_workers=16 \ - model.data.synthetic_data=True - AFTER_SCRIPT: | - rm -rf examples/multimodal/text_to_image/sd_train_results - - L2_NeMo_2_GPT_Pretraining_no_transformer_engine: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_Pretraining_no_transformer_engine') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - pip uninstall -y apex ## TODO: remove when apex is no longer a dependency - pip uninstall -y transformer_engine - - python tests/collections/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=3 \ - --experiment-dir=tests/collections/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=tests/collections/llm/gpt_index_mappings \ - --no-masked-softmax-fusion - - python tests/collections/llm/megatron_gpt_pretraining.py \ - --devices=2 \ - --max-steps=6 \ - --experiment-dir=tests/collections/llm/gpt_pretrain_results \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document \ - --index-mapping-dir=tests/collections/llm/gpt_index_mappings \ - --no-masked-softmax-fusion - AFTER_SCRIPT: | - rm -rf tests/collections/llm/gpt_pretrain_results - rm -rf tests/collections/llm/gpt_index_mappings - - OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: 
contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/lightning/test_ddp_parity_checker.py \ - --vocab-path=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - --merges-path=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - --data-path=/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document - - AFTER_SCRIPT: | - rm -rf tests/collections/llm/gpt_pretrain_results - rm -rf tests/collections/llm/gpt_index_mappings - IS_OPTIONAL: true - - L2_NeMo_2_SSM_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \ - --devices 1 \ - --max-steps 10 \ - --experiment-dir /tmp/nlp_megatron_mamba_nemo-ux-mamba_cicd_test_pretrain/${{ github.run_id }} \ - --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document - - L2_NeMo_2_SSM_Finetuning: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \ - --devices 1 \ - --max-steps 10 \ - --experiment-dir /tmp/nlp_megatron_mamba_nemo-ux-mamba_cicd_test_sft/${{ github.run_id }} \ - --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt - - L2_NeMo_2_HF_MODEL_IMPORT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_HF_MODEL_IMPORT') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - - python tests/collections/llm/gpt/model/test_model_import.py - - AFTER_SCRIPT: | - rm -rf ~/.cache/nemo/models - - L2_NeMo_2_T5_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ - --devices=2 \ - --max-steps=3 \ - --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ - --index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }} - - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ - --devices=2 \ - --max-steps=6 \ - --experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ - --index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }} - AFTER_SCRIPT: | - rm -rf tests/collections/llm/t5_pretrain_results/${{ github.run_id }} - rm -rf 
tests/collections/llm/t5_index_mappings/${{ github.run_id }} - - L2_NeMo_2_Mixtral_Pretraining: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \ - --experiment-dir=/tmp/mixtral_pretrain_results \ - --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document - - Nemo_CICD_Test: - needs: - - pre-flight - - gpu-test - - cicd-test-container-setup - - #- OPTIONAL_L0_Unit_Tests_GPU_ASR - - L0_Unit_Tests_GPU_Audio - - L0_Unit_Tests_GPU_Common - - L0_Unit_Tests_GPU_LLM - - L0_Unit_Tests_GPU_Multimodal - - L0_Unit_Tests_GPU_NLP - - L0_Unit_Tests_GPU_TTS - #- OPTIONAL_L0_Unit_Tests_GPU_Core - - L0_Unit_Tests_GPU_Hydra - #- OPTIONAL_L0_Unit_Tests_GPU_Lightning - - L0_Unit_Tests_GPU_Others - - - L0_Unit_Tests_CPU_ASR - - L0_Unit_Tests_CPU_Audio - - L0_Unit_Tests_CPU_Common - - L0_Unit_Tests_CPU_LLM - - L0_Unit_Tests_CPU_Multimodal - - L0_Unit_Tests_CPU_NLP - - L0_Unit_Tests_CPU_TTS - - L0_Unit_Tests_CPU_Core - - L0_Unit_Tests_CPU_Hydra - - L0_Unit_Tests_CPU_Lightning - - L0_Unit_Tests_CPU_Others - - - L2_Community_LLM_Checkpoints_tests_Bert - - L2_Community_LLM_Checkpoints_tests_Mamba2 - - L2_Community_LLM_Checkpoints_tests_Llama - - L2_Community_LLM_Checkpoints_tests_StarCoder - - L2_Community_LLM_Checkpoints_tests_Falcon - - L2_Community_vita_Checkpoints_tests_Llama3 - #- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2 - - ASR_dev_run_Speech_to_Text - - ASR_dev_run_Speech_to_Text_WPE_-_CitriNet - - ASR_dev_run_Speech_Pre-training_-_CitriNet - - ASR_dev_run_Speech_To_Text_Finetuning - - ASR_dev_run_Speech_To_Text_HF_Finetuning - - ASR_dev_run_Speech_to_Text_WPE_-_Conformer - - ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer - - L2_Speech_to_Text_EMA - - L2_Speaker_dev_run_Speaker_Recognition - - L2_Speaker_dev_run_Speaker_Diarization - - L2_Speaker_dev_run_Speech_to_Label - - L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference - - L2_Speaker_dev_run_Clustering_Diarizer_Inference - - L2_Speaker_dev_run_Neural_Diarizer_Inference - - L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader - - L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader - - L2_ASR_Adapters_Linear_Adapters - - L2_ASR_Adapters_RelPos_MHA_Adapters - - L2_Speech_Transcription_Speech_to_Text_Transcribe - #- OPTIONAL_L2_Transducer_alignment_Running_pytest - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav - - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test - - L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test - - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - - 
L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Pretraining_BERT_pretraining_from_Text - - L2_Pretraining_BERT_from_Preprocessed - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation - - L2_NMT_Attention_is_All_You_Need_Inference - - L2_NMT_Attention_is_All_You_Need_Finetuning - - L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation - - L2_NMT_Tarred_Dataset_Creation_Script_Tarred_Dataset_Creation - - L2_Megatron_NMT_Training_TP2 - - L2_Megatron_BART_Perceiver_MIM_Training_TP2 - - L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism - - L2_Megatron_Bert_Pretraining_and_Resume_Training - - L2_Megatron_Core_Bert_Pretraining_and_Resume_Training - - L2_Legacy_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_Megatron_RETRO_Pretraining_and_Resume_Training - - L2_RAG_Pipeline_Indexing - - L2_RAG_Pipeline_Generating - - L2_BioMegatron_Bert_NER_Task - - L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Skip_Train - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_Drop_Optimizer_States_TP2 - - L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 - #- OPTIONAL_L2_Megatron_GPT_Auto_Configurator_TP1_PP1_MBS124 - - L2_Megatron_GPT_Finetuning_PP2 - - L2_Megatron_GPT_Finetuning_StarCoder_PP1 - - L2_Megatron_GPT_Embedding - - L2_Megatron_GPT_PEFT_Lora_PP2_O2 - - L2_Megatron_GPT_PEFT_Lora_TP2_O1 - - L2_Megatron_GPT_PEFT_Lora_TP2SP1 - - L2_Megatron_GPT_Eval - - L2_Megatron_GPT_Eval_PP2 - - L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len - - L2_Megatron_Change_Partitions_Reduce_TP_Num_Partitions_-2_to_1-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_Change_Partitions_Increase_TP_Num_Partitions_-2_to_4-_and_PP_Num_Partitions_-1_to_2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_w_Mixture_of_Expert_Pretraining - - L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_T5_Eval - - L2_Megatron_Core_T5_Eval - - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_PEFT_Lora_TP2 - - L2_Megatron_Core_T5_PEFT_Lora_TP2 - - L2_Megatron_Mock_Data_Generation_MockGPTDataset - - L2_Megatron_Mock_Data_Generation_MockT5Dataset - - L2_TTS_Fast_dev_runs_1_Tacotron_2 - - L2_TTS_Fast_dev_runs_1_WaveGlow - - L2_TTS_Fast_dev_runs_1_FastPitch - #- OPTIONAL_L2_TTS_Fast_dev_runs_1_RADTTS - - L2_TTS_Fast_dev_runs_1_Mixer-TTS - - L2_TTS_Fast_dev_runs_1_Hifigan - - Speech_Checkpoints_tests - - L2_Stable_Diffusion_Training - - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - #- OPTIONAL_L2_NeMo_2_GPT_DDP_Param_Parity_check - - L2_NeMo_2_HF_MODEL_IMPORT - - L2_NeMo_2_SSM_Pretraining - - L2_NeMo_2_SSM_Finetuning - - L2_NeMo_2_T5_Pretraining - - L2_NeMo_2_Mixtral_Pretraining - - L2_PTQ_Llama2_INT8_SQ - - L2_PTQ_Llama2_FP8 - - 
L2_Community_LLM_Checkpoints_tests_Llama3 - - L2_PTQ_Llama2_Export_Only - - L2_Distill_Llama2 - - L2_Prune_Width_Llama2 - - L2_Speech_to_Text_AED - - L2_Speech_Estimate_Duration_Bins - - L2_Speech_Batch_Size_OOMptimizer - - L2_Speech_Batch_Size_OOMptimizer_Canary - - L2_Speech_Transcription_Canary_Transcribe_Full_Manifest - - L2_Speech_Transcription_Canary_Transcribe_With_Prompt - - L2_Speech_Transcription_Canary_Transcribe_Audio_Dir - - L2_Megatron_GPT_Reranker - if: always() - runs-on: ubuntu-latest - steps: - - name: Evaluate conclusion - if: ${{ always() }} - id: pipeline-conclusion - run: | - # Slack notifications are sent only on test failure (not cancelled): - FAILED=${{ contains(needs.*.outputs.conclusion, 'failure') }} - echo "FAILED=$FAILED" >> $GITHUB_OUTPUT - - # Mark as successful only if no job failed, was cancelled, or was skipped: - SUCCESS=${{ !contains(needs.*.outputs.conclusion, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }} - echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT - - # This should depend on all the tests so we block/unblock based on all tests passing - - name: Pipeline successful, set exit code to 0 - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }} - run: exit 0 - - - name: Pipeline successful, add PR comment - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }} - uses: peter-evans/create-or-update-comment@v4 - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - with: - issue-number: ${{ github.event.number }} - body: | - [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋, - - We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully - - So it might be time to merge this PR or get some approvals - - I'm just a bot so I'll leave it to you what to do next. - - //cc @pablo-garay @ko3n1g - - - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary" - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }} - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - PR_NUMBER: ${{ github.event.number }} - SERVER_URL: ${{ github.server_url }} - run: | - set -x - - PR_INFO=$(curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/repos/$REPOSITORY/pulls/$PR_NUMBER - ) - PR_URL=$(echo -E $PR_INFO | jq '.html_url' | tr -d '"') - PR_TITLE=$(echo -E $PR_INFO | jq '.title' | tr -d '"') - - PIPELINE_URL=$SERVER_URL/$REPOSITORY/actions/runs/$RUN_ID - BASE_MESSAGE=' - { - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "🚨 *CI/CD failure at <'$PIPELINE_URL'|NeMo CI>*."
- } - } - ] - } - ' - - # Since this workflow contains more than 100 jobs, we need to iterate over job pages - JOBS='[]' - PAGE=1 - while : ; do - JOBS_URL="https://api.github.com/repos/$REPOSITORY/actions/runs/$RUN_ID/jobs?page=$PAGE&per_page=100" - RESPONSE=$(curl -s -H "Authorization: token $GITHUB_TOKEN" $JOBS_URL | jq '.jobs') - JOBS=$(echo -e "$JOBS\n$RESPONSE" | jq -cs 'add') - if [[ $(echo $RESPONSE | jq 'length') -lt 100 ]]; then - break - else - PAGE=$(( PAGE + 1)) - fi - done - - SUMMARY="[]" - echo "Failed jobs: " | tee -a $GITHUB_STEP_SUMMARY - while IFS= read -r JOB; do - JOB_NAME="$(echo $JOB | jq '.key' | tr -d '"') / main" - JOB_ID=$(echo $JOBS | jq --arg job_name "$JOB_NAME" '.[] | select(.name == $job_name) | .id') - JOB_URL="https://github.com/$REPOSITORY/actions/runs/$RUN_ID/job/$JOB_ID" - - echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY - - LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"') - - SUMMARY=$(echo "$SUMMARY" | jq \ - --arg pr "<$PR_URL|$PR_TITLE>" \ - --arg job "<$JOB_URL|$JOB_NAME>" \ - --arg logs "$LOGS" \ - --arg author "" \ - --arg branch ""\ - '. += [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": ( - "PR: " + $pr - + "\nJob: " + $job - + "\nAuthor: " + $author - + "\nBranch: " + $branch - + "\nLogs:" - + "```\n" + $logs + "\n```" - ) - } - } - ]') - done <<<$(echo '${{ toJSON(needs) }}' | jq -c 'to_entries | .[] | select(.value.outputs.conclusion == "failure")') - - MESSAGE=$(echo $BASE_MESSAGE | jq -c --argjson summary "$SUMMARY" '.blocks += $summary') - - curl -X POST -H "Content-type: application/json" --data "$MESSAGE" $SLACK_WEBHOOK - - - name: "Pipeline not successful, set exit code to 1" - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} - run: exit 1
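For reference, the Slack-alert step removed above can be exercised outside of GitHub Actions with a small standalone script. The sketch below is an illustration under stated assumptions, not part of the deleted workflow: it assumes `GITHUB_TOKEN`, `REPOSITORY`, `RUN_ID`, and `SLACK_WEBHOOK` are exported, and it simplifies by reading job conclusions straight from the jobs API rather than from the workflow's `needs` context. Like the original step, it pages through the jobs endpoint 100 at a time (the API caps `per_page` at 100 and this workflow fans out into well over 100 jobs), collects the failed jobs, and posts a Slack Block Kit message.

```bash
#!/usr/bin/env bash
# Hypothetical sketch, not the deleted workflow step itself.
# Assumes GITHUB_TOKEN, REPOSITORY (owner/repo), RUN_ID, and SLACK_WEBHOOK are exported.
set -euo pipefail

# Page through the run's jobs; the endpoint returns at most 100 jobs per page.
JOBS='[]'
PAGE=1
while : ; do
  RESPONSE=$(curl -s \
    -H "Authorization: token $GITHUB_TOKEN" \
    "https://api.github.com/repos/$REPOSITORY/actions/runs/$RUN_ID/jobs?page=$PAGE&per_page=100" \
    | jq '.jobs')
  JOBS=$(echo -e "$JOBS\n$RESPONSE" | jq -cs 'add')
  # A page with fewer than 100 jobs is the last page.
  if [[ $(echo "$RESPONSE" | jq 'length') -lt 100 ]]; then
    break
  fi
  PAGE=$((PAGE + 1))
done

# One Slack "section" block per failed job, linking to the job's log page.
BLOCKS=$(echo "$JOBS" | jq -c '[ .[]
  | select(.conclusion == "failure")
  | { type: "section",
      text: { type: "mrkdwn",
              text: ("Job: <" + .html_url + "|" + .name + ">") } } ]')

# Wrap the blocks in a payload and post it to the incoming webhook.
MESSAGE=$(jq -nc --argjson blocks "$BLOCKS" '{blocks: $blocks}')
curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK"
```

The deleted step additionally enriched each block with the PR link, base64-decoded log tails from the `needs` outputs, and a step-summary entry; the sketch keeps only the pagination and failure-selection logic that the step's comments describe.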