Skip to content

Commit

Permalink
WIP Split workflow into multiple dispatch groups to avoid skipped jobs.
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper committed Jun 1, 2024
1 parent 8b5bf66 commit d415ef8
Show file tree
Hide file tree
Showing 13 changed files with 531 additions and 86 deletions.
25 changes: 15 additions & 10 deletions .github/actions/workflow-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,8 @@ inputs:

outputs:
workflow:
description: "The dispatchable workflow matrix"
value: ${{ steps.build-workflow.outputs.workflow }}
workflow_keys:
description: "The keys of the parsed workflow"
value: ${{ steps.build-workflow.outputs.workflow_keys }}
description: "The dispatchable workflows"
value: ${{ steps.build-workflow.outputs.DISPATCH_WORKFLOWS }}

runs:
using: "composite"
Expand Down Expand Up @@ -85,13 +82,21 @@ runs:
cat workflow/job_list.txt
echo "::endgroup::"
echo "Setting outputs..."
echo "::group::GHA Output: WORKFLOW"
printf "WORKFLOW=%s\n" "$(cat workflow/workflow.json | jq -c '.')" | tee -a "${GITHUB_OUTPUT}"
- name: Create dispatch workflows
shell: bash --noprofile --norc -euo pipefail {0}
run: |
"${GITHUB_ACTION_PATH}/prepare-workflow-dispatch.py" workflow/workflow.json
echo "::group::Dispatch Workflows"
cat dispatch/dispatch.json
echo "::endgroup::"
echo "::group::GHA Output: WORKFLOW_KEYS"
printf "WORKFLOW_KEYS=%s\n" "$(cat workflow/workflow_keys.json | jq -c '.')" | tee -a "${GITHUB_OUTPUT}"
- name: Set outputs
id: outputs
shell: bash --noprofile --norc -euo pipefail {0}
run: |
echo "::group::GHA Output: DISPATCH_WORKFLOWS"
printf "DISPATCH_WORKFLOWS=%s\n" "$(cat dispatch/dispatch.json | jq -c '.')" | tee -a "${GITHUB_OUTPUT}"
echo "::endgroup::"
- name: Upload artifacts
Expand Down
95 changes: 95 additions & 0 deletions .github/actions/workflow-build/prepare-workflow-dispatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

"""
This script prepares a full workflow for GHA dispatch.
To avoid skipped jobs from cluttering the GHA UI, this script splits the full workflow.json into multiple workflows
that don't require large numbers of skipped jobs in the workflow implementation.
"""

import argparse
import json
import os
import sys


def write_json_file(filename, json_object):
    """
    Write `json_object` to `filename` as JSON with a 2-space indent.

    The file is opened in text write mode, so any existing content is replaced.
    """
    with open(filename, mode='w') as out_stream:
        json.dump(json_object, out_stream, indent=2)


def is_windows(job):
    """Return True when the job's 'runner' value begins with 'windows'."""
    runner_label = job['runner']
    return runner_label.startswith('windows')


def split_workflow(workflow):
    """
    Split the full workflow into per-platform dispatch groups and write them to disk.

    `workflow` maps a group name to a dict that may contain:
      - 'standalone': a list of job dicts.
      - 'two_stage': a list of dicts, each with 'producers' and 'consumers' job lists.

    Each group's standalone / two_stage list is filed under either the linux or the
    windows bucket. The platform is decided from the first job only: `standalone[0]`
    for standalone lists and `two_stage[0]['producers'][0]` for two-stage lists.

    The 'origin' entry is removed from every job in place (the input is mutated).

    The result is written to 'dispatch/dispatch.json' relative to the current working
    directory; the directory is created if needed. Each of the four top-level entries
    has the form {'keys': [<group names>], 'jobs': {<group name>: <job list>}}.
    """
    # Insertion order here fixes the key order of the emitted JSON.
    buckets = {
        'linux_standalone': {},
        'linux_two_stage': {},
        'windows_standalone': {},
        'windows_two_stage': {},
    }

    def strip_extra_info(job):
        # pop() with a default tolerates jobs that carry no 'origin' entry;
        # a bare `del` would raise KeyError for them.
        job.pop('origin', None)

    for group_name, group_json in workflow.items():
        standalone = group_json.get('standalone', [])
        two_stage = group_json.get('two_stage', [])

        if standalone:
            for job in standalone:
                strip_extra_info(job)

            platform = 'windows' if is_windows(standalone[0]) else 'linux'
            buckets[f'{platform}_standalone'][group_name] = standalone

        if two_stage:
            for ts in two_stage:
                for job in ts['producers']:
                    strip_extra_info(job)
                for job in ts['consumers']:
                    strip_extra_info(job)

            platform = 'windows' if is_windows(two_stage[0]['producers'][0]) else 'linux'
            buckets[f'{platform}_two_stage'][group_name] = two_stage

    dispatch = {
        bucket_name: {'keys': list(jobs.keys()), 'jobs': jobs}
        for bucket_name, jobs in buckets.items()
    }

    os.makedirs('dispatch', exist_ok=True)
    write_json_file('dispatch/dispatch.json', dispatch)


def main():
    """
    Command-line entry point.

    Takes one positional argument, the path to the full workflow.json file, loads it,
    and passes the parsed object to split_workflow(). Exits with status 1 and a message
    on stderr if the path is not an existing file.
    """
    parser = argparse.ArgumentParser(description='Prepare a full workflow for GHA dispatch.')
    parser.add_argument('workflow_json', help='Path to the full workflow.json file')
    args = parser.parse_args()

    # Check if the workflow file exists
    if not os.path.isfile(args.workflow_json):
        # Diagnostics go to stderr so they are not mixed into captured stdout.
        print(f"Error: Workflow file '{args.workflow_json}' not found.", file=sys.stderr)
        sys.exit(1)

    with open(args.workflow_json) as f:
        workflow = json.load(f)

    split_workflow(workflow)


if __name__ == '__main__':
    main()
104 changes: 104 additions & 0 deletions .github/actions/workflow-run-job-linux/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Composite action that checks out the repository, runs one command as the
# 'coder' user, and uploads a per-job result directory as an artifact.
name: "Run Linux Job"
description: "Run a job on a Linux runner."

inputs:
id:
description: "A unique identifier."
required: true
command:
description: "The command to run."
required: true
image:
description: "The Docker image to use."
required: true

runs:
using: "composite"
steps:
# Check out into a subdirectory named after the repository; credentials are not persisted.
- name: Checkout repo
uses: actions/checkout@v3
with:
path: ${{github.event.repository.name}}
persist-credentials: false
# Symlink the checkout into /home/coder and hand ownership to the 'coder' user.
- name: Link files to coder user home directory
shell: bash --noprofile --norc -euo pipefail {0}
run: |
ln -s "$(pwd)/${{github.event.repository.name}}" /home/coder/${{github.event.repository.name}}
chown -R coder:coder ${{github.event.repository.name}}
chown -R coder:coder /home/coder/${{github.event.repository.name}}
# Register the problem matcher file shipped in the checked-out repository.
- name: Add NVCC problem matcher
shell: bash --noprofile --norc -euo pipefail {0}
run: |
echo "::add-matcher::${{github.event.repository.name}}/.github/problem-matchers/problem-matcher.json"
# Assume the OIDC role used for the sccache bucket.
- name: Get AWS credentials for sccache bucket
uses: aws-actions/configure-aws-credentials@v2
with:
role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
aws-region: us-east-2
role-duration-seconds: 43200 # 12 hours
# Append the sccache settings to GITHUB_ENV.
- name: Set environment variables
shell: bash --noprofile --norc -euo pipefail {0}
run: |
echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
# Run the command via `su coder`. On failure, print instructions for reproducing
# locally and exit with the command's exit code.
- name: Run command # Do not change this step's name, it is checked in parse-job-times.py
shell: su coder {0}
env:
# Dereferencing the command from an env var instead of a GHA input avoids issues with escaping
# semicolons and other special characters (e.g. `-arch "60;70;80"`).
COMMAND: ${{inputs.command}}
run: |
set -eo pipefail
cd ~/${{github.event.repository.name}}
echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m"
echo -e "\e[1;34m${COMMAND}\e[0m"
eval "${COMMAND}" || exit_code=$?
if [ ! -z "$exit_code" ]; then
echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
echo "::error:: To replicate this failure locally, follow the steps below:"
echo "1. Clone the repository, and navigate to the correct branch and commit:"
echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
echo ""
echo "2. Run the failed command inside the same Docker container used by the CI:"
echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${COMMAND}"
echo ""
echo "For additional information, see:"
echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
exit $exit_code
fi
# Create jobs/<id>/success and copy sccache_stats.json into it when one is found
# in the checkout (a missing stats file is ignored).
- name: Prepare job artifacts
id: done
shell: bash --noprofile --norc -euo pipefail {0}
run: |
echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}"
result_dir="jobs/${{inputs.id}}"
mkdir -p "$result_dir"
touch "$result_dir/success"
# Finds a matching file in the repo directory and copies it to the results directory.
find_and_copy() {
filename="$1"
filepath="$(find ${{github.event.repository.name}} -name "${filename}" -print -quit)"
if [[ -z "$filepath" ]]; then
echo "${filename} does not exist in repo directory."
return 1
fi
cp -v "$filepath" "$result_dir"
}
find_and_copy "sccache_stats.json" || true # Ignore failures
echo "::group::Job artifacts"
tree "$result_dir"
echo "::endgroup::"
# Upload the whole 'jobs' directory under the artifact name 'jobs'.
- name: Upload job artifacts
uses: actions/upload-artifact@v3
with:
name: jobs
path: jobs
84 changes: 84 additions & 0 deletions .github/actions/workflow-run-job-windows/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Composite action that checks out the repository, runs one command inside a
# Docker container on a Windows runner, and uploads a per-job result directory
# as an artifact.
name: "Run Windows Job"
description: "Run a job on a Windows runner."

inputs:
  image:
    description: "The Docker image to use."
    required: true
  command:
    description: "The command to run."
    required: true
  id:
    description: "A unique identifier."
    required: true

runs:
  using: "composite"
  steps:
    # Assume the OIDC role used for the sccache bucket.
    - name: Get AWS credentials for sccache bucket
      uses: aws-actions/configure-aws-credentials@v2
      with:
        role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
        aws-region: us-east-2
        role-duration-seconds: 43200 # 12 hours
    # Check out into a subdirectory named after the repository; credentials are not persisted.
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        path: ${{github.event.repository.name}}
        persist-credentials: false
    - name: Fetch ${{ inputs.image }}
      shell: bash --noprofile --norc -euo pipefail {0}
      run: docker pull ${{ inputs.image }}
    # Publish the host checkout path (with forward slashes) and the in-container
    # mount path as step outputs.
    - name: Prepare paths for docker
      id: paths
      # The script below is PowerShell (Out-File, $env:GITHUB_OUTPUT), so this step
      # declares exactly one shell: powershell.
      shell: powershell
      run: |
        echo "HOST_REPO=${{ github.workspace }}\${{ github.event.repository.name }}".Replace('\', '/') | Out-File -FilePath $env:GITHUB_OUTPUT -Append
        echo "MOUNT_REPO=C:/${{ github.event.repository.name }}" | Out-File -FilePath $env:GITHUB_OUTPUT -Append
        cat $env:GITHUB_OUTPUT
    # Bind-mount the checkout into the container and run the command through PowerShell
    # after forwarding the AWS / sccache settings into the container's environment.
    # NOTE(review): the SCCACHE_* values are read from `env` but are not set anywhere in
    # this action; presumably the calling workflow provides them. Confirm.
    - name: Run command # Do not change this step's name, it is checked in parse-job-times.py
      shell: bash --noprofile --norc -euo pipefail {0}
      run: |
        docker run \
          --mount type=bind,source="${{steps.paths.outputs.HOST_REPO}}",target="${{steps.paths.outputs.MOUNT_REPO}}" \
          --workdir "${{steps.paths.outputs.MOUNT_REPO}}" \
          ${{ inputs.image }} \
          powershell -c "
            [System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}');
            [System.Environment]::SetEnvironmentVariable('AWS_SECRET_ACCESS_KEY','${{env.AWS_SECRET_ACCESS_KEY}}');
            [System.Environment]::SetEnvironmentVariable('AWS_SESSION_TOKEN','${{env.AWS_SESSION_TOKEN }}');
            [System.Environment]::SetEnvironmentVariable('SCCACHE_BUCKET','${{env.SCCACHE_BUCKET}}');
            [System.Environment]::SetEnvironmentVariable('SCCACHE_REGION','${{env.SCCACHE_REGION}}');
            [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}');
            [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}');
            [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}');
            git config --global --add safe.directory '${{steps.paths.outputs.MOUNT_REPO}}';
            ${{inputs.command}}"
    # Create jobs/<id>/success and copy sccache_stats.json into it when one is found
    # in the checkout (a missing stats file is ignored).
    - name: Prepare job artifacts
      id: done
      shell: bash --noprofile --norc -euo pipefail {0}
      run: |
        echo "SUCCESS=true" | tee -a "${GITHUB_OUTPUT}"
        result_dir="jobs/${{inputs.id}}"
        mkdir -p "$result_dir"
        touch "$result_dir/success"
        # Finds a matching file in the repo directory and copies it to the results directory.
        find_and_copy() {
          filename="$1"
          filepath="$(find ${{github.event.repository.name}} -name "${filename}" -print -quit)"
          if [[ -z "$filepath" ]]; then
            echo "${filename} does not exist in repo directory."
            return 1
          fi
          cp -v "$filepath" "$result_dir"
        }
        find_and_copy "sccache_stats.json" || true # Ignore failures
        echo "::group::Job artifacts"
        find "$result_dir" # Tree not available in this image.
        echo "::endgroup::"
    # Upload the whole 'jobs' directory under the artifact name 'jobs'.
    - name: Upload job artifacts
      uses: actions/upload-artifact@v3
      with:
        name: jobs
        path: jobs
26 changes: 9 additions & 17 deletions .github/workflows/ci-workflow-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ jobs:
pull-requests: read
outputs:
workflow: ${{ steps.build-workflow.outputs.workflow }}
workflow_keys: ${{ steps.build-workflow.outputs.workflow_keys }}
steps:
- name: Checkout repo
uses: actions/checkout@v3
Expand All @@ -62,22 +61,15 @@ jobs:
${{ env.nightly_workflow }}
run-workflow:
name: ${{ matrix.name }}
name: Run workflow
needs: build-workflow
permissions:
id-token: write
contents: read
strategy:
fail-fast: false
matrix:
name: ${{ fromJSON(needs.build-workflow.outputs.workflow_keys) }}
uses: ./.github/workflows/workflow-dispatch.yml
with:
name: ${{ matrix.name }}
jobs: ${{ toJSON(fromJSON(needs.build-workflow.outputs.workflow)[matrix.name]) }}
workflow: ${{ needs.build-workflow.outputs.workflow }}

# This job acts as a sentry and will fail if any leaf job in the workflow tree fails, as
# run-workflow always succeeds. Use this job when checking for successful matrix workflow job completion.
verify-workflow:
name: Verify and summarize workflow results
if: ${{ always() && !cancelled() }}
Expand All @@ -103,12 +95,12 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
pr_number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}

verify-devcontainers:
name: Verify Dev Containers
permissions:
id-token: write
contents: read
uses: ./.github/workflows/verify-devcontainers.yml
# verify-devcontainers:
# name: Verify Dev Containers
# permissions:
# id-token: write
# contents: read
# uses: ./.github/workflows/verify-devcontainers.yml

# Check all other job statuses. This job gates branch protection checks.
ci:
Expand All @@ -120,7 +112,7 @@ jobs:
if: ${{ always() }}
needs:
- verify-workflow
- verify-devcontainers
# - verify-devcontainers
runs-on: ubuntu-latest
steps:
- name: Check results
Expand Down
Loading

0 comments on commit d415ef8

Please sign in to comment.