Increase JSON to Parquet timeout #456

name: upload-and-deploy

on:
  push:
    branches: "*"
    tags-ignore: "*"

env:
  NAMESPACE: main
  PYTHON_VERSION: 3.9
  DEV_INPUT_BUCKET: recover-dev-input-data
  DEV_PROCESSED_BUCKET: recover-dev-processed-data
  PROD_INPUT_BUCKET: recover-input-data
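# Job flow: pre-commit gates everything; upload-files and the two unit test
# jobs run in parallel, sceptre-deploy-develop and integration-test-develop
# follow, and the staging deploy plus staging integration test run only on main.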
jobs:
  pre-commit:
    name: Run pre-commit hooks against all files
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
      - uses: pre-commit/action@v3.0.0
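  # Upload repository artifacts to the CloudFormation templates bucket under
  # this branch's namespace (see src/scripts/manage_artifacts/artifacts.py).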
  upload-files:
    name: Upload files to S3 bucket in development
    runs-on: ubuntu-latest
    needs: pre-commit
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    environment: develop
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Setup sam
        uses: aws-actions/setup-sam@v2
      - name: Set namespace for non-default branch
        if: github.ref_name != 'main'
        run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
      - name: Copy files to templates bucket, use dev cloudformation bucket
        run: >
          python src/scripts/manage_artifacts/artifacts.py
          --upload
          --namespace $NAMESPACE
          --cfn_bucket ${{ vars.CFN_BUCKET }}
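  # Unit tests that need only pipenv-managed dependencies (no aws-glue image):
  # lambda tests plus STS access checks against the dev Synapse folders.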
  nonglue-unit-tests:
    name: Runs unit tests that are not dependent on aws-glue package resources
    runs-on: ubuntu-latest
    needs: pre-commit
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    environment: develop
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: ${{ github.event.repository.name }}-${{ github.run_id }}-nonglue-unit-tests
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Install additional python dependency
        run: |
          pipenv install ecs_logging~=2.0
      - name: Test lambda scripts with pytest
        run: |
          pipenv run python -m pytest tests/test_s3_event_config_lambda.py -v
          pipenv run python -m pytest tests/test_s3_to_glue_lambda.py -v
      - name: Test dev synapse folders for STS access with pytest
        # The blank line below keeps the two pytest invocations as separate
        # shell commands within this folded block.
        run: >
          pipenv run python -m pytest tests/test_setup_external_storage.py
          --test-bucket $DEV_INPUT_BUCKET
          --test-synapse-folder-id syn51758510
          --namespace $NAMESPACE
          --test-sts-permission read_only
          -v

          pipenv run python -m pytest tests/test_setup_external_storage.py
          --test-bucket $DEV_PROCESSED_BUCKET
          --test-synapse-folder-id syn51084525
          --namespace $NAMESPACE/parquet
          --test-sts-permission read_write
          -v
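  # Build one pytest image per Glue version and expose the ECR registry and
  # registry-specific credentials as job outputs for glue-unit-tests to use.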
  pytest-docker:
    name: Build and push testing docker images to the pytest ECR repository
    needs: pre-commit
    runs-on: ubuntu-latest
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix:
        include:
          - tag_name: aws_glue_3
            dockerfile: tests/Dockerfile.aws_glue_3
          - tag_name: aws_glue_4
            dockerfile: tests/Dockerfile.aws_glue_4
    environment: develop
    steps:
      - name: Assume AWS role
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          aws-region: "us-east-1"
          # Leaving the AWS account ID unmasked allows it to pass through the job outputs.
          mask-aws-account-id: "no"
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1
      - name: Get ECR secret names
        id: ecr
        run: |
          usernameKey=docker_username_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _)
          echo "username-key=$usernameKey" >> $GITHUB_OUTPUT
          passwordKey=docker_password_$(echo ${{ steps.login-ecr.outputs.registry }} | tr '.-' _)
          echo "password-key=$passwordKey" >> $GITHUB_OUTPUT
      - uses: actions/checkout@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Build and push to ECR
        id: docker-build-push
        uses: docker/build-push-action@v4
        with:
          push: true
          tags: ${{ steps.login-ecr.outputs.registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }}
          file: ${{ matrix.dockerfile }}
          cache-from: type=local,src=/tmp/.buildx-cache
          cache-to: type=local,dest=/tmp/.buildx-cache
    outputs:
      ecr-registry: ${{ steps.login-ecr.outputs.registry }}
      ecr-username: ${{ steps.login-ecr.outputs[steps.ecr.outputs.username-key] }}
      ecr-password: ${{ steps.login-ecr.outputs[steps.ecr.outputs.password-key] }}
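  # Run the Glue-dependent unit tests inside the images built by pytest-docker,
  # once per Glue version in the matrix.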
  glue-unit-tests:
    name: Run Pytest unit tests for AWS Glue
    needs: pytest-docker
    environment: develop
    runs-on: ubuntu-latest
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    strategy:
      matrix:
        tag_name: ["aws_glue_3", "aws_glue_4"]
    container:
      image: ${{ needs.pytest-docker.outputs.ecr-registry }}/pytest:${{ github.ref_name }}_${{ matrix.tag_name }}
      credentials:
        username: ${{ needs.pytest-docker.outputs.ecr-username }}
        password: ${{ needs.pytest-docker.outputs.ecr-password }}
      env:
        DISABLE_SSL: true
      options: "--user root"
    steps:
      - name: Assume AWS role
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          aws-region: "us-east-1"
      - uses: actions/checkout@v3
      - run: chown -R glue_user $GITHUB_WORKSPACE
      - run: su - glue_user --command "aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID"
      - run: su - glue_user --command "aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY"
      - run: su - glue_user --command "aws configure set aws_session_token $AWS_SESSION_TOKEN"
      - run: su - glue_user --command "aws configure set region $AWS_REGION"
      - name: Set namespace for non-default branch or for tag
        if: github.ref_name != 'main'
        run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
      - name: Run Pytest unit tests under AWS Glue 3.0
        if: matrix.tag_name == 'aws_glue_3'
        run: |
          su - glue_user --command "cd $GITHUB_WORKSPACE && python3 -m pytest tests/test_s3_to_json.py -v"
          su - glue_user --command "cd $GITHUB_WORKSPACE && python3 -m pytest tests/test_compare_parquet_datasets.py -v"
      - name: Run Pytest unit tests under AWS Glue 4.0
        if: matrix.tag_name == 'aws_glue_4'
        run: >
          su - glue_user --command "cd $GITHUB_WORKSPACE &&
          python3 -m pytest tests/test_json_to_parquet.py --namespace $NAMESPACE -v"
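  # Deploy this branch's stacks to develop, then invoke the S3EventConfig
  # lambda to configure the S3 trigger for the S3-to-Glue lambda.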
  sceptre-deploy-develop:
    name: Deploys branch using sceptre
    runs-on: ubuntu-latest
    needs: [upload-files, nonglue-unit-tests, glue-unit-tests]
    environment: develop
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Create directory for remote sceptre templates
        run: mkdir -p templates/remote/
      - name: Set namespace for non-default branch
        if: github.ref_name != 'main'
        run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
      - name: Deploy sceptre stacks to dev
        run: pipenv run sceptre --var "namespace=${{ env.NAMESPACE }}" launch develop --yes
      - name: Configure S3 to Glue lambda with S3 trigger
        uses: gagoar/invoke-aws-lambda@v3
        with:
          AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
          AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
          REGION: ${{ env.AWS_REGION }}
          FunctionName: ${{ env.NAMESPACE }}-S3EventConfig
          Payload: '{"RequestType": "Create"}'
          LogType: Tail
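  # Exercise the develop deployment by copying pilot data from the dev
  # ingestion bucket into the dev input bucket under this branch's namespace.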
  integration-test-develop:
    name: Triggers ETL workflow with S3 test files
    runs-on: ubuntu-latest
    needs: sceptre-deploy-develop
    environment: develop
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: integration-test-${{ github.run_id }}
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Set namespace for non-default branch or for tag
        if: github.ref_name != 'main'
        run: echo "NAMESPACE=$GITHUB_REF_NAME" >> $GITHUB_ENV
      - name: Copy test files over from the ingestion bucket
        run: >
          aws s3 cp s3://recover-dev-ingestion/pilot-data/ s3://$DEV_INPUT_BUCKET/$NAMESPACE/
          --recursive
          --exclude "owner.txt"
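  # On main only: stage artifacts under the "staging" namespace and deploy the
  # staging stacks to prod.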
  sceptre-deploy-staging:
    name: Deploys to staging of prod using sceptre
    runs-on: ubuntu-latest
    needs: integration-test-develop
    if: github.ref_name == 'main'
    environment: prod
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: GitHubActions-${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Copy files to templates bucket
        run: >
          python src/scripts/manage_artifacts/artifacts.py
          --upload
          --namespace staging
          --cfn_bucket ${{ vars.CFN_BUCKET }}
      - name: Create directory for remote sceptre templates
        run: mkdir -p templates/remote/
      - name: Deploy sceptre stacks to staging on prod
        run: pipenv run sceptre --var "namespace=staging" launch prod --yes
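  # Smoke-test staging: generate an S3 event from production input data, build
  # the s3_to_glue lambda with sam, and invoke it locally against the staging
  # S3ToJson workflow.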
  integration-test-staging:
    name: Triggers staging workflow with production data
    runs-on: ubuntu-latest
    needs: sceptre-deploy-staging
    environment: prod
    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read
    steps:
      - name: Setup code, pipenv, aws
        uses: Sage-Bionetworks/action-pipenv-aws-setup@v3
        with:
          role_to_assume: ${{ vars.AWS_CREDENTIALS_IAM_ROLE }}
          role_session_name: integration-test-${{ github.run_id }}
          python_version: ${{ env.PYTHON_VERSION }}
      - name: Generate test events
        run: >
          pipenv run python src/lambda_function/s3_to_glue/events/generate_test_event.py
          --input-bucket $PROD_INPUT_BUCKET
          --input-key-prefix $NAMESPACE
          --output-directory ./src/lambda_function/s3_to_glue/events/
      - name: Setup sam
        uses: aws-actions/setup-sam@v2
      - name: sam build lambda
        run: >
          sam build
          -t src/lambda_function/s3_to_glue/template.yaml
      - name: Invoke Lambda
        run: |
          cd src/lambda_function/s3_to_glue/
          sam local invoke -e events/records.json --parameter-overrides "S3ToJsonWorkflowName=staging-S3ToJsonWorkflow"