Skip to content

Commit

Permalink
Merge pull request #3216 from alphagov/184950003-block-cf-deployment-…
Browse files Browse the repository at this point in the history
…if-failing-healthcheck-node-new

#184950003 Block cf-deploy if az healthcheck node fails to start
  • Loading branch information
AP-Hunt committed Jun 9, 2023
2 parents 607f7d6 + 1830004 commit 1256cb1
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 31 deletions.
68 changes: 67 additions & 1 deletion concourse/pipelines/create-cloudfoundry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1608,12 +1608,18 @@ jobs:
TF_VAR_region: ((aws_region))
AWS_DEFAULT_REGION: ((aws_region))
ENABLE_AZ_HEALTHCHECK: ((enable_az_healthcheck))
DISABLED_AZS: ((disabled_azs))
run:
path: sh
args:
- -e
- -c
- |
if [ -n "${DISABLED_AZS}" ]; then
WAIT_FOR_HEALTHCHECK=0
else
WAIT_FOR_HEALTHCHECK=1
fi
cp az-healthcheck-tfstate/azhc.tfstate updated-tfstate/azhc.tfstate
sh paas-cf/terraform/./update-terraform-providers.sh updated-tfstate/azhc.tfstate
Expand All @@ -1624,12 +1630,68 @@ jobs:
terraform apply \
-auto-approve=true \
-var "enabled=$ENABLE_AZ_HEALTHCHECK" \
-var "wait_for_healthcheck=$WAIT_FOR_HEALTHCHECK" \
-state=../../../updated-tfstate/azhc.tfstate
ensure:
put: az-healthcheck-tfstate
params:
file: updated-tfstate/azhc.tfstate

# Reads the three per-zone healthcheck addresses out of the az-healthcheck
# Terraform state and writes each one to terraform-variables/<region><zone>.
- task: extract-terraform-variables
  tags: [colocated-with-web]
  config:
    platform: linux
    image_resource: *terraform-image-resource
    inputs:
      - name: paas-cf
      - name: az-healthcheck-tfstate
    outputs:
      - name: terraform-variables
    params:
      REGION: ((aws_region))
    run:
      path: sh
      args:
        - -e
        - -c
        - |
          # One output file per zone suffix; `sh -e` stops the task on the
          # first terraform call that fails.
          for zone in a b c; do
            terraform output \
              -state=az-healthcheck-tfstate/azhc.tfstate \
              -raw \
              "healthcheck_address_${zone}" > "terraform-variables/${REGION}${zone}"
          done
# Runs the shared curl-az-healthcheck task against zone "a" of the region.
- task: curl-az-healthcheck-a
  file: paas-cf/concourse/tasks/curl-az-healthcheck.yml
  tags: [colocated-with-web]
  params:
    # The task exits early when healthchecks are disabled or DISABLED_AZS is set.
    ENABLE_AZ_HEALTHCHECK: ((enable_az_healthcheck))
    DISABLED_AZS: ((disabled_azs))
    AVAILABILITY_ZONE: ((aws_region))a

# Runs the shared curl-az-healthcheck task against zone "b" of the region.
- task: curl-az-healthcheck-b
  file: paas-cf/concourse/tasks/curl-az-healthcheck.yml
  tags: [colocated-with-web]
  params:
    # The task exits early when healthchecks are disabled or DISABLED_AZS is set.
    ENABLE_AZ_HEALTHCHECK: ((enable_az_healthcheck))
    DISABLED_AZS: ((disabled_azs))
    AVAILABILITY_ZONE: ((aws_region))b

# Runs the shared curl-az-healthcheck task against zone "c" of the region.
- task: curl-az-healthcheck-c
  file: paas-cf/concourse/tasks/curl-az-healthcheck.yml
  tags: [colocated-with-web]
  params:
    # The task exits early when healthchecks are disabled or DISABLED_AZS is set.
    ENABLE_AZ_HEALTHCHECK: ((enable_az_healthcheck))
    DISABLED_AZS: ((disabled_azs))
    AVAILABILITY_ZONE: ((aws_region))c

- *end-grafana-job-annotation

- name: dms-terraform
Expand Down Expand Up @@ -2604,7 +2666,9 @@ jobs:
- *add-grafana-job-annotation
- in_parallel:
- get: pipeline-trigger
passed: ['cf-terraform']
passed:
- cf-terraform
- az-healthcheck-terraform
trigger: true
- <<: *get-paas-cf
passed: ['cf-terraform']
Expand All @@ -2614,6 +2678,8 @@ jobs:
- get: bosh-tfstate
- get: cf-tfstate
passed: ['cf-terraform']
- get: az-healthcheck-tfstate
passed: ['az-healthcheck-terraform']

- in_parallel:
- do:
Expand Down
9 changes: 8 additions & 1 deletion concourse/tasks/curl-az-healthcheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ image_resource:
params:
AVAILABILITY_ZONE:
ENABLE_AZ_HEALTHCHECK:
DISABLED_AZS:
inputs:
- name: terraform-variables
- name: paas-cf
Expand All @@ -20,7 +21,13 @@ run:
echo "Checking the ability to perform tests on ${AVAILABILITY_ZONE}..."
if [ "${ENABLE_AZ_HEALTHCHECK:-}" = "false" ]; then
echo "Availabilty Zone Healthchecks have been disabled."
echo "Availability Zone Healthchecks have been disabled."
exit 0
fi
# if we have any disabled azs then we don't want to do the curl
if [ -n "${DISABLED_AZS}" ]; then
echo "DISABLED_AZS is set, so we're skipping the check here."
exit 0
fi
Expand Down
47 changes: 25 additions & 22 deletions terraform/az-monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ resource "aws_vpc" "main" {

# Internet gateway attached to the monitoring VPC. Only created when
# var.enabled is true. `count` is declared exactly once (the rendered diff
# carried both the old and the re-aligned line).
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main[0].id
  count  = var.enabled ? 1 : 0
}

resource "aws_route_table" "main" {
Expand All @@ -41,35 +41,38 @@ resource "aws_route_table" "main" {
# Healthcheck node for zone "a". Instantiated only when var.enabled is true.
# Each argument is set exactly once; wait_for_healthcheck is forwarded so the
# module can poll the node's healthcheck endpoint during apply.
module "healthcheck_a" {
  source = "./module"

  ami                  = data.aws_ami.amazon_linux_2.id
  cidr                 = "10.0.1.0/24"
  region               = var.region
  aws_route_table_id   = aws_route_table.main[0].id
  vpc_id               = aws_vpc.main[0].id
  zone                 = "a"
  count                = var.enabled ? 1 : 0
  wait_for_healthcheck = var.wait_for_healthcheck
}

# Healthcheck node for zone "b". Instantiated only when var.enabled is true.
# Each argument is set exactly once; wait_for_healthcheck is forwarded so the
# module can poll the node's healthcheck endpoint during apply.
module "healthcheck_b" {
  source = "./module"

  ami                  = data.aws_ami.amazon_linux_2.id
  cidr                 = "10.0.2.0/24"
  region               = var.region
  aws_route_table_id   = aws_route_table.main[0].id
  vpc_id               = aws_vpc.main[0].id
  zone                 = "b"
  count                = var.enabled ? 1 : 0
  wait_for_healthcheck = var.wait_for_healthcheck
}

# Healthcheck node for zone "c". Instantiated only when var.enabled is true.
# Each argument is set exactly once; wait_for_healthcheck is forwarded so the
# module can poll the node's healthcheck endpoint during apply.
module "healthcheck_c" {
  source = "./module"

  ami                  = data.aws_ami.amazon_linux_2.id
  cidr                 = "10.0.3.0/24"
  region               = var.region
  aws_route_table_id   = aws_route_table.main[0].id
  vpc_id               = aws_vpc.main[0].id
  zone                 = "c"
  count                = var.enabled ? 1 : 0
  wait_for_healthcheck = var.wait_for_healthcheck
}
28 changes: 22 additions & 6 deletions terraform/az-monitoring/module/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ resource "aws_instance" "healthcheck" {
aws_security_group.access_sg.id,
]

user_data = <<-EOF
user_data_replace_on_change = true
user_data = <<-EOF
#!/bin/bash
set -ex
Expand All @@ -80,17 +81,16 @@ WantedBy=multi-user.target
' >/etc/systemd/system/simple-healthcheck.service
systemctl daemon-reload
yum update -y --setopt=retries=0
amazon-linux-extras install docker -y --setopt=retries=0
service docker start
usermod -a -G docker ec2-user
usermod -a -G docker ec2-user
sudo systemctl enable simple-healthcheck
sudo systemctl start simple-healthcheck
EOF
user_data_replace_on_change = true

tags = {
Name = "az-healthcheck/${var.zone}"
Expand All @@ -99,4 +99,20 @@ WantedBy=multi-user.target
monitoring = true
disable_api_termination = false
ebs_optimized = true
}
}

# Polls the healthcheck instance from the machine running Terraform until
# http://<public_ip>:3000/healthcheck answers "HTTP/1.1 200 OK". The whole
# loop is wrapped in `timeout 900s`. Not created when
# var.wait_for_healthcheck is false.
resource "null_resource" "wait_for_healthcheck" {
  count      = var.wait_for_healthcheck ? 1 : 0
  depends_on = [aws_instance.healthcheck]

  # Keyed on the instance's public IP, so a new address re-runs the wait.
  triggers = {
    healthcheck_instance_ip = aws_instance.healthcheck.public_ip
  }

  provisioner "local-exec" {
    # Retries every 5 seconds; wget's response headers go to stderr, hence 2>&1.
    command = <<-EOF
      echo "Instance ID: ${aws_instance.healthcheck.id}"
      timeout 900s /bin/sh -c 'until wget --spider -S "http://${aws_instance.healthcheck.public_ip}:3000/healthcheck" 2>&1 | grep "HTTP/1.1 200 OK"; do sleep 5; done'
    EOF
  }
}
4 changes: 4 additions & 0 deletions terraform/az-monitoring/module/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ variable "zone" {
# Required input (no default): the route table the module's subnets attach to.
variable "aws_route_table_id" {
  description = "Route Table ID for association with subnets"
}

# Required input (no default). When truthy, the module creates the
# null_resource that polls the instance's healthcheck endpoint.
variable "wait_for_healthcheck" {
  description = "Wait for the healthchecks"
}
7 changes: 6 additions & 1 deletion terraform/az-monitoring/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ variable "region" {
# Master switch for AZ monitoring: main.tf uses it for `count` on the
# resources and healthcheck modules. Off unless explicitly enabled.
variable "enabled" {
  description = "Enable monitoring"
  default     = false
}

# Forwarded to each healthcheck module; defaults to waiting.
# NOTE(review): no type constraint is declared, and the create-cloudfoundry
# pipeline passes 0 or 1 through `-var wait_for_healthcheck=...`. Confirm that
# still converts cleanly before adding `type = bool`.
variable "wait_for_healthcheck" {
  description = "Wait for the healthchecks"
  default     = true
}

0 comments on commit 1256cb1

Please sign in to comment.