diff --git a/aws/alarms/dashboards.tf b/aws/alarms/dashboards.tf new file mode 100644 index 000000000..0cf09dfd8 --- /dev/null +++ b/aws/alarms/dashboards.tf @@ -0,0 +1,23 @@ +resource "aws_cloudwatch_dashboard" "forms_service_health" { + dashboard_name = "Forms-Service-Health" + dashboard_body = templatefile("${path.module}/dashboards/forms_service_health.tmpl.json", { + alarm_ecs_cpu_utilization_warn = aws_cloudwatch_metric_alarm.forms_cpu_utilization_high_warn.arn, + alarm_ecs_memory_utilization_warn = aws_cloudwatch_metric_alarm.forms_memory_utilization_high_warn.arn, + alarm_lb_response_5xx_warn = aws_cloudwatch_metric_alarm.ELB_5xx_error_warn.arn, + alarm_lb_response_time_warn = aws_cloudwatch_metric_alarm.response_time_warn.arn, + alarm_lb_unhealth_host_count_tg1 = aws_cloudwatch_metric_alarm.UnHealthyHostCount-TargetGroup1.arn, + alarm_lb_unhealth_host_count_tg2 = aws_cloudwatch_metric_alarm.UnHealthyHostCount-TargetGroup2.arn, + alarm_reliability_deadletter_queue = aws_cloudwatch_metric_alarm.reliability_dead_letter_queue_warn.arn, + lb_arn_suffix = var.lb_arn_suffix, + ecs_cloudwatch_log_group_name = var.ecs_cloudwatch_log_group_name, + ecs_cluster_name = var.ecs_cluster_name, + ecs_service_name = var.ecs_service_name, + lambda_nagware_log_group_name = var.lambda_nagware_log_group_name, + lambda_reliability_log_group_name = var.lambda_reliability_log_group_name, + lambda_response_archiver_log_group_name = var.lambda_response_archiver_log_group_name, + lambda_submission_log_group_name = var.lambda_submission_log_group_name, + lambda_vault_integrity_log_group_name = var.lambda_vault_integrity_log_group_name, + rds_cluster_identifier = var.rds_cluster_identifier, + region = var.region + }) +} diff --git a/aws/alarms/dashboards/forms_service_health.tmpl.json b/aws/alarms/dashboards/forms_service_health.tmpl.json new file mode 100644 index 000000000..030b27b10 --- /dev/null +++ b/aws/alarms/dashboards/forms_service_health.tmpl.json @@ -0,0 +1,541 @@ +{ + 
"widgets": [ + { + "height": 12, + "width": 12, + "y": 2, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "forms", "ClientSubmitSuccess", { "color": "#2ca02c" } ], + [ ".", "ClientSubmitFailed", { "color": "#d62728" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "App: client submissions" + } + }, + { + "height": 6, + "width": 12, + "y": 2, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "forms", "SubmissionSuccess", { "color": "#2ca02c" } ], + [ ".", "SubmissionWarn", { "color": "#ffbb78" } ], + [ ".", "SubmissionFailed", { "color": "#d62728" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Lambda: submission" + } + }, + { + "height": 8, + "width": 6, + "y": 74, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/RDS", "CPUUtilization", "DBClusterIdentifier", "${rds_cluster_identifier}", { "region": "${region}", "color": "#17becf" } ] + ], + "sparkline": true, + "view": "timeSeries", + "region": "${region}", + "stat": "Average", + "period": 60, + "title": "DB: CPU use", + "stacked": false + } + }, + { + "height": 6, + "width": 12, + "y": 8, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "forms", "ReliabilitySuccess", { "color": "#2ca02c" } ], + [ ".", "ReliabilityWarn", { "color": "#ffbb78" } ], + [ ".", "ReliabilityFailed", { "color": "#d62728" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Lambda: reliability" + } + }, + { + "height": 2, + "width": 24, + "y": 20, + "x": 0, + "type": "text", + "properties": { + "markdown": "", + "background": "transparent" + } + }, + { + "height": 8, + "width": 6, + "y": 74, + "x": 6, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/RDS", "FreeableMemory", "DBClusterIdentifier", "${rds_cluster_identifier}", { 
"color": "#9467bd" } ] + ], + "sparkline": true, + "view": "timeSeries", + "region": "${region}", + "stat": "Average", + "period": 60, + "title": "DB: freeable memory", + "stacked": false + } + }, + { + "height": 8, + "width": 6, + "y": 74, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/RDS", "ReadLatency", "DBClusterIdentifier", "${rds_cluster_identifier}", { "color": "#c5b0d5" } ] + ], + "sparkline": true, + "view": "timeSeries", + "region": "${region}", + "stat": "Average", + "period": 60, + "title": "DB: read latency", + "stacked": false + } + }, + { + "height": 8, + "width": 6, + "y": 74, + "x": 18, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/RDS", "WriteLatency", "DBClusterIdentifier", "${rds_cluster_identifier}", { "color": "#7f7f7f" } ] + ], + "sparkline": true, + "view": "timeSeries", + "region": "${region}", + "stat": "Average", + "period": 60, + "title": "DB: write latency", + "stacked": false + } + }, + { + "height": 2, + "width": 24, + "y": 0, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Form submissions\nTracking form submissions flow through the system.", + "background": "transparent" + } + }, + { + "height": 6, + "width": 8, + "y": 14, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/SQS", "NumberOfMessagesReceived", "QueueName", "submission_processing.fifo", { "color": "#8c564b" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Queue: submission messages" + } + }, + { + "height": 6, + "width": 4, + "y": 14, + "x": 8, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/SQS", "ApproximateAgeOfOldestMessage", "QueueName", "submission_processing.fifo", { "color": "#7f7f7f", "region": "${region}", "label": "Oldest message age" } ] + ], + "sparkline": true, + "view": "singleValue", + "region": "${region}", + "stat": "Average", + "period": 300, + "title": "Queue: submission message age" + 
} + }, + { + "height": 6, + "width": 12, + "y": 14, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "forms", "ReliabilityNotifySendSuccess", { "region": "${region}", "color": "#2ca02c" } ], + [ ".", "ReliabilityNotifySendFailed", { "region": "${region}", "color": "#d62728" } ], + [ ".", "ReliabilityVaultSaveSuccess", { "color": "#1f77b4", "region": "${region}" } ], + [ ".", "ReliabilityVaultSaveFailed", { "color": "#ff7f0e", "region": "${region}" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Lambda: reliability send/save" + } + }, + { + "height": 2, + "width": 24, + "y": 43, + "x": 0, + "type": "text", + "properties": { + "markdown": "", + "background": "transparent" + } + }, + { + "height": 2, + "width": 24, + "y": 22, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Errors\nError logs and alarms from the app and lambdas.", + "background": "transparent" + } + }, + { + "height": 2, + "width": 24, + "y": 45, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Lambdas\nPerformance metrics for the Lambda functions.", + "background": "transparent" + } + }, + { + "height": 7, + "width": 8, + "y": 36, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "CPUUtilization", "ServiceName", "${ecs_service_name}", "ClusterName", "${ecs_cluster_name}", { "stat": "Minimum", "region": "${region}" } ], + [ "...", { "stat": "Maximum", "region": "${region}" } ], + [ "...", { "stat": "Average", "region": "${region}" } ] + ], + "period": 300, + "region": "${region}", + "stacked": false, + "title": "App: CPU use", + "view": "timeSeries" + } + }, + { + "height": 7, + "width": 8, + "y": 36, + "x": 8, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "MemoryUtilization", "ServiceName", "${ecs_service_name}", "ClusterName", "${ecs_cluster_name}", { "stat": "Minimum" } ], + [ "...", { "stat": "Maximum" } ], + [ "...", { "stat": 
"Average" } ] + ], + "period": 300, + "region": "${region}", + "stacked": false, + "title": "App: memory use", + "view": "timeSeries" + } + }, + { + "height": 2, + "width": 24, + "y": 59, + "x": 0, + "type": "text", + "properties": { + "markdown": "", + "background": "transparent" + } + }, + { + "height": 2, + "width": 24, + "y": 61, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Load balancer\nRequests, errors and response time for the app's load balancer.", + "background": "transparent" + } + }, + { + "height": 6, + "width": 18, + "y": 47, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/Lambda", "Invocations", "FunctionName", "Submission", { "region": "${region}" } ], + [ ".", "Throttles", ".", ".", { "color": "#ffbb78", "region": "${region}" } ], + [ ".", "Errors", ".", ".", { "region": "${region}" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Lambda: submission" + } + }, + { + "height": 6, + "width": 6, + "y": 47, + "x": 18, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/Lambda", "Duration", "FunctionName", "Submission", "Resource", "Submission", { "region": "${region}", "color": "#555555" } ] + ], + "sparkline": true, + "view": "singleValue", + "region": "${region}", + "title": "Lambda: submission duration", + "stacked": false, + "period": 300, + "stat": "Average" + } + }, + { + "height": 6, + "width": 18, + "y": 53, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/Lambda", "Invocations", "FunctionName", "reliability", "Resource", "reliability", { "region": "${region}" } ], + [ ".", "Throttles", ".", ".", ".", ".", { "region": "${region}", "color": "#ffbb78" } ], + [ ".", "Errors", ".", ".", ".", ".", { "region": "${region}" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "Lambda: reliability" + } + }, + { + "height": 6, + "width": 
6, + "y": 53, + "x": 18, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/Lambda", "Duration", "FunctionName", "reliability", "Resource", "reliability", { "color": "#555555" } ] + ], + "sparkline": true, + "view": "singleValue", + "region": "${region}", + "title": "Lambda: reliability duration", + "stacked": false, + "period": 300, + "stat": "Average" + } + }, + { + "height": 7, + "width": 8, + "y": 36, + "x": 16, + "type": "metric", + "properties": { + "metrics": [ + [ "ECS/ContainerInsights", "NetworkRxBytes", "ClusterName", "${ecs_cluster_name}", { "region": "${region}", "color": "#1f77b4" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "App: network bytes" + } + }, + { + "height": 7, + "width": 9, + "y": 63, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ApplicationELB", "RequestCount", "LoadBalancer", "${lb_arn_suffix}", { "color": "#2ca02c", "region": "${region}", "label": "Request count" } ], + [ ".", "HTTPCode_ELB_4XX_Count", ".", ".", { "region": "${region}", "color": "#ffbb78", "label": "4XX response count" } ], + [ ".", "HTTPCode_ELB_5XX_Count", ".", ".", { "color": "#d62728", "region": "${region}", "label": "5XX response count" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Sum", + "period": 300, + "title": "LB: requests" + } + }, + { + "height": 7, + "width": 6, + "y": 63, + "x": 18, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ApplicationELB", "TargetResponseTime", "LoadBalancer", "${lb_arn_suffix}", { "region": "${region}", "color": "#8c564b" } ] + ], + "view": "singleValue", + "stacked": false, + "region": "${region}", + "sparkline": true, + "period": 300, + "title": "LB: response time", + "stat": "Average" + } + }, + { + "height": 2, + "width": 24, + "y": 70, + "x": 0, + "type": "text", + "properties": { + "markdown": "", + "background": "transparent" + } + }, + { + "height": 2, + 
"width": 24, + "y": 72, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Database\nPerformance metrics for the database cluster.", + "background": "transparent" + } + }, + { + "height": 7, + "width": 9, + "y": 63, + "x": 9, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ApplicationELB", "ActiveConnectionCount", "LoadBalancer", "${lb_arn_suffix}", { "color": "#e377c2" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "${region}", + "stat": "Average", + "period": 300, + "title": "LB: connections" + } + }, + { + "type": "log", + "x": 0, + "y": 24, + "width": 20, + "height": 8, + "properties": { + "query": "SOURCE '${ecs_cloudwatch_log_group_name}' | SOURCE '${lambda_reliability_log_group_name}' | SOURCE '${lambda_submission_log_group_name}' | SOURCE '${lambda_nagware_log_group_name}' | SOURCE '${lambda_response_archiver_log_group_name}' | SOURCE '${lambda_vault_integrity_log_group_name}' | fields @timestamp, @message, @logStream, @log\n| filter level = 'error' or level = 'warn' or status = 'failed'\n| sort @timestamp desc\n| limit 1000", + "region": "${region}", + "stacked": false, + "title": "Errors: app and lambdas", + "view": "table" + } + }, + { + "type": "alarm", + "x": 20, + "y": 24, + "width": 4, + "height": 8, + "properties": { + "title": "Alarms", + "alarms": [ + "${alarm_ecs_cpu_utilization_warn}", + "${alarm_ecs_memory_utilization_warn}", + "${alarm_lb_response_5xx_warn}", + "${alarm_lb_response_time_warn}", + "${alarm_lb_unhealth_host_count_tg1}", + "${alarm_lb_unhealth_host_count_tg2}", + "${alarm_reliability_deadletter_queue}" + ] + } + }, + { + "height": 2, + "width": 24, + "y": 32, + "x": 0, + "type": "text", + "properties": { + "markdown": "", + "background": "transparent" + } + }, + { + "height": 2, + "width": 24, + "y": 34, + "x": 0, + "type": "text", + "properties": { + "markdown": "# App\nPerformance metrics for the ECS form viewer client app.", + "background": "transparent" + } + } + ] +} \ No newline 
at end of file diff --git a/aws/alarms/inputs.tf b/aws/alarms/inputs.tf index 8627816ce..38c1e7f4f 100644 --- a/aws/alarms/inputs.tf +++ b/aws/alarms/inputs.tf @@ -105,6 +105,11 @@ variable "opsgenie_api_key" { sensitive = true } +variable "rds_cluster_identifier" { + description = "RDS cluster identifier used for alarms and dashboards" + type = string +} + variable "sqs_reliability_deadletter_queue_arn" { description = "ARN of the Reliability queue's SQS Dead Letter Queue" type = string @@ -158,4 +163,4 @@ variable "sns_topic_alert_ok_us_east_arn" { variable "ecr_repository_url_notify_slack_lambda" { description = "URL of the Notify Slack Lambda ECR" type = string -} \ No newline at end of file +} diff --git a/aws/rds/outputs.tf b/aws/rds/outputs.tf index 9390ea718..04c2e3ee6 100644 --- a/aws/rds/outputs.tf +++ b/aws/rds/outputs.tf @@ -13,6 +13,11 @@ output "rds_cluster_arn" { value = aws_rds_cluster.forms.arn } +output "rds_cluster_identifier" { + description = "RDS cluster identifier" + value = aws_rds_cluster.forms.cluster_identifier +} + output "rds_db_name" { description = "Name of the database" value = var.rds_db_name diff --git a/env/cloud/alarms/terragrunt.hcl b/env/cloud/alarms/terragrunt.hcl index 43af26603..69de8f78d 100644 --- a/env/cloud/alarms/terragrunt.hcl +++ b/env/cloud/alarms/terragrunt.hcl @@ -3,7 +3,7 @@ terraform { } dependencies { - paths = ["../hosted_zone", "../kms", "../load_balancer", "../sqs", "../app", "../sns", "../lambdas", "../ecr"] + paths = ["../hosted_zone", "../kms", "../load_balancer", "../rds", "../sqs", "../app", "../sns", "../lambdas", "../ecr"] } locals { @@ -43,6 +43,16 @@ dependency "load_balancer" { } } +dependency "rds" { + config_path = "../rds" + + mock_outputs_allowed_terraform_commands = ["init", "fmt", "validate", "plan", "show"] + mock_outputs_merge_strategy_with_state = "shallow" + mock_outputs = { + rds_cluster_identifier = "forms-mock-db-cluster" + } +} + dependency "sqs" { config_path = "../sqs" @@ -140,6 
+150,8 @@ inputs = { lambda_vault_integrity_log_group_name = dependency.lambdas.outputs.lambda_vault_integrity_log_group_name lambda_vault_integrity_function_name = dependency.lambdas.outputs.lambda_vault_integrity_function_name + rds_cluster_identifier = dependency.rds.outputs.rds_cluster_identifier + sns_topic_alert_critical_arn = dependency.sns.outputs.sns_topic_alert_critical_arn sns_topic_alert_warning_arn = dependency.sns.outputs.sns_topic_alert_warning_arn sns_topic_alert_ok_arn = dependency.sns.outputs.sns_topic_alert_ok_arn