diff --git a/terragrunt/aws/alarms/alarms.tf b/terragrunt/aws/alarms/alarms.tf
index 9094b25..379d14f 100644
--- a/terragrunt/aws/alarms/alarms.tf
+++ b/terragrunt/aws/alarms/alarms.tf
@@ -31,26 +31,6 @@ resource "aws_cloudwatch_metric_alarm" "glue_crawler_error" {
   ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
 }
 
-resource "aws_cloudwatch_metric_alarm" "glue_job_failures" {
-  alarm_name          = "glue-job-failures"
-  alarm_description   = "Failed Glue jobs in a 1 minute period."
-  comparison_operator = "GreaterThanThreshold"
-  evaluation_periods  = "1"
-  metric_name         = local.glue_job_failure_metric_name
-  namespace           = local.data_lake_namespace
-  period              = "60"
-  statistic           = "Sum"
-  threshold           = "0"
-  treat_missing_data  = "notBreaching"
-
-  alarm_actions = [aws_sns_topic.cloudwatch_alarm_action.arn]
-  ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
-
-  dimensions = {
-    JobName = "*"
-  }
-}
-
 #
 # Log Insight queries
 #
diff --git a/terragrunt/aws/alarms/eventbridge.tf b/terragrunt/aws/alarms/eventbridge.tf
index e72018f..1baa7f4 100644
--- a/terragrunt/aws/alarms/eventbridge.tf
+++ b/terragrunt/aws/alarms/eventbridge.tf
@@ -11,27 +11,32 @@ resource "aws_cloudwatch_event_rule" "glue_job_failure" {
   })
 }
 
+#
+# Publish Glue Job failures to SNS using a CloudWatch Alarm message payload.
+# This allows us to use the existing SRE Bot webhooks to post to Slack.
+# Tokens such as <state> in input_template are placeholders bound by input_paths.
+#
 resource "aws_cloudwatch_event_target" "glue_job_failure" {
   rule      = aws_cloudwatch_event_rule.glue_job_failure.name
-  target_id = "PublishMetric"
-  arn       = "arn:aws:events:${var.region}:${var.account_id}:api-destination/cloudwatch-metrics"
+  target_id = "send-to-sns"
+  arn       = aws_sns_topic.cloudwatch_alarm_action.arn
 
   input_transformer {
     input_paths = {
       jobName = "$.detail.jobName"
       state   = "$.detail.state"
+      message = "$.detail.message"
     }
     input_template = jsonencode({
-      MetricData = [{
-        MetricName = local.glue_job_failure_metric_name
-        Value      = 1
-        Unit       = "Count"
-        Dimensions = [{
-          Name  = "JobName"
-          Value = "<jobName>"
-        }]
-      }]
-      Namespace = local.data_lake_namespace
+      Message = jsonencode({
+        AlarmArn         = "arn:aws:cloudwatch:${var.region}:${var.account_id}:alarm:glue-job-failure",
+        AlarmName        = "glue-job-failure",
+        AlarmDescription = "`<state>` detected for Glue job `<jobName>`",
+        AWSAccountId     = var.account_id,
+        OldStateValue    = "OK",
+        NewStateValue    = "ALARM",
+        NewStateReason   = "<message>",
+      })
     })
   }
 }
diff --git a/terragrunt/aws/alarms/locals.tf b/terragrunt/aws/alarms/locals.tf
index 154bd09..30eaf1b 100644
--- a/terragrunt/aws/alarms/locals.tf
+++ b/terragrunt/aws/alarms/locals.tf
@@ -2,5 +2,4 @@ locals {
   data_lake_namespace                      = "data-lake"
   glue_crawler_metric_filter_error_pattern = "ERROR"
   glue_crawler_error_metric_name           = "glue-crawler-error"
-  glue_job_failure_metric_name             = "glue-job-failure"
 }
\ No newline at end of file