From ba4846fea23c2c4cc8131a9c07fc014db0d1d728 Mon Sep 17 00:00:00 2001
From: Pat Heard
Date: Fri, 8 Nov 2024 21:09:09 +0000
Subject: [PATCH] feat: add alarms for failed Glue Crawler runs

Add CloudWatch alarms and the associated resources required to
alert us when a Glue Crawler job fails.
---
Reviewer notes (this section is ignored by `git am`):

* terragrunt/env/production/alarms/terragrunt.hcl: the `glue` dependency
  must resolve `glue_crawler_log_group_name`, which this patch adds as an
  output of the glue module. Its `config_path` therefore points at
  `../glue`, matching the `dependencies` block in the same file.
* Line breaks and indentation in this patch were reconstructed from a
  whitespace-collapsed copy. Confirm that the context lines match the
  working tree before applying.

 .../workflows/terragrunt-apply-production.yml |  5 ++
 .../workflows/terragrunt-plan-production.yml  | 14 ++-
 terragrunt/aws/alarms/alarms.tf               | 48 ++++++++++
 terragrunt/aws/alarms/kms.tf                  | 75 ++++++++++++++++
 terragrunt/aws/alarms/locals.tf               |  3 +
 terragrunt/aws/alarms/sns.tf                  | 88 +++++++++++++++++++
 terragrunt/aws/alarms/variables.tf            | 10 +++
 terragrunt/aws/glue/iam.tf                    |  2 +-
 terragrunt/aws/glue/locals.tf                 |  3 +
 terragrunt/aws/glue/outputs.tf                |  4 +
 .../env/production/alarms/terragrunt.hcl      | 24 +++++
 11 files changed, 273 insertions(+), 3 deletions(-)
 create mode 100644 terragrunt/aws/alarms/alarms.tf
 create mode 100644 terragrunt/aws/alarms/kms.tf
 create mode 100644 terragrunt/aws/alarms/locals.tf
 create mode 100644 terragrunt/aws/alarms/sns.tf
 create mode 100644 terragrunt/aws/alarms/variables.tf
 create mode 100644 terragrunt/aws/glue/locals.tf
 create mode 100644 terragrunt/aws/glue/outputs.tf
 create mode 100644 terragrunt/env/production/alarms/terragrunt.hcl

diff --git a/.github/workflows/terragrunt-apply-production.yml b/.github/workflows/terragrunt-apply-production.yml
index b460d27..7ea7864 100644
--- a/.github/workflows/terragrunt-apply-production.yml
+++ b/.github/workflows/terragrunt-apply-production.yml
@@ -17,6 +17,7 @@ env:
   TERRAFORM_VERSION: 1.9.8
   TERRAGRUNT_VERSION: 0.68.6
   TF_INPUT: false
+  TF_VAR_cloudwatch_alarm_slack_webhook: ${{ secrets.PRODUCTION_SLACK_WEBHOOK_OPS }}
 
 permissions:
   id-token: write
@@ -47,6 +48,10 @@ jobs:
         working-directory: terragrunt/env/production/glue
         run: terragrunt apply --terragrunt-non-interactive -auto-approve
 
+      - name: Terragrunt apply alarms
+        working-directory: terragrunt/env/production/alarms
+        run: terragrunt apply --terragrunt-non-interactive -auto-approve
+
       - name: Report deployment to Sentinel
         if: always()
         uses: cds-snc/sentinel-forward-data-action@main
diff --git a/.github/workflows/terragrunt-plan-production.yml b/.github/workflows/terragrunt-plan-production.yml
index 853de49..a7169c0 100644
--- a/.github/workflows/terragrunt-plan-production.yml
+++ b/.github/workflows/terragrunt-plan-production.yml
@@ -17,6 +17,7 @@ env:
   TERRAFORM_VERSION: 1.9.8
   TERRAGRUNT_VERSION: 0.68.6
   TF_INPUT: false
+  TF_VAR_cloudwatch_alarm_slack_webhook: ${{ secrets.PRODUCTION_SLACK_WEBHOOK_OPS }}
 
 permissions:
   id-token: write
@@ -45,7 +46,7 @@ jobs:
         with:
           directory: "terragrunt/env/production/buckets"
           comment-delete: "true"
-          comment-title: "Production: buckets"
+          comment-title: "Production: buckets 🪣"
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           terragrunt: "true"
 
@@ -54,6 +55,15 @@ jobs:
         with:
           directory: "terragrunt/env/production/glue"
           comment-delete: "true"
-          comment-title: "Production: glue"
+          comment-title: "Production: glue 🧴"
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          terragrunt: "true"
+
+      - name: Terragrunt plan alarms
+        uses: cds-snc/terraform-plan@25afd759b2ada46a94b011fab7a81963c4f3a61a # v3.3.0
+        with:
+          directory: "terragrunt/env/production/alarms"
+          comment-delete: "true"
+          comment-title: "Production: alarms 🚨"
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           terragrunt: "true"
diff --git a/terragrunt/aws/alarms/alarms.tf b/terragrunt/aws/alarms/alarms.tf
new file mode 100644
index 0000000..413de22
--- /dev/null
+++ b/terragrunt/aws/alarms/alarms.tf
@@ -0,0 +1,48 @@
+#
+# Glue Crawler errors
+#
+resource "aws_cloudwatch_log_metric_filter" "glue_crawler_error" {
+  name           = "glue-crawler-error"
+  pattern        = local.glue_crawler_metric_filter_error_pattern
+  log_group_name = var.glue_crawler_log_group_name
+
+  metric_transformation {
+    name          = "glue-crawler-error"
+    namespace     = "data-lake"
+    value         = "1"
+    default_value = "0"
+    unit          = "Count"
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "glue_crawler_error" {
+  alarm_name          = "glue-crawler-error"
+  alarm_description   = "Errors logged over 1 minute by the Glue Crawler."
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.glue_crawler_error.metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.glue_crawler_error.metric_transformation[0].namespace
+  period              = "60"
+  statistic           = "Sum"
+  threshold           = "0"
+  treat_missing_data  = "notBreaching"
+
+  alarm_actions = [aws_sns_topic.cloudwatch_alarm_action.arn]
+  ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
+}
+
+#
+# Logs Insights queries
+#
+resource "aws_cloudwatch_query_definition" "glue_crawler_errors" {
+  name = "Glue Crawler - ERRORS"
+
+  log_group_names = [var.glue_crawler_log_group_name]
+
+  query_string = <<-QUERY
+    fields @timestamp, @message, @logStream
+    | filter @message like /${local.glue_crawler_metric_filter_error_pattern}/
+    | sort @timestamp desc
+    | limit 100
+  QUERY
+}
\ No newline at end of file
diff --git a/terragrunt/aws/alarms/kms.tf b/terragrunt/aws/alarms/kms.tf
new file mode 100644
index 0000000..6decadd
--- /dev/null
+++ b/terragrunt/aws/alarms/kms.tf
@@ -0,0 +1,75 @@
+#
+# Encrypt messages sent to the CloudWatch alarm SNS topics
+#
+resource "aws_kms_key" "cloudwatch" {
+  description         = "SNS topic for CloudWatch alarm actions"
+  enable_key_rotation = true
+  policy              = data.aws_iam_policy_document.kms_cloudwatch.json
+}
+
+data "aws_iam_policy_document" "kms_cloudwatch" {
+  # checkov:skip=CKV_AWS_109: `resources = ["*"]` identifies the KMS key to which the key policy is attached
+  # checkov:skip=CKV_AWS_111: `resources = ["*"]` identifies the KMS key to which the key policy is attached
+  # checkov:skip=CKV_AWS_356: `resources = ["*"]` identifies the KMS key to which the key policy is attached
+  statement {
+    sid       = "Enable IAM User Permissions"
+    effect    = "Allow"
+    actions   = ["kms:*"]
+    resources = ["*"]
+
+    principals {
+      type        = "AWS"
+      identifiers = ["arn:aws:iam::${var.account_id}:root"]
+    }
+  }
+
+  statement {
+    effect = "Allow"
+    actions = [
+      "kms:Encrypt*",
+      "kms:Decrypt*",
+      "kms:ReEncrypt*",
+      "kms:GenerateDataKey*",
+      "kms:Describe*"
+    ]
+    resources = ["*"]
+
+    principals {
+      type        = "Service"
+      identifiers = ["logs.${var.region}.amazonaws.com"]
+    }
+  }
+
+  statement {
+    sid    = "Allow_CloudWatch_for_CMK"
+    effect = "Allow"
+    actions = [
+      "kms:Decrypt",
+      "kms:GenerateDataKey*",
+    ]
+    resources = ["*"]
+
+    principals {
+      type        = "Service"
+      identifiers = ["cloudwatch.amazonaws.com"]
+    }
+  }
+
+  statement {
+    sid    = "CloudwatchEvents"
+    effect = "Allow"
+    actions = [
+      "kms:Encrypt*",
+      "kms:Decrypt*",
+      "kms:ReEncrypt*",
+      "kms:GenerateDataKey*",
+      "kms:Describe*"
+    ]
+    resources = ["*"]
+
+    principals {
+      type        = "Service"
+      identifiers = ["events.amazonaws.com"]
+    }
+  }
+}
diff --git a/terragrunt/aws/alarms/locals.tf b/terragrunt/aws/alarms/locals.tf
new file mode 100644
index 0000000..98da5d1
--- /dev/null
+++ b/terragrunt/aws/alarms/locals.tf
@@ -0,0 +1,3 @@
+locals {
+  glue_crawler_metric_filter_error_pattern = "ERROR"
+}
\ No newline at end of file
diff --git a/terragrunt/aws/alarms/sns.tf b/terragrunt/aws/alarms/sns.tf
new file mode 100644
index 0000000..b54136d
--- /dev/null
+++ b/terragrunt/aws/alarms/sns.tf
@@ -0,0 +1,88 @@
+#
+# SNS topics
+#
+resource "aws_sns_topic" "cloudwatch_alarm_action" {
+  name              = "cloudwatch-alarm-action"
+  kms_master_key_id = aws_kms_key.cloudwatch.arn
+}
+
+resource "aws_sns_topic" "cloudwatch_ok_action" {
+  name              = "cloudwatch-ok-action"
+  kms_master_key_id = aws_kms_key.cloudwatch.arn
+}
+
+#
+# SNS topic subscriptions
+#
+resource "aws_sns_topic_subscription" "cloudwatch_alarm_action" {
+  topic_arn = aws_sns_topic.cloudwatch_alarm_action.arn
+  protocol  = "https"
+  endpoint  = var.cloudwatch_alarm_slack_webhook
+}
+
+resource "aws_sns_topic_subscription" "cloudwatch_ok_action" {
+  topic_arn = aws_sns_topic.cloudwatch_ok_action.arn
+  protocol  = "https"
+  endpoint  = var.cloudwatch_alarm_slack_webhook
+}
+
+#
+# Allow CloudWatch to use the SNS topics
+#
+resource "aws_sns_topic_policy" "cloudwatch_alarm_action" {
+  arn    = aws_sns_topic.cloudwatch_alarm_action.arn
+  policy = data.aws_iam_policy_document.cloudwatch_events_sns_topic.json
+}
+
+resource "aws_sns_topic_policy" "cloudwatch_ok_action" {
+  arn    = aws_sns_topic.cloudwatch_ok_action.arn
+  policy = data.aws_iam_policy_document.cloudwatch_events_sns_topic.json
+}
+
+data "aws_iam_policy_document" "cloudwatch_events_sns_topic" {
+  statement {
+    # checkov:skip=CKV_AWS_111: False-positive, `resources = ["*"]` refers to the SNS topic the policy applies to
+    # checkov:skip=CKV_AWS_356: False-positive, `resources = ["*"]` refers to the SNS topic the policy applies to
+    sid    = "SNS_Default_Policy"
+    effect = "Allow"
+    actions = [
+      "SNS:Subscribe",
+      "SNS:SetTopicAttributes",
+      "SNS:RemovePermission",
+      "SNS:Receive",
+      "SNS:Publish",
+      "SNS:ListSubscriptionsByTopic",
+      "SNS:GetTopicAttributes",
+      "SNS:DeleteTopic",
+      "SNS:AddPermission",
+    ]
+
+    condition {
+      test     = "StringEquals"
+      variable = "AWS:SourceOwner"
+      values   = [var.account_id]
+    }
+
+    resources = ["*"]
+
+    principals {
+      type        = "AWS"
+      identifiers = ["*"]
+    }
+  }
+
+  statement {
+    sid    = "SNS_Publish_statement"
+    effect = "Allow"
+    actions = [
+      "sns:Publish"
+    ]
+
+    resources = ["*"]
+
+    principals {
+      type        = "Service"
+      identifiers = ["events.amazonaws.com"]
+    }
+  }
+}
\ No newline at end of file
diff --git a/terragrunt/aws/alarms/variables.tf b/terragrunt/aws/alarms/variables.tf
new file mode 100644
index 0000000..08ae382
--- /dev/null
+++ b/terragrunt/aws/alarms/variables.tf
@@ -0,0 +1,10 @@
+variable "cloudwatch_alarm_slack_webhook" {
+  description = "Slack webhook URL used by the CloudWatch alarm SNS topics."
+  type        = string
+  sensitive   = true
+}
+
+variable "glue_crawler_log_group_name" {
+  description = "The name of the Glue Crawler CloudWatch log group."
+  type        = string
+}
diff --git a/terragrunt/aws/glue/iam.tf b/terragrunt/aws/glue/iam.tf
index 646a307..924a2e0 100644
--- a/terragrunt/aws/glue/iam.tf
+++ b/terragrunt/aws/glue/iam.tf
@@ -76,7 +76,7 @@ data "aws_iam_policy_document" "glue_crawler" {
       "logs:AssociateKmsKey"
     ]
     resources = [
-      "arn:aws:logs:${var.region}:${var.account_id}:log-group:/aws-glue/crawlers-role/service-role/${aws_iam_role.glue_crawler.name}-${aws_glue_security_configuration.encryption_at_rest.name}:*"
+      "arn:aws:logs:${var.region}:${var.account_id}:log-group:${local.glue_crawler_log_group_name}:*"
     ]
   }
 }
diff --git a/terragrunt/aws/glue/locals.tf b/terragrunt/aws/glue/locals.tf
new file mode 100644
index 0000000..5e5663b
--- /dev/null
+++ b/terragrunt/aws/glue/locals.tf
@@ -0,0 +1,3 @@
+locals {
+  glue_crawler_log_group_name = "/aws-glue/crawlers-role${aws_iam_role.glue_crawler.path}${aws_iam_role.glue_crawler.name}-${aws_glue_security_configuration.encryption_at_rest.name}"
+}
\ No newline at end of file
diff --git a/terragrunt/aws/glue/outputs.tf b/terragrunt/aws/glue/outputs.tf
new file mode 100644
index 0000000..b73579d
--- /dev/null
+++ b/terragrunt/aws/glue/outputs.tf
@@ -0,0 +1,4 @@
+output "glue_crawler_log_group_name" {
+  description = "The name of the Glue Crawler CloudWatch log group."
+  value       = local.glue_crawler_log_group_name
+}
\ No newline at end of file
diff --git a/terragrunt/env/production/alarms/terragrunt.hcl b/terragrunt/env/production/alarms/terragrunt.hcl
new file mode 100644
index 0000000..6c8af8c
--- /dev/null
+++ b/terragrunt/env/production/alarms/terragrunt.hcl
@@ -0,0 +1,24 @@
+terraform {
+  source = "../../../aws//alarms"
+}
+
+dependencies {
+  paths = ["../glue"]
+}
+
+dependency "glue" {
+  config_path                             = "../glue"
+  mock_outputs_merge_strategy_with_state  = "shallow"
+  mock_outputs_allowed_terraform_commands = ["init", "fmt", "validate", "plan", "show"]
+  mock_outputs = {
+    glue_crawler_log_group_name = "mock-glue-crawler-log-group"
+  }
+}
+
+inputs = {
+  glue_crawler_log_group_name = dependency.glue.outputs.glue_crawler_log_group_name
+}
+
+include {
+  path = find_in_parent_folders()
+}