Skip to content

Commit

Permalink
feat: add alarms for failed Glue Crawler runs (#15)
Browse files Browse the repository at this point in the history
Add CloudWatch alarms and the associated resources required to
alert us when a Glue Crawler job fails.
  • Loading branch information
patheard authored Nov 8, 2024
1 parent deda2ff commit 9353820
Show file tree
Hide file tree
Showing 11 changed files with 273 additions and 3 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/terragrunt-apply-production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ env:
TERRAFORM_VERSION: 1.9.8
TERRAGRUNT_VERSION: 0.68.6
TF_INPUT: false
TF_VAR_cloudwatch_alarm_slack_webhook: ${{ secrets.PRODUCTION_SLACK_WEBHOOK_OPS }}

permissions:
id-token: write
Expand Down Expand Up @@ -47,6 +48,10 @@ jobs:
working-directory: terragrunt/env/production/glue
run: terragrunt apply --terragrunt-non-interactive -auto-approve

- name: Terragrunt apply alarms
working-directory: terragrunt/env/production/alarms
run: terragrunt apply --terragrunt-non-interactive -auto-approve

- name: Report deployment to Sentinel
if: always()
uses: cds-snc/sentinel-forward-data-action@main
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/terragrunt-plan-production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ env:
TERRAFORM_VERSION: 1.9.8
TERRAGRUNT_VERSION: 0.68.6
TF_INPUT: false
TF_VAR_cloudwatch_alarm_slack_webhook: ${{ secrets.PRODUCTION_SLACK_WEBHOOK_OPS }}

permissions:
id-token: write
Expand Down Expand Up @@ -45,7 +46,7 @@ jobs:
with:
directory: "terragrunt/env/production/buckets"
comment-delete: "true"
comment-title: "Production: buckets"
comment-title: "Production: buckets 🪣"
github-token: "${{ secrets.GITHUB_TOKEN }}"
terragrunt: "true"

Expand All @@ -54,6 +55,15 @@ jobs:
with:
directory: "terragrunt/env/production/glue"
comment-delete: "true"
comment-title: "Production: glue"
comment-title: "Production: glue 🧴"
github-token: "${{ secrets.GITHUB_TOKEN }}"
terragrunt: "true"

- name: Terragrunt plan alarms
uses: cds-snc/terraform-plan@25afd759b2ada46a94b011fab7a81963c4f3a61a # v3.3.0
with:
directory: "terragrunt/env/production/alarms"
comment-delete: "true"
comment-title: "Production: alarms 🚨"
github-token: "${{ secrets.GITHUB_TOKEN }}"
terragrunt: "true"
48 changes: 48 additions & 0 deletions terragrunt/aws/alarms/alarms.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#
# Glue Crawler errors
#
# Turns Glue Crawler log events that match the shared error pattern into a
# "glue-crawler-error" count metric in the "data-lake" namespace.
resource "aws_cloudwatch_log_metric_filter" "glue_crawler_error" {
  name           = "glue-crawler-error"
  log_group_name = var.glue_crawler_log_group_name
  pattern        = local.glue_crawler_metric_filter_error_pattern

  metric_transformation {
    namespace = "data-lake"
    name      = "glue-crawler-error"
    unit      = "Count"

    # A matching log event publishes 1; default_value publishes 0 otherwise.
    value         = "1"
    default_value = "0"
  }
}

# Alarm on the Glue Crawler error metric: breaches when the one-minute sum is
# above zero and notifies the alarm / OK SNS topics on state changes.
resource "aws_cloudwatch_metric_alarm" "glue_crawler_error" {
  alarm_name        = "glue-crawler-error"
  alarm_description = "Errors logged over 1 minute by the Glue Crawler."

  # Metric identity is read from the metric filter so the two stay in sync.
  namespace   = aws_cloudwatch_log_metric_filter.glue_crawler_error.metric_transformation[0].namespace
  metric_name = aws_cloudwatch_log_metric_filter.glue_crawler_error.metric_transformation[0].name

  # Evaluate a single 60 second period; a sum greater than 0 breaches.
  statistic           = "Sum"
  period              = "60"
  evaluation_periods  = "1"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0"

  # Periods with no data points are treated as not breaching.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.cloudwatch_alarm_action.arn]
  ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
}

#
# Log Insight queries
#
# Saved Logs Insights query that lists the 100 most recent Glue Crawler log
# events matching the same pattern the metric filter uses.
resource "aws_cloudwatch_query_definition" "glue_crawler_errors" {
  name            = "Glue Crawler - ERRORS"
  log_group_names = [var.glue_crawler_log_group_name]

  # The indented heredoc (`<<-`) strips the common leading whitespace, so the
  # query text itself starts at column 0.
  query_string = <<-QUERY
    fields @timestamp, @message, @logStream
    | filter @message like /${local.glue_crawler_metric_filter_error_pattern}/
    | sort @timestamp desc
    | limit 100
  QUERY
}
75 changes: 75 additions & 0 deletions terragrunt/aws/alarms/kms.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#
# Encrypt messages sent to the CloudWatch alarm SNS topics
#
# Customer managed key referenced by the CloudWatch alarm SNS topics.
# Its key policy comes from the `kms_cloudwatch` policy document.
resource "aws_kms_key" "cloudwatch" {
  description         = "SNS topic for CloudWatch alarm actions"
  policy              = data.aws_iam_policy_document.kms_cloudwatch.json
  enable_key_rotation = true
}

# Key policy for the CloudWatch alarm KMS key: full control for the account
# root, plus scoped key usage for the logs, cloudwatch and events services.
data "aws_iam_policy_document" "kms_cloudwatch" {
  # checkov:skip=CKV_AWS_109: `resources = ["*"]` identifies the KMS key to which the key policy is attached
  # checkov:skip=CKV_AWS_111: `resources = ["*"]` identifies the KMS key to which the key policy is attached
  # checkov:skip=CKV_AWS_356: `resources = ["*"]` identifies the KMS key to which the key policy is attached

  # Account root principal: every KMS action on this key.
  statement {
    sid       = "Enable IAM User Permissions"
    effect    = "Allow"
    resources = ["*"]
    actions   = ["kms:*"]

    principals {
      type        = "AWS"
      identifiers = ["arn:aws:iam::${var.account_id}:root"]
    }
  }

  # Regional CloudWatch Logs service principal: encrypt/decrypt and describe.
  statement {
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "kms:Encrypt*",
      "kms:Decrypt*",
      "kms:ReEncrypt*",
      "kms:GenerateDataKey*",
      "kms:Describe*"
    ]

    principals {
      type        = "Service"
      identifiers = ["logs.${var.region}.amazonaws.com"]
    }
  }

  # CloudWatch service principal: decrypt and generate data keys only.
  statement {
    sid       = "Allow_CloudWatch_for_CMK"
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "kms:Decrypt",
      "kms:GenerateDataKey*",
    ]

    principals {
      type        = "Service"
      identifiers = ["cloudwatch.amazonaws.com"]
    }
  }

  # Events service principal: encrypt/decrypt and describe.
  statement {
    sid       = "CloudwatchEvents"
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "kms:Encrypt*",
      "kms:Decrypt*",
      "kms:ReEncrypt*",
      "kms:GenerateDataKey*",
      "kms:Describe*"
    ]

    principals {
      type        = "Service"
      identifiers = ["events.amazonaws.com"]
    }
  }
}
3 changes: 3 additions & 0 deletions terragrunt/aws/alarms/locals.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
locals {
  # Pattern shared by the Glue Crawler metric filter and the Logs Insights
  # query so both select the same log events.
  glue_crawler_metric_filter_error_pattern = "ERROR"
}
88 changes: 88 additions & 0 deletions terragrunt/aws/alarms/sns.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#
# SNS topics
#
# Topic notified when an alarm enters the ALARM state (used as `alarm_actions`).
resource "aws_sns_topic" "cloudwatch_alarm_action" {
  kms_master_key_id = aws_kms_key.cloudwatch.arn
  name              = "cloudwatch-alarm-action"
}

# Topic notified when an alarm returns to OK (used as `ok_actions`).
resource "aws_sns_topic" "cloudwatch_ok_action" {
  kms_master_key_id = aws_kms_key.cloudwatch.arn
  name              = "cloudwatch-ok-action"
}

#
# SNS topic subscriptions
#
# Both topics deliver over HTTPS to the same webhook URL, which is supplied
# through the sensitive `cloudwatch_alarm_slack_webhook` variable.
resource "aws_sns_topic_subscription" "cloudwatch_alarm_action" {
  protocol  = "https"
  endpoint  = var.cloudwatch_alarm_slack_webhook
  topic_arn = aws_sns_topic.cloudwatch_alarm_action.arn
}

resource "aws_sns_topic_subscription" "cloudwatch_ok_action" {
  protocol  = "https"
  endpoint  = var.cloudwatch_alarm_slack_webhook
  topic_arn = aws_sns_topic.cloudwatch_ok_action.arn
}

#
# Allow CloudWatch to use the SNS topics
#
# The alarm and OK topics share a single access policy document.
resource "aws_sns_topic_policy" "cloudwatch_alarm_action" {
  policy = data.aws_iam_policy_document.cloudwatch_events_sns_topic.json
  arn    = aws_sns_topic.cloudwatch_alarm_action.arn
}

resource "aws_sns_topic_policy" "cloudwatch_ok_action" {
  policy = data.aws_iam_policy_document.cloudwatch_events_sns_topic.json
  arn    = aws_sns_topic.cloudwatch_ok_action.arn
}

# Access policy attached to both CloudWatch alarm SNS topics.
data "aws_iam_policy_document" "cloudwatch_events_sns_topic" {
  # Topic management and publish actions for any AWS principal, restricted by
  # the `AWS:SourceOwner` condition to this account.
  statement {
    # checkov:skip=CKV_AWS_111: False-positive, `resources = ["*"]` refers to the SNS topic the policy applies to
    # checkov:skip=CKV_AWS_356: False-positive, `resources = ["*"]` refers to the SNS topic the policy applies to
    sid       = "SNS_Default_Policy"
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "SNS:Subscribe",
      "SNS:SetTopicAttributes",
      "SNS:RemovePermission",
      "SNS:Receive",
      "SNS:Publish",
      "SNS:ListSubscriptionsByTopic",
      "SNS:GetTopicAttributes",
      "SNS:DeleteTopic",
      "SNS:AddPermission",
    ]

    principals {
      type        = "AWS"
      identifiers = ["*"]
    }

    condition {
      test     = "StringEquals"
      variable = "AWS:SourceOwner"
      values   = [var.account_id]
    }
  }

  # Publish-only access for the events service principal.
  statement {
    sid       = "SNS_Publish_statement"
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "sns:Publish"
    ]

    principals {
      type        = "Service"
      identifiers = ["events.amazonaws.com"]
    }
  }
}
10 changes: 10 additions & 0 deletions terragrunt/aws/alarms/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Marked sensitive so the webhook URL is redacted from plan/apply output.
variable "cloudwatch_alarm_slack_webhook" {
  type        = string
  sensitive   = true
  description = "Slack webhook URL used by the CloudWatch alarm SNS topics."
}

# Log group that the metric filter and the Logs Insights query read from.
variable "glue_crawler_log_group_name" {
  type        = string
  description = "The name of the Glue Crawler CloudWatch log group."
}
2 changes: 1 addition & 1 deletion terragrunt/aws/glue/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ data "aws_iam_policy_document" "glue_crawler" {
"logs:AssociateKmsKey"
]
resources = [
"arn:aws:logs:${var.region}:${var.account_id}:log-group:/aws-glue/crawlers-role/service-role/${aws_iam_role.glue_crawler.name}-${aws_glue_security_configuration.encryption_at_rest.name}:*"
"arn:aws:logs:${var.region}:${var.account_id}:log-group:${local.glue_crawler_log_group_name}:*"
]
}
}
3 changes: 3 additions & 0 deletions terragrunt/aws/glue/locals.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
locals {
  # Glue Crawler CloudWatch log group name, assembled from the crawler role's
  # path and name plus the security configuration name.
  glue_crawler_log_group_name = "/aws-glue/crawlers-role${aws_iam_role.glue_crawler.path}${aws_iam_role.glue_crawler.name}-${aws_glue_security_configuration.encryption_at_rest.name}"
}
4 changes: 4 additions & 0 deletions terragrunt/aws/glue/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Exposes the locally computed log group name as a module output.
output "glue_crawler_log_group_name" {
  value       = local.glue_crawler_log_group_name
  description = "The name of the Glue Crawler CloudWatch log group."
}
24 changes: 24 additions & 0 deletions terragrunt/env/production/alarms/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Terragrunt configuration for the production `alarms` module.
terraform {
  source = "../../../aws//alarms"
}

# Ensure the glue module is applied before this one in `run-all` commands.
dependencies {
  paths = ["../glue"]
}

# Read outputs from the glue module. The config path must point at `../glue`
# because that is the module that defines `glue_crawler_log_group_name`;
# pointing it at `../buckets` leaves the input below unresolved on apply,
# where mock outputs are not allowed.
dependency "glue" {
  config_path                             = "../glue"
  mock_outputs_merge_strategy_with_state  = "shallow"
  mock_outputs_allowed_terraform_commands = ["init", "fmt", "validate", "plan", "show"]

  # Placeholder used only by the commands listed above when the dependency
  # has no real output yet.
  mock_outputs = {
    glue_crawler_log_group_name = "mock-glue-crawler-log-group"
  }
}

inputs = {
  glue_crawler_log_group_name = dependency.glue.outputs.glue_crawler_log_group_name
}

# Pull in the shared parent Terragrunt configuration.
include {
  path = find_in_parent_folders()
}

0 comments on commit 9353820

Please sign in to comment.