From f1b2d8ba9f7cb9f0be1a67ad88ea11bb72cf513a Mon Sep 17 00:00:00 2001
From: Aleksandr Kurlov
Date: Wed, 17 Jul 2024 22:23:15 +0200
Subject: [PATCH 1/2] Add emailsender alerts

Add a `rhacs-emailsender` rule group with five warning-severity alerts:
scrape failures, container down, frequent container restarts, a send
error ratio above 10%, and throttled sends. Each alert gets its own
unit-test file under resources/prometheus/unit_tests/.

---
 resources/prometheus/prometheus-rules.yaml    | 53 +++++++++++++++++++
 .../RHACSEmailsenderContainerDown.yaml        | 27 ++++++++++
 ...ilsenderContainerFrequentlyRestarting.yaml | 27 ++++++++++
 .../RHACSEmailsenderScrapeFailed.yaml         | 29 ++++++++++
 .../RHACSEmailsenderSendErrors.yaml           | 29 ++++++++++
 .../RHACSEmailsenderThrottledSend.yaml        | 28 ++++++++++
 6 files changed, 193 insertions(+)
 create mode 100644 resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml
 create mode 100644 resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml
 create mode 100644 resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml
 create mode 100644 resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml
 create mode 100644 resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index 7a5930c..f894a37 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -159,6 +159,59 @@ spec:
             description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"
 
+    - name: rhacs-emailsender
+      rules:
+        - alert: RHACSEmailsenderScrapeFailed
+          expr: |
+            (avg_over_time(up{pod=~"emailsender-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"emailsender-.*"} == 1) or absent(up{pod=~"emailsender-.*"})
+          for: 20m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`."
+            description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
+        - alert: RHACSEmailsenderContainerDown
+          expr: |
+            avg_over_time(kube_pod_container_status_ready{pod=~"emailsender-.*"}[10m]) < 0.5
+          for: 20m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status."
+            description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
+        - alert: RHACSEmailsenderContainerFrequentlyRestarting
+          expr: increase(kube_pod_container_status_restarts_total{pod=~"emailsender-.*"}[30m]) > 3
+          labels:
+            severity: warning
+          annotations:
+            summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
+            description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
+        - alert: RHACSEmailsenderSendErrors
+          expr: |2
+            (rate(acs_emailsender_failed_send_email_total[10m])
+            /
+            rate(acs_emailsender_send_email_total[10m])) > 0.10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Email Sender container failing sending emails"
+            description: "Email Sender has a send email error rate of {{ $value | humanizePercentage }} over the last 10 minutes."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
+        - alert: RHACSEmailsenderThrottledSend
+          expr: |
+            acs_emailsender_throttled_send_email_total > 0
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Email Sender throttled sending for `{{ $labels.tenant_id }}` Central instance"
+            description: "Email Sender is throttled {{ $value }} times for `{{ $labels.tenant_id }}` Central"
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"
+
     - name: tenant-resources
       rules:
         - expr: |
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml
new file mode 100644
index 0000000..42f72a8
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml
@@ -0,0 +1,27 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123", container="emailsender"}
+        values: "1+0x10 0+0x50"
+    alert_rule_test:
+      - eval_time: 15m
+        alertname: RHACSEmailsenderContainerDown
+        exp_alerts: []
+      - eval_time: 40m
+        alertname: RHACSEmailsenderContainerDown
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEmailsenderContainerDown
+              container: emailsender
+              namespace: rhacs
+              pod: emailsender-123
+              severity: warning
+            exp_annotations:
+              summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` is down or in a CrashLoopBackOff status."
+              description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml
new file mode 100644
index 0000000..07c36bf
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml
@@ -0,0 +1,27 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: kube_pod_container_status_restarts_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
+        values: "0+0x30 1+1x10 4+1x20"
+    alert_rule_test:
+      - eval_time: 30m
+        alertname: RHACSEmailsenderContainerFrequentlyRestarting
+        exp_alerts: []
+      - eval_time: 60m
+        alertname: RHACSEmailsenderContainerFrequentlyRestarting
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEmailsenderContainerFrequentlyRestarting
+              container: emailsender
+              namespace: rhacs
+              pod: emailsender-123
+              severity: warning
+            exp_annotations:
+              summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` restarted more than 3 times."
+              description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml
new file mode 100644
index 0000000..351bb80
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml
@@ -0,0 +1,29 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: up{namespace="rhacs", pod="emailsender-123", instance="1.2.3.4:9090"}
+        values: "0+0x20 1+0x20"
+      - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123"}
+        values: "1+0x40"
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: RHACSEmailsenderScrapeFailed
+        exp_alerts: []
+      - eval_time: 25m
+        alertname: RHACSEmailsenderScrapeFailed
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEmailsenderScrapeFailed
+              instance: 1.2.3.4:9090
+              namespace: rhacs
+              pod: emailsender-123
+              severity: warning
+            exp_annotations:
+              summary: "Prometheus unable to scrape metrics from target `emailsender-123` in namespace `rhacs`."
+              description: "During the last 10 minutes, only `45.45%` of scrapes of target `emailsender-123` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md"
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml
new file mode 100644
index 0000000..11f5ef3
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml
@@ -0,0 +1,29 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: acs_emailsender_failed_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
+        values: "0+0x10 1+1x50"
+      - series: acs_emailsender_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"}
+        values: "1+1x10 1+2x50"
+    alert_rule_test:
+      - eval_time: 15m
+        alertname: RHACSEmailsenderSendErrors
+        exp_alerts: []
+      - eval_time: 40m
+        alertname: RHACSEmailsenderSendErrors
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEmailsenderSendErrors
+              container: emailsender
+              namespace: rhacs
+              pod: emailsender-123
+              severity: warning
+            exp_annotations:
+              summary: "Email Sender container failing sending emails"
+              description: "Email Sender has a send email error rate of 50% over the last 10 minutes."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
new file mode 100644
index 0000000..00ced4e
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
@@ -0,0 +1,28 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: acs_emailsender_throttled_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender", tenant_id="centralid"}
+        values: "0+0x10 2+0x50"
+    alert_rule_test:
+      - eval_time: 15m
+        alertname: RHACSEmailsenderThrottledSend
+        exp_alerts: []
+      - eval_time: 40m
+        alertname: RHACSEmailsenderThrottledSend
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSEmailsenderThrottledSend
+              container: emailsender
+              namespace: rhacs
+              pod: emailsender-123
+              tenant_id: centralid
+              severity: warning
+            exp_annotations:
+              summary: "Email Sender throttled sending for `centralid` Central instance"
+              description: "Email Sender is throttled 2 times for `centralid` Central"
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"

From f64a66dec7a10dd10032e873ef103fa89fb45570 Mon Sep 17 00:00:00 2001
From: Aleksandr Kurlov
Date: Fri, 19 Jul 2024 11:45:21 +0200
Subject: [PATCH 2/2] Add rate to throttled metric

Alert on the per-minute rate of
acs_emailsender_throttled_send_email_total over the last 10 minutes
instead of on the raw counter value, extend `for` from 5m to 10m, and
update the alert description and its unit test to match.

---
 resources/prometheus/prometheus-rules.yaml        | 6 +++---
 .../unit_tests/RHACSEmailsenderThrottledSend.yaml | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index 9067d39..37e647d 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -203,13 +203,13 @@ spec:
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md"
         - alert: RHACSEmailsenderThrottledSend
           expr: |
-            acs_emailsender_throttled_send_email_total > 0
-          for: 5m
+            rate(acs_emailsender_throttled_send_email_total[10m]) * 60 > 0
+          for: 10m
           labels:
             severity: warning
           annotations:
             summary: "Email Sender throttled sending for `{{ $labels.tenant_id }}` Central instance"
-            description: "Email Sender is throttled {{ $value }} times for `{{ $labels.tenant_id }}` Central"
+            description: "Email Sender throttled `{{ $labels.tenant_id }}` Central {{ $value | humanize }} time(s) per minute over the last 10 minutes."
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"
 
     - name: tenant-resources
diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
index 00ced4e..a270dc6 100644
--- a/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
+++ b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml
@@ -7,7 +7,7 @@ tests:
   - interval: 1m
     input_series:
       - series: acs_emailsender_throttled_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender", tenant_id="centralid"}
-        values: "0+0x10 2+0x50"
+        values: "0+0x10 2+1x60"
     alert_rule_test:
       - eval_time: 15m
         alertname: RHACSEmailsenderThrottledSend
@@ -24,5 +24,5 @@ tests:
               severity: warning
             exp_annotations:
               summary: "Email Sender throttled sending for `centralid` Central instance"
-              description: "Email Sender is throttled 2 times for `centralid` Central"
+              description: "Email Sender throttled `centralid` Central 1 time(s) per minute over the last 10 minutes."
               sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"