From 75deb07c299b0e9e19d26184e2454cd3323a2047 Mon Sep 17 00:00:00 2001
From: Stephan Hesselmann
Date: Tue, 2 Jul 2024 15:04:03 +0200
Subject: [PATCH 1/2] feat(alerts): add weekly exhaustion alert (#248)

---
Reviewer notes (this area is ignored when the patch is applied):
- Adds the warning-severity alert "Central availability weekly exhaustion".
  Its expression subtracts the value of
  central:slo:availability:error_budget_exhaustion from one week earlier
  (offset 1w) from the current value and keeps results above 0.1.
- The unit test expects no alert at eval_time 100m and exactly one alert at
  eval_time 11000m, with the message reporting an increase of 21.83%.
- NOTE(review): line breaks and YAML indentation in this series were
  reconstructed from a whitespace-collapsed copy; confirm that the context
  lines match the target files before applying.

 resources/prometheus/prometheus-rules.yaml    | 12 ++++++++++
 .../unit_tests/RHACSCentralSLISLO.yaml        | 22 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index a9f9f332..7db4d4d8 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -597,6 +597,18 @@ spec:
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
 
+        - alert: Central availability weekly exhaustion
+          annotations:
+            message: "Availability error budget exhaustion has increased by {{ $value | humanizePercentage }} over the last week."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+          expr: |
+            central:slo:availability:error_budget_exhaustion - central:slo:availability:error_budget_exhaustion offset 1w > 0.1
+          labels:
+            service: central
+            severity: warning
+            namespace: "{{ $labels.namespace }}"
+            rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
+
         - alert: Central high availability burn rate
           annotations:
             message: "High availability burn rate for central. Current burn rate per hour: {{ $value | humanize }}."
diff --git a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
index 8933a82d..2fa3e971 100644
--- a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
+++ b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
@@ -148,6 +148,28 @@ tests:
               message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
               sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
 
+  # Central weekly exhaustion
+  - interval: 30m
+    input_series:
+      - series: kube_deployment_status_replicas_ready{deployment="central", namespace="rhacs-ffffgggghhhhiiiijjjj"}
+        values: "0+0x5 1+0x265 0+0x15 1+0x100"
+    alert_rule_test:
+      - eval_time: 100m
+        alertname: Central availability weekly exhaustion
+        exp_alerts: []
+      - eval_time: 11000m
+        alertname: Central availability weekly exhaustion
+        exp_alerts:
+          - exp_labels:
+              alertname: Central availability weekly exhaustion
+              service: central
+              severity: warning
+              namespace: rhacs-ffffgggghhhhiiiijjjj
+              rhacs_instance_id: ffffgggghhhhiiiijjjj
+            exp_annotations:
+              message: "Availability error budget exhaustion has increased by 21.83% over the last week."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+
   # Central high availability burn rate
   - interval: 5m
     input_series:

From 5fa01d6f1edcf5c9dd89a9f245aee4b31a90e173 Mon Sep 17 00:00:00 2001
From: Stephan Hesselmann
Date: Tue, 2 Jul 2024 15:28:14 +0200
Subject: [PATCH 2/2] chore: add more automerge options (#262)

---
Reviewer notes (this area is ignored when the patch is applied):
- Adds MERGE_FORKS, MERGE_REQUIRED_APPROVALS and UPDATE_METHOD to the env
  mapping of the automerge workflow.
- The added values are written as quoted strings so that a YAML parser does
  not type "false" as a boolean or "1" as an integer, in line with the
  already-quoted GITHUB_TOKEN entry.
- NOTE(review): the post-image blob id on the index line below was computed
  before the values were quoted; regenerate the patch if exact blob ids
  matter. Indentation of the context lines is reconstructed; confirm
  against the target file before applying.

 .github/workflows/automerge.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/automerge.yaml b/.github/workflows/automerge.yaml
index 9d79c96d..fa22f15a 100644
--- a/.github/workflows/automerge.yaml
+++ b/.github/workflows/automerge.yaml
@@ -31,3 +31,6 @@ jobs:
           GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
           MERGE_METHOD: merge
           MERGE_FILTER_AUTHOR: github-actions[bot]
+          MERGE_FORKS: "false"
+          MERGE_REQUIRED_APPROVALS: "1"
+          UPDATE_METHOD: "rebase"