From d50aa6ab3da247a4fb9fc0d9a34ae10279aa2857 Mon Sep 17 00:00:00 2001
From: Ludovic Cleroux
Date: Thu, 16 Nov 2023 15:36:29 +0100
Subject: [PATCH] ROX-20792: Add alert for operator OOM

---
 resources/prometheus/prometheus-rules.yaml    | 37 +++++++++++
 .../RHACSOperatorMemoryUtilizationHigh.yaml   | 65 +++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index 974b65c4..126804e7 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -158,6 +158,43 @@ spec:
             summary: "Fleetshard synchronizer manages `{{ $value }}` centrals."
             description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state."
             sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md"
+
+    - name: rhacs-operator
+      rules:
+        # Selector: one series per rhacs-operator pod in the rhacs namespace, tagged with its workload.
+        - expr: |
+            sum (kube_pod_info{namespace="rhacs"}
+              * on (pod, namespace) group_left() kube_pod_labels{namespace="rhacs", label_app="rhacs-operator"}
+              * on (pod, namespace) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel) by (namespace, workload, pod)
+          record: rhacs_operator:namespace:workload:pod
+        # Maximum recorded memory usage per operator pod, summed over its containers.
+        - expr: |
+            rhacs_operator:namespace:workload:pod
+              * on (pod, namespace) group_left() sum(container_memory_max_usage_bytes{container!=""}) by (pod, namespace)
+          record: rhacs_operator:namespace:workload:pod:max_memory_usage_bytes
+        # Memory limit per operator pod, summed over its containers.
+        - expr: |
+            rhacs_operator:namespace:workload:pod
+              * on (pod, namespace) group_left() sum(container_spec_memory_limit_bytes{container!=""}) by (pod, namespace)
+          record: rhacs_operator:namespace:workload:pod:memory_limit_bytes
+        # Highest per-pod usage/limit ratio of each workload. max (not sum) keeps the
+        # value a real ratio when a workload has more than one pod, e.g. during a rollout.
+        - expr: |
+            max(rhacs_operator:namespace:workload:pod:max_memory_usage_bytes / rhacs_operator:namespace:workload:pod:memory_limit_bytes)
+              by (namespace, workload)
+          record: rhacs_operator:namespace:workload:max_memory_usage_ratio
+        # Warns once a workload's ratio has stayed above 60% of the memory limit for 5 minutes.
+        - alert: RHACSOperatorMemoryUtilizationHigh
+          expr: |
+            rhacs_operator:namespace:workload:max_memory_usage_ratio > 0.6
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "RHACS Operator '{{ $labels.workload }}' is reaching its memory limit."
+            description: "The RHACS operator '{{ $labels.workload }}' reached {{ $value | humanizePercentage }} of its memory limit and is at risk of being OOM killed."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-011-rhacs-operator-unavailable.md"
+
     - name: rhacs-aws-quota
       rules:
         - alert: RHACSCentralDBClustersUtilizationHigh
diff --git a/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml
new file mode 100644
index 00000000..3933dc4e
--- /dev/null
+++ b/resources/prometheus/unit_tests/RHACSOperatorMemoryUtilizationHigh.yaml
@@ -0,0 +1,65 @@
+rule_files:
+  - /tmp/prometheus-rules-test.yaml
+
+evaluation_interval: 1m
+
+tests:
+  # Single operator pod: usage rises from 50% to 70% of the limit at 11m, so the
+  # alert must be firing at 16m, once the 5m `for` window has elapsed.
+  - interval: 1m
+    input_series:
+      - series: kube_pod_info{namespace="rhacs", pod="operator-pod"}
+        values: "1+0x20"
+      - series: kube_pod_labels{namespace="rhacs", pod="operator-pod", label_app="rhacs-operator"}
+        values: "1+0x20"
+      - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod", workload="operator-workload"}
+        values: "1+0x20"
+      - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"}
+        values: "50+0x10 70+0x10"
+      - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod", container="operator-container"}
+        values: "100+0x20"
+    alert_rule_test:
+      - eval_time: 1m
+        alertname: RHACSOperatorMemoryUtilizationHigh
+        exp_alerts: []
+      - eval_time: 16m
+        alertname: RHACSOperatorMemoryUtilizationHigh
+        exp_alerts:
+          - exp_labels:
+              alertname: RHACSOperatorMemoryUtilizationHigh
+              severity: warning
+              namespace: rhacs
+              workload: operator-workload
+            exp_annotations:
+              description: "The RHACS operator 'operator-workload' reached 70% of its memory limit and is at risk of being OOM killed."
+              summary: "RHACS Operator 'operator-workload' is reaching its memory limit."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-011-rhacs-operator-unavailable.md"
+
+  # Two operator pods in one workload, each at 35% of its limit: the per-pod
+  # ratios must not be added up, so the alert stays silent.
+  - interval: 1m
+    input_series:
+      - series: kube_pod_info{namespace="rhacs", pod="operator-pod-a"}
+        values: "1+0x20"
+      - series: kube_pod_info{namespace="rhacs", pod="operator-pod-b"}
+        values: "1+0x20"
+      - series: kube_pod_labels{namespace="rhacs", pod="operator-pod-a", label_app="rhacs-operator"}
+        values: "1+0x20"
+      - series: kube_pod_labels{namespace="rhacs", pod="operator-pod-b", label_app="rhacs-operator"}
+        values: "1+0x20"
+      - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod-a", workload="operator-workload"}
+        values: "1+0x20"
+      - series: namespace_workload_pod:kube_pod_owner:relabel{namespace="rhacs", pod="operator-pod-b", workload="operator-workload"}
+        values: "1+0x20"
+      - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod-a", container="operator-container"}
+        values: "35+0x20"
+      - series: container_memory_max_usage_bytes{namespace="rhacs", pod="operator-pod-b", container="operator-container"}
+        values: "35+0x20"
+      - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod-a", container="operator-container"}
+        values: "100+0x20"
+      - series: container_spec_memory_limit_bytes{namespace="rhacs", pod="operator-pod-b", container="operator-container"}
+        values: "100+0x20"
+    alert_rule_test:
+      - eval_time: 16m
+        alertname: RHACSOperatorMemoryUtilizationHigh
+        exp_alerts: []