From 0d142ccdb9931997eb5be42500c4063e15fa8632 Mon Sep 17 00:00:00 2001
From: Mladen Todorovic
Date: Fri, 12 Apr 2024 19:33:17 +0200
Subject: [PATCH] Add autoscaler alerts

---
 resources/prometheus/federation-config.yaml |   4 +-
 resources/prometheus/prometheus-rules.yaml  | 142 ++++++--------------
 2 files changed, 41 insertions(+), 105 deletions(-)

diff --git a/resources/prometheus/federation-config.yaml b/resources/prometheus/federation-config.yaml
index d7a7a558..5f2873be 100644
--- a/resources/prometheus/federation-config.yaml
+++ b/resources/prometheus/federation-config.yaml
@@ -17,6 +17,9 @@ match[]:
   - cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{job!~"central|scanner"}
   - cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{job!~"central|scanner"}
   - cluster:node_cpu:ratio_rate5m{job!~"central|scanner"}
+  - cluster_autoscaler_cluster_safe_to_autoscale{job!~"central|scanner"}
+  - cluster_autoscaler_skipped_scale_events_count{job!~"central|scanner"}
+  - cluster_autoscaler_unschedulable_pods_count{job!~"central|scanner"}
   - cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{job!~"central|scanner"}
   - code_resource:apiserver_request_total:rate5m{job!~"central|scanner"}
   - container_cpu_cfs_periods_total{job!~"central|scanner"}
@@ -70,7 +73,6 @@ match[]:
   - kube_job_status_start_time{job!~"central|scanner"}
   - kube_namespace_status_phase{job!~"central|scanner"}
   - kube_node_info{job!~"central|scanner"}
-  - kube_node_labels{job!~"central|scanner"}
   - kube_node_role{job!~"central|scanner"}
   - kube_node_spec_taint{job!~"central|scanner"}
   - kube_node_status_allocatable{job!~"central|scanner"}
diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index aa8d82c7..c8f395aa 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -610,118 +610,52 @@ spec:
             severity: critical
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
-    - name: az-resources
+    - name: cluster-resources
       rules:
-        - record: acscs_worker_nodes
-          expr: |
-            kube_node_role{role="acscs-worker"}
-        - record: node_availability_zone
-          expr: |
-            sum(label_replace(kube_node_labels, "availability_zone", "$1", "label_failure_domain_beta_kubernetes_io_zone", "(.*)")) by (availability_zone, node) > 0
-        - record: memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{resource="memory",observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{resource="memory",observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{resource="cpu", observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:allocatable_cpu
-          expr: |
-            sum(
-              sum(kube_node_status_allocatable{resource="cpu"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:allocatable_memory
-          expr: |
-            sum(
-              sum(kube_node_status_allocatable{resource="memory"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:memory_request_ratio
-          expr: |
-            memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_memory
-        - record: availability_zone:acscs_worker_nodes:cpu_request_ratio
-          expr: |
-            cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_cpu
-        - record: availability_zone:acscs_worker_nodes:memory_limit_ratio
-          expr: |
-            memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_memory
-        - record: availability_zone:acscs_worker_nodes:cpu_limit_ratio
-          expr: |
-            cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_cpu
-        - alert: WorkerNodesMemoryQuotaOverCommitWarning
-          expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.85
-          for: 5m
-          labels:
-            severity: warning
-          annotations:
-            summary: "There is a risk of over-committing Memory resources on worker nodes."
-            description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
-            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
-        - alert: WorkerNodesMemoryQuotaOverCommit
-          expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.95
-          for: 5m
+        # Copied from: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/pkg/controller/clusterautoscaler/monitoring.go#L153
+        # However, different severity levels are used to ensure a timely response.
+        # Additional information about alerts: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/docs/user/alerts.md
+        - alert: ClusterAutoscalerUnschedulablePods
+          expr: cluster_autoscaler_unschedulable_pods_count{service="cluster-autoscaler-default"} > 0
+          for: 30m
           labels:
             severity: critical
           annotations:
-            summary: "There is a high risk of over-committing Memory resources on worker nodes."
-            description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesCPUQuotaOverCommitWarning - expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.85 - for: 5m + summary: "Cluster Autoscaler has {{ $value }} unschedulable pods." + description: "The cluster autoscaler is unable to scale up and is alerting that there are unschedulable pods because of this condition. +This may be caused by the cluster autoscaler reaching its resources limits, or by Kubernetes waiting for new nodes to become ready." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerNotSafeToScale + expr: cluster_autoscaler_cluster_safe_to_autoscale{service="cluster-autoscaler-default"} != 1 + for: 15m labels: severity: warning annotations: - summary: "There is a risk of over-committing CPU resources on worker nodes." - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesCPUQuotaOverCommit - expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.95 - for: 5m + summary: "Cluster Autoscaler is reporting that the cluster is not ready for scaling." + description: "The cluster autoscaler has detected that the number of unready nodes is too high +and it is not safe to continute scaling operations. It makes this determination by checking that the number of ready nodes is greater than the minimum ready count +(default of 3) and the ratio of unready to ready nodes is less than the maximum unready node percentage (default of 45%). If either of those conditions are not +true then the cluster autoscaler will enter an unsafe to scale state until the conditions change." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerUnableToScaleCPULimitReached + expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="CpuResourceLimit"}[15m]) > 0 + for: 15m labels: - severity: critical + severity: info annotations: - summary: "There is a high risk of over-committing CPU resources on worker nodes." - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesMemoryOverCommit - expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2 - for: 5m + summary: "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out." + description: "The number of total cores in the cluster has exceeded the maximum number set on the +cluster autoscaler. This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the +cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerUnableToScaleMemoryLimitReached + expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="MemoryResourceLimit"}[15m]) > 0 + for: 15m labels: - severity: critical + severity: info annotations: - summary: "There is a high risk of over-committing Memory resources on worker nodes." - description: "During the last 5 minutes, the average Memory limit commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 200%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" + summary: "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out." + description: "The number of total bytes of RAM in the cluster has exceeded the maximum number set on +the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set +for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"