From 0d142ccdb9931997eb5be42500c4063e15fa8632 Mon Sep 17 00:00:00 2001
From: Mladen Todorovic
Date: Fri, 12 Apr 2024 19:33:17 +0200
Subject: [PATCH] Add autoscaler alerts

---
 resources/prometheus/federation-config.yaml |   4 +-
 resources/prometheus/prometheus-rules.yaml  | 142 ++++++--------------
 2 files changed, 41 insertions(+), 105 deletions(-)

diff --git a/resources/prometheus/federation-config.yaml b/resources/prometheus/federation-config.yaml
index d7a7a558..5f2873be 100644
--- a/resources/prometheus/federation-config.yaml
+++ b/resources/prometheus/federation-config.yaml
@@ -17,6 +17,9 @@ match[]:
   - cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{job!~"central|scanner"}
   - cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{job!~"central|scanner"}
   - cluster:node_cpu:ratio_rate5m{job!~"central|scanner"}
+  - cluster_autoscaler_cluster_safe_to_autoscale{job!~"central|scanner"}
+  - cluster_autoscaler_skipped_scale_events_count{job!~"central|scanner"}
+  - cluster_autoscaler_unschedulable_pods_count{job!~"central|scanner"}
   - cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{job!~"central|scanner"}
   - code_resource:apiserver_request_total:rate5m{job!~"central|scanner"}
   - container_cpu_cfs_periods_total{job!~"central|scanner"}
@@ -70,7 +73,6 @@ match[]:
   - kube_job_status_start_time{job!~"central|scanner"}
   - kube_namespace_status_phase{job!~"central|scanner"}
   - kube_node_info{job!~"central|scanner"}
-  - kube_node_labels{job!~"central|scanner"}
   - kube_node_role{job!~"central|scanner"}
   - kube_node_spec_taint{job!~"central|scanner"}
   - kube_node_status_allocatable{job!~"central|scanner"}
diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index aa8d82c7..c8f395aa 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -610,118 +610,52 @@ spec:
             severity: critical
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
-    - name: az-resources
+    - name: cluster-resources
       rules:
-        - record: acscs_worker_nodes
-          expr: |
-            kube_node_role{role="acscs-worker"}
-        - record: node_availability_zone
-          expr: |
-            sum(label_replace(kube_node_labels, "availability_zone", "$1", "label_failure_domain_beta_kubernetes_io_zone", "(.*)")) by (availability_zone, node) > 0
-        - record: memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{resource="memory",observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{resource="memory",observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{resource="cpu", observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-          expr: |
-            sum(
-              sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{observability="",job="kube-state-metrics"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:allocatable_cpu
-          expr: |
-            sum(
-              sum(kube_node_status_allocatable{resource="cpu"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:allocatable_memory
-          expr: |
-            sum(
-              sum(kube_node_status_allocatable{resource="memory"}) by (node)
-              * on (node) acscs_worker_nodes
-              * on (node) group_left(availability_zone) node_availability_zone
-            ) by (availability_zone)
-        - record: availability_zone:acscs_worker_nodes:memory_request_ratio
-          expr: |
-            memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_memory
-        - record: availability_zone:acscs_worker_nodes:cpu_request_ratio
-          expr: |
-            cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_cpu
-        - record: availability_zone:acscs_worker_nodes:memory_limit_ratio
-          expr: |
-            memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_memory
-        - record: availability_zone:acscs_worker_nodes:cpu_limit_ratio
-          expr: |
-            cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
-            /
-            availability_zone:acscs_worker_nodes:allocatable_cpu
-        - alert: WorkerNodesMemoryQuotaOverCommitWarning
-          expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.85
-          for: 5m
-          labels:
-            severity: warning
-          annotations:
-            summary: "There is a risk of over-committing Memory resources on worker nodes."
-            description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
-            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
-        - alert: WorkerNodesMemoryQuotaOverCommit
-          expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.95
-          for: 5m
+        # Copied from: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/pkg/controller/clusterautoscaler/monitoring.go#L153
+        # However, different severity levels are used to ensure a timely response.
+        # Additional information about alerts: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/docs/user/alerts.md
+        - alert: ClusterAutoscalerUnschedulablePods
+          expr: cluster_autoscaler_unschedulable_pods_count{service="cluster-autoscaler-default"} > 0
+          for: 30m
           labels:
             severity: critical
           annotations:
-            summary: "There is a high risk of over-committing Memory resources on worker nodes."
-            description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
- sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesCPUQuotaOverCommitWarning - expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.85 - for: 5m + summary: "Cluster Autoscaler has {{ $value }} unschedulable pods." + description: "The cluster autoscaler is unable to scale up and is alerting that there are unschedulable pods because of this condition. +This may be caused by the cluster autoscaler reaching its resources limits, or by Kubernetes waiting for new nodes to become ready." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerNotSafeToScale + expr: cluster_autoscaler_cluster_safe_to_autoscale{service="cluster-autoscaler-default"} != 1 + for: 15m labels: severity: warning annotations: - summary: "There is a risk of over-committing CPU resources on worker nodes." - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesCPUQuotaOverCommit - expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.95 - for: 5m + summary: "Cluster Autoscaler is reporting that the cluster is not ready for scaling." + description: "The cluster autoscaler has detected that the number of unready nodes is too high +and it is not safe to continute scaling operations. It makes this determination by checking that the number of ready nodes is greater than the minimum ready count +(default of 3) and the ratio of unready to ready nodes is less than the maximum unready node percentage (default of 45%). If either of those conditions are not +true then the cluster autoscaler will enter an unsafe to scale state until the conditions change." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerUnableToScaleCPULimitReached + expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="CpuResourceLimit"}[15m]) > 0 + for: 15m labels: - severity: critical + severity: info annotations: - summary: "There is a high risk of over-committing CPU resources on worker nodes." - description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - - alert: WorkerNodesMemoryOverCommit - expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2 - for: 5m + summary: "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out." + description: "The number of total cores in the cluster has exceeded the maximum number set on the +cluster autoscaler. This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the +cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md" + - alert: ClusterAutoscalerUnableToScaleMemoryLimitReached + expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="MemoryResourceLimit"}[15m]) > 0 + for: 15m labels: - severity: critical + severity: info annotations: - summary: "There is a high risk of over-committing Memory resources on worker nodes." - description: "During the last 5 minutes, the average Memory limit commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 200%." - sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" + summary: "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out." + description: "The number of total bytes of RAM in the cluster has exceeded the maximum number set on +the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set +for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"