Skip to content

Commit

Permalink
Add autoscaler alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
mtodor committed Apr 12, 2024
1 parent 59ec468 commit 0d142cc
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 105 deletions.
4 changes: 3 additions & 1 deletion resources/prometheus/federation-config.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

142 changes: 38 additions & 104 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -610,118 +610,52 @@ spec:
severity: critical
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
- name: az-resources
- name: cluster-resources
rules:
- record: acscs_worker_nodes
expr: |
kube_node_role{role="acscs-worker"}
- record: node_availability_zone
expr: |
sum(label_replace(kube_node_labels, "availability_zone", "$1", "label_failure_domain_beta_kubernetes_io_zone", "(.*)")) by (availability_zone, node) > 0
- record: memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{resource="memory",observability="",job="kube-state-metrics"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{resource="memory",observability="",job="kube-state-metrics"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{resource="cpu", observability="",job="kube-state-metrics"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
expr: |
sum(
sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{observability="",job="kube-state-metrics"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:acscs_worker_nodes:allocatable_cpu
expr: |
sum(
sum(kube_node_status_allocatable{resource="cpu"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:acscs_worker_nodes:allocatable_memory
expr: |
sum(
sum(kube_node_status_allocatable{resource="memory"}) by (node)
* on (node) acscs_worker_nodes
* on (node) group_left(availability_zone) node_availability_zone
) by (availability_zone)
- record: availability_zone:acscs_worker_nodes:memory_request_ratio
expr: |
memory_resource_requests:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:acscs_worker_nodes:allocatable_memory
- record: availability_zone:acscs_worker_nodes:cpu_request_ratio
expr: |
cpu_resource_requests:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:acscs_worker_nodes:allocatable_cpu
- record: availability_zone:acscs_worker_nodes:memory_limit_ratio
expr: |
memory_resource_limits:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:acscs_worker_nodes:allocatable_memory
- record: availability_zone:acscs_worker_nodes:cpu_limit_ratio
expr: |
cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:acscs_worker_nodes:allocatable_cpu
- alert: WorkerNodesMemoryQuotaOverCommitWarning
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "There is a risk of over-committing Memory resources on worker nodes."
description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesMemoryQuotaOverCommit
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.95
for: 5m
# Copied from: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/pkg/controller/clusterautoscaler/monitoring.go#L153
# However, different severity levels are used to ensure timely response.
# Additional information about alerts: https://github.com/openshift/cluster-autoscaler-operator/blob/287595b0ee37b893c02d51d3a461ba118b90c3dc/docs/user/alerts.md
- alert: ClusterAutoscalerUnschedulablePods
expr: cluster_autoscaler_unschedulable_pods_count{service="cluster-autoscaler-default"} > 0
for: 30m
labels:
severity: critical
annotations:
summary: "There is a high risk of over-committing Memory resources on worker nodes."
description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesCPUQuotaOverCommitWarning
expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.85
for: 5m
summary: "Cluster Autoscaler has {{ $value }} unschedulable pods."
description: "The cluster autoscaler is unable to scale up and is alerting that there are unschedulable pods because of this condition.
This may be caused by the cluster autoscaler reaching its resources limits, or by Kubernetes waiting for new nodes to become ready."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
- alert: ClusterAutoscalerNotSafeToScale
expr: cluster_autoscaler_cluster_safe_to_autoscale{service="cluster-autoscaler-default"} != 1
for: 15m
labels:
severity: warning
annotations:
summary: "There is a risk of over-committing CPU resources on worker nodes."
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesCPUQuotaOverCommit
expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.95
for: 5m
summary: "Cluster Autoscaler is reporting that the cluster is not ready for scaling."
description: "The cluster autoscaler has detected that the number of unready nodes is too high
and it is not safe to continue scaling operations. It makes this determination by checking that the number of ready nodes is greater than the minimum ready count
(default of 3) and the ratio of unready to ready nodes is less than the maximum unready node percentage (default of 45%). If either of those conditions is not
true, then the cluster autoscaler will enter an unsafe to scale state until the conditions change."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
- alert: ClusterAutoscalerUnableToScaleCPULimitReached
expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="CpuResourceLimit"}[15m]) > 0
for: 15m
labels:
severity: critical
severity: info
annotations:
summary: "There is a high risk of over-committing CPU resources on worker nodes."
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesMemoryOverCommit
expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2
for: 5m
summary: "Cluster Autoscaler has reached its maximum CPU core limit and is unable to scale out."
description: "The number of total cores in the cluster has exceeded the maximum number set on the
cluster autoscaler. This is calculated by summing the cpu capacity for all nodes in the cluster and comparing that number against the maximum cores value set for the
cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"
- alert: ClusterAutoscalerUnableToScaleMemoryLimitReached
expr: increase(cluster_autoscaler_skipped_scale_events_count{service="cluster-autoscaler-default",direction="up",reason="MemoryResourceLimit"}[15m]) > 0
for: 15m
labels:
severity: critical
severity: info
annotations:
summary: "There is a high risk of over-committing Memory resources on worker nodes."
description: "During the last 5 minutes, the average Memory limit commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 200%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
summary: "Cluster Autoscaler has reached its maximum Memory bytes limit and is unable to scale out."
description: "The number of total bytes of RAM in the cluster has exceeded the maximum number set on
the cluster autoscaler. This is calculated by summing the memory capacity for all nodes in the cluster and comparing that number against the maximum memory bytes value set
for the cluster autoscaler. Limits can be adjusted by modifying the cluster autoscaler configuration."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-041-modify-cluster-autoscaler.md"

0 comments on commit 0d142cc

Please sign in to comment.