Skip to content

Commit

Permalink
Adjust old alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
mtodor committed Apr 15, 2024
1 parent c9da2c9 commit 7d30664
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 90 deletions.
34 changes: 8 additions & 26 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -680,41 +680,23 @@ spec:
cpu_resource_limits:acscs_worker_nodes:by_availability_zone:sum
/
availability_zone:acscs_worker_nodes:allocatable_cpu
- alert: WorkerNodesMemoryQuotaOverCommitWarning
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.85
for: 5m
- alert: WorkerNodesMemoryQuotaOverCommit
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.99
for: 15m
labels:
severity: warning
annotations:
summary: "There is a risk of over-committing Memory resources on worker nodes."
description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesMemoryQuotaOverCommit
expr: avg(availability_zone:acscs_worker_nodes:memory_request_ratio) > 0.95
for: 5m
labels:
severity: critical
annotations:
summary: "There is a high risk of over-committing Memory resources on worker nodes."
description: "During the last 5 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
description: "During the last 15 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This could make pods unschedulable."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesCPUQuotaOverCommitWarning
expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.85
for: 5m
- alert: WorkerNodesCPUQuotaOverCommit
          expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.99
for: 15m
labels:
severity: warning
annotations:
summary: "There is a risk of over-committing CPU resources on worker nodes."
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the recommended threshold of 85%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesCPUQuotaOverCommit
expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.95
for: 5m
labels:
severity: critical
annotations:
summary: "There is a high risk of over-committing CPU resources on worker nodes."
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This is above the critical threshold of 95%."
description: "During the last 15 minutes, the average CPU request commitment on worker nodes was {{ $value | humanizePercentage }}. This could make pods unschedulable."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- alert: WorkerNodesMemoryOverCommit
expr: avg(availability_zone:acscs_worker_nodes:memory_limit_ratio) > 2
Expand Down
40 changes: 8 additions & 32 deletions resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,48 +7,24 @@ tests:
- interval: 1m
input_series:
- series: kube_node_role{node="worker-1", role="acscs-worker"}
values: "1"
values: "1+0x20"
- series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"}
values: "1"
values: "1+0x20"
- series: kube_node_status_allocatable{node="worker-1", resource="cpu", job="kube-state-metrics"}
values: "100"
values: "200+0x20"
- series: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{node="worker-1", resource="cpu", job="kube-state-metrics"}
values: "86"
alert_rule_test:
- eval_time: 1m
alertname: WorkerNodesCPUQuotaOverCommitWarning
exp_alerts: []
- eval_time: 5m
alertname: WorkerNodesCPUQuotaOverCommitWarning
exp_alerts:
- exp_labels:
alertname: WorkerNodesCPUQuotaOverCommitWarning
severity: warning
exp_annotations:
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was 86%. This is above the recommended threshold of 85%."
summary: "There is a risk of over-committing CPU resources on worker nodes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- interval: 1m
input_series:
- series: kube_node_role{node="worker-1", role="acscs-worker"}
values: "1"
- series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"}
values: "1"
- series: kube_node_status_allocatable{node="worker-1", resource="cpu", job="kube-state-metrics"}
values: "100"
- series: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{node="worker-1", resource="cpu", job="kube-state-metrics"}
values: "96"
values: "199+0x20"
alert_rule_test:
- eval_time: 1m
alertname: WorkerNodesCPUQuotaOverCommit
exp_alerts: []
- eval_time: 5m
- eval_time: 16m
alertname: WorkerNodesCPUQuotaOverCommit
exp_alerts:
- exp_labels:
alertname: WorkerNodesCPUQuotaOverCommit
severity: critical
severity: warning
exp_annotations:
description: "During the last 5 minutes, the average CPU request commitment on worker nodes was 96%. This is above the critical threshold of 95%."
summary: "There is a high risk of over-committing CPU resources on worker nodes."
summary: "There is a risk of over-committing CPU resources on worker nodes."
description: "During the last 15 minutes, the average CPU request commitment on worker nodes was 99.5%. This could make pods unschedulable."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
Original file line number Diff line number Diff line change
Expand Up @@ -7,48 +7,24 @@ tests:
- interval: 1m
input_series:
- series: kube_node_role{node="worker-1", role="acscs-worker"}
values: "1"
values: "1+0x20"
- series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"}
values: "1"
values: "1+0x20"
- series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"}
values: "100"
values: "200+0x20"
- series: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{node="worker-1", resource="memory", job="kube-state-metrics"}
values: "86"
alert_rule_test:
- eval_time: 1m
alertname: WorkerNodesMemoryQuotaOverCommitWarning
exp_alerts: []
- eval_time: 5m
alertname: WorkerNodesMemoryQuotaOverCommitWarning
exp_alerts:
- exp_labels:
alertname: WorkerNodesMemoryQuotaOverCommitWarning
severity: warning
exp_annotations:
description: "During the last 5 minutes, the average memory request commitment on worker nodes was 86%. This is above the recommended threshold of 85%."
summary: "There is a risk of over-committing Memory resources on worker nodes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"
- interval: 1m
input_series:
- series: kube_node_role{node="worker-1", role="acscs-worker"}
values: "1"
- series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"}
values: "1"
- series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"}
values: "100"
- series: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{node="worker-1", resource="memory", job="kube-state-metrics"}
values: "96"
values: "199+0x20"
alert_rule_test:
- eval_time: 1m
alertname: WorkerNodesMemoryQuotaOverCommit
exp_alerts: []
- eval_time: 5m
- eval_time: 16m
alertname: WorkerNodesMemoryQuotaOverCommit
exp_alerts:
- exp_labels:
alertname: WorkerNodesMemoryQuotaOverCommit
severity: critical
severity: warning
exp_annotations:
description: "During the last 5 minutes, the average memory request commitment on worker nodes was 96%. This is above the critical threshold of 95%."
summary: "There is a high risk of over-committing Memory resources on worker nodes."
summary: "There is a risk of over-committing Memory resources on worker nodes."
description: "During the last 15 minutes, the average memory request commitment on worker nodes was 99.5%. This could make pods unschedulable."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md"

0 comments on commit 7d30664

Please sign in to comment.