From 5ca8d2b9780cb20a195340340e83a06665fab96e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 8 Nov 2024 03:31:32 +0000 Subject: [PATCH] assets,site/content: daily assets regeneration --- assets/kubernetes/alerts.yaml | 19 ++++++++++++------- site/content/kubernetes/_index.md | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/assets/kubernetes/alerts.yaml b/assets/kubernetes/alerts.yaml index 94afe9bc..51607fcd 100644 --- a/assets/kubernetes/alerts.yaml +++ b/assets/kubernetes/alerts.yaml @@ -169,12 +169,13 @@ groups: severity: warning - alert: KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container - {{ $labels.container}} has been in waiting state for longer than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on + container {{ $labels.container}} has been in waiting state for longer than + 1 hour. (reason: "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -365,9 +366,9 @@ groups: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh summary: Processes experience elevated CPU throttling. expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) + sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > ( 25 / 100 ) for: 15m labels: @@ -573,7 +574,9 @@ groups: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + and + on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: warning @@ -584,7 +587,9 @@ groups: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + and + on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: critical diff --git a/site/content/kubernetes/_index.md b/site/content/kubernetes/_index.md index fb6b1c67..a165745d 100644 --- a/site/content/kubernetes/_index.md +++ b/site/content/kubernetes/_index.md @@ -244,12 +244,13 @@ https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md {{< code lang="yaml" >}} alert: KubeContainerWaiting annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container - {{ $labels.container}} has been in waiting state for longer than 1 hour. + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container + {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: + "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: | - sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: warning @@ -525,9 +526,9 @@ annotations: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh summary: Processes experience elevated CPU throttling. expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) + sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > ( 25 / 100 ) for: 15m labels: @@ -805,7 +806,9 @@ annotations: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + and + on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: warning @@ -822,7 +825,9 @@ annotations: runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + and + on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 for: 5m labels: severity: critical