From 843096133880267f0cb2fedeb1ef4cc095004421 Mon Sep 17 00:00:00 2001 From: Daniel Gottschalk Date: Thu, 10 Mar 2022 17:37:35 +0100 Subject: [PATCH 1/2] adds podmonitor for prometheus metrics --- helm/postgres/templates/podmonitor.yaml | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 helm/postgres/templates/podmonitor.yaml diff --git a/helm/postgres/templates/podmonitor.yaml b/helm/postgres/templates/podmonitor.yaml new file mode 100644 index 00000000..5ceade7a --- /dev/null +++ b/helm/postgres/templates/podmonitor.yaml @@ -0,0 +1,46 @@ +{{- if .Values.monitoring }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: postgresql-exporter +spec: + selector: + matchLabels: + postgres-operator.crunchydata.com/crunchy-postgres-exporter: "true" + podMetricsEndpoints: + - port: exporter + relabelings: + - sourceLabels: [__meta_kubernetes_namespace] + action: replace + targetLabel: kubernetes_namespace + - sourceLabels: [__meta_kubernetes_pod_name] + targetLabel: pod + - sourceLabels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_cluster,__meta_kubernetes_pod_label_pg_cluster] + targetLabel: cluster + separator: "" + replacement: '$1' + - sourceLabels: [__meta_kubernetes_namespace,cluster] + targetLabel: pg_cluster + separator: ":" + replacement: '$1$2' + - sourceLabels: [__meta_kubernetes_pod_ip] + targetLabel: ip + replacement: '$1' + - sourceLabels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_instance,__meta_kubernetes_pod_label_deployment_name] + targetLabel: deployment + replacement: '$1' + separator: "" + - sourceLabels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_role,__meta_kubernetes_pod_label_role] + targetLabel: role + replacement: '$1' + separator: "" + - sourceLabels: [dbname] + targetLabel: dbname + replacement: '$1' + - sourceLabels: [relname] + targetLabel: relname + replacement: '$1' + - sourceLabels: [schemaname] + targetLabel: schemaname + replacement: '$1' +{{- end }} From 30fc78f319bbc968c89ab54dea8c290165cdbb7a Mon Sep 17 00:00:00 2001 From: Daniel Gottschalk Date: Thu, 10 Mar 2022 17:38:04 +0100 Subject: [PATCH 2/2] adds prometheusrule for prometheus alerts --- helm/postgres/templates/prometheusrule.yaml | 410 ++++++++++++++++++++ helm/postgres/values.yaml | 10 + 2 files changed, 420 insertions(+) create mode 100644 helm/postgres/templates/prometheusrule.yaml diff --git a/helm/postgres/templates/prometheusrule.yaml b/helm/postgres/templates/prometheusrule.yaml new file mode 100644 index 00000000..15fb1867 --- /dev/null +++ b/helm/postgres/templates/prometheusrule.yaml @@ -0,0 +1,410 @@ +{{- if and .Values.monitoring .Values.alerting }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: postgresql +spec: + groups: + - name: alert-rules + rules: + ########## EXPORTER RULES ########## + - alert: PGExporterScrapeError + expr: pg_exporter_last_scrape_error > 0 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + summary: '{{`Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. 
Error count: ( {{ $value }} )`}}' + ########## POSTGRESQL RULES ########## + - alert: PGIsUp + expr: pg_up < 1 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + summary: '{{`postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database`}}' + # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num". + # + # - alert: PGMinimumVersion + # expr: ccp_postgresql_version_current < 110005 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: '{{`{{ $labels.job }} is not running at least version 11.5 of PostgreSQL`}}' + # Whether a system switches from primary to replica or vice versa must be configured per named job. + # No way to tell what value a system is supposed to be without a rule expression for that specific system + # 2 to 1 means it changed from primary to replica. 1 to 2 means it changed from replica to primary + # Set this alert for each system that you want to monitor a recovery status change + # Below is an example for a target job called "Replica" and watches for the value to change above 1 which means it's no longer a replica + # + # - alert: PGRecoveryStatusSwitch_Replica + # expr: ccp_is_in_recovery_status{job="Replica"} > 1 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: '{{`{{ $labels.job }} has changed from replica to primary`}}' + # Absence alerts must be configured per named job, otherwise there's no way to know which job is down + # Below is an example for a target job called "Prod" + # - alert: PGConnectionAbsent_Prod + # expr: absent(ccp_connection_stats_max_connections{job="Prod"}) + # for: 10s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.' + # Optional monitor for changes to pg_settings (postgresql.conf) system catalog. + # A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum(). + # If metric returns 0, then NO settings have changed for either pg_settings since last known valid state + # If metric returns 1, then pg_settings have changed since last known valid state + # To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state. + # - alert: PGSettingsChecksum + # expr: ccp_pg_settings_checksum > 0 + # for 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # description: '{{`Configuration settings on {{ $labels.job }} have changed from previously known valid state. To reset current config to a valid state after alert fires, run monitor.pg_settings_checksum_set_valid().`}}' + # summary: 'PGSQL Instance settings checksum' + # Monitor for data block checksum failures. 
Only works in PG12+ + # - alert: PGDataChecksum + # expr: ccp_data_checksum_failure > 0 + # for 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # description: '{{`{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.`}}' + # summary: 'PGSQL Data Checksum failure' + - alert: PGIdleTxn + expr: ccp_connection_stats_max_idle_in_txn_time > 300 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.`}}' + summary: 'PGSQL Instance idle transactions' + - alert: PGIdleTxn + expr: ccp_connection_stats_max_idle_in_txn_time > 900 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.`}}' + summary: 'PGSQL Instance idle transactions' + - alert: PGQueryTime + expr: ccp_connection_stats_max_query_time > 43200 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} has at least one query running for over 12 hours.`}}' + summary: 'PGSQL Max Query Runtime' + - alert: PGQueryTime + expr: ccp_connection_stats_max_query_time > 86400 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} has at least one query running for over 1 day.`}}' + summary: 'PGSQL Max Query Runtime' + - alert: PGConnPerc + expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%`}})' + summary: 'PGSQL Instance connections' + - alert: PGConnPerc + expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)`}}' + summary: 'PGSQL Instance connections' + - alert: PGDiskSize + expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | 
indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.deployment }} over 75% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%`}}' + summary: PGSQL Instance usage warning + - alert: PGDiskSize + expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 90 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.deployment }} over 90% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%`}}' + summary: 'PGSQL Instance size critical' + - alert: PGReplicationByteLag + expr: ccp_replication_status_byte_lag > 5.24288e+07 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.`}}' + summary: 'PGSQL Instance replica lag warning' + - alert: PGReplicationByteLag + expr: ccp_replication_status_byte_lag > 1.048576e+08 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.`}}' + summary: 'PGSQL Instance replica lag warning' + - alert: PGReplicationSlotsInactive + expr: ccp_replication_slots_active == 0 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} has one or more inactive replication slots`}}' + summary: 'PGSQL Instance inactive replication slot' + - alert: PGXIDWraparound + expr: ccp_transaction_wraparound_percent_towards_wraparound > 50 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.`}}' + summary: '{{`PGSQL Instance {{ $labels.job }} transaction id wraparound imminent`}}' + - alert: PGXIDWraparound + expr: ccp_transaction_wraparound_percent_towards_wraparound > 75 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.`}}' + summary: 'PGSQL Instance transaction id wraparound imminent' + - alert: PGEmergencyVacuum + expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110 + for: 60s + labels: + service: postgresql + severity: warning + severity_num: 200 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- 
end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.`}}' + summary: 'PGSQL Instance emergency vacuum imminent' + - alert: PGEmergencyVacuum + expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.`}}' + summary: 'PGSQL Instance emergency vacuum imminent' + - alert: PGArchiveCommandStatus + expr: ccp_archive_command_status_seconds_since_last_fail > 300 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`PGSQL Instance {{ $labels.job }} has a recent failing archive command`}}' + summary: 'Seconds since the last recorded failure of the archive_command' + - alert: PGSequenceExhaustion + expr: ccp_sequence_exhaustion_count > 0 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75`}}' + - alert: PGSettingsPendingRestart + expr: ccp_settings_pending_restart_count > 0 + for: 60s + labels: + service: postgresql + severity: critical + severity_num: 300 +{{- if .Values.alertingConfig.additionalAlertLabels }} +{{ toYaml .Values.alertingConfig.additionalAlertLabels | indent 8 }} +{{- end }} + annotations: + description: '{{`One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.`}}' + ########## PGBACKREST RULES ########## + # + # Uncomment and customize one or more of these rules to monitor your pgbackrest backups. + # Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full + # And differentials are considered incrementals since incrementals will be based off the last diff if one exists + # This avoid false alerts, for example when you don't run diff/incr backups on the days that you run a full + # Stanza should also be set if different intervals are expected for each stanza. + # Otherwise rule will be applied to all stanzas returned on target system if not set. 
+ # + # Relevant metric names are: + # ccp_backrest_last_full_time_since_completion_seconds + # ccp_backrest_last_incr_time_since_completion_seconds + # ccp_backrest_last_diff_time_since_completion_seconds + # + # - alert: PGBackRestLastCompletedFull_main + # expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: '{{`Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.`}}' + # + # - alert: PGBackRestLastCompletedIncr_main + # expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: '{{`Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.`}}' + # + # + # Runtime monitoring is handled with a single metric: + # + # ccp_backrest_last_runtime_backup_runtime_seconds + # + # Runtime monitoring should have the "backup_type" label set. + # Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr) + # Stanza should also be set if runtimes per stanza have different expected times + # + # - alert: PGBackRestLastRuntimeFull_main + # expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours' + # + # - alert: PGBackRestLastRuntimeDiff_main + # expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600 + # for: 60s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour' + ## + # + ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires. + ## An absence alert must be configured explicitly for each target (job) that backups are being monitored. + ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr). + ## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this + ## check gives a clearer answer as to what is causing it and that something is wrong with the backups. + # + # - alert: PGBackrestAbsentFull_Prod + # expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"}) + # for: 10s + # labels: + # service: postgresql + # severity: critical + # severity_num: 300 + # annotations: + # description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.' +{{- end }} diff --git a/helm/postgres/values.yaml b/helm/postgres/values.yaml index c9283e28..5be173e0 100644 --- a/helm/postgres/values.yaml +++ b/helm/postgres/values.yaml @@ -39,6 +39,10 @@ # below. # monitoring: false +# alerting enables the deployment of the prometheusrule for postgresql metrics in prometheus. +# You need to enable monitoring to use alerting. +# alerting: false + ################### # Image Overrides # ################### @@ -261,6 +265,12 @@ # "monitoring" setting. 
# monitoringConfig: {} +##################### +# Alerting Settings # +##################### +#alertingConfig: +# additionalAlertLabels: {} + ####################### # Kubernetes Settings # #######################
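
Example values.yaml settings that enable both new templates (a minimal sketch; the monitoring, alerting and alertingConfig keys come from the chart's values.yaml above, while the label names and values under additionalAlertLabels are purely illustrative):

    monitoring: true
    alerting: true
    alertingConfig:
      additionalAlertLabels:
        environment: production
        team: database

The rendered manifests can be inspected locally before installing, for example with Helm 3: helm template my-postgres ./helm/postgres -f my-values.yaml --show-only templates/podmonitor.yaml --show-only templates/prometheusrule.yaml (release name and values file are placeholders; the chart path is relative to the repository root).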
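
For reference, a sketch of what additionalAlertLabels does in the rendered rules, assuming the example values above: the map is appended to each alert's labels block via toYaml ... | indent 8, so every alert carries the extra labels alongside service, severity and severity_num, roughly like this (illustrative output, not actual chart output):

      labels:
        service: postgresql
        severity: critical
        severity_num: 300
        environment: production
        team: database

Here environment and team are the illustrative labels from the example values; the actual extra labels depend entirely on what is configured under alertingConfig.additionalAlertLabels.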