summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorIlya Mashchenko <ilya@netdata.cloud>2022-09-19 15:26:43 +0300
committerGitHub <noreply@github.com>2022-09-19 15:26:43 +0300
commitd6995cabcf5fa021385ccaf194315c886836403d (patch)
tree75ca344355092ecc7c385c6eb7725c71d3a1eceb /health
parent8c3c83d52876152cecceb0564f6bc1bc2de8eb7a (diff)
feat(health): add Postgres alarms (#13671)
Diffstat (limited to 'health')
-rw-r--r--health/health.d/postgres.conf200
1 files changed, 199 insertions, 1 deletions
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
index 47267555bf..76975f30e6 100644
--- a/health/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
@@ -1,6 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- template: total_connection_utilization
+ template: postgres_total_connection_utilization
on: postgres.connections_utilization
class: Utilization
type: Database
@@ -14,3 +14,201 @@ component: PostgreSQL
delay: down 15m multiplier 1.5 max 1h
info: average total connection utilization over the last minute
to: dba
+
+ template: postgres_acquired_locks_utilization
+ on: postgres.locks_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average acquired locks utilization over the last minute
+ to: dba
+
+ template: postgres_txid_exhaustion_perc
+ on: postgres.txid_exhaustion_perc
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $txid_exhaustion
+ units: %
+ every: 1m
+ warn: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ info: percent towards TXID wraparound
+ to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+ on: postgres.db_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database over the last minute
+ to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+ on: postgres.db_transactions_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -5m unaligned of rollback
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average aborted transactions percentage in db $label:database over the last five minutes
+ to: dba
+
+ template: postgres_db_deadlocks_rate
+ on: postgres.db_deadlocks_rate
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: sum -1m unaligned of deadlocks
+ units: deadlocks
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of deadlocks detected in db $label:database in the last minute
+ to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+ on: postgres.table_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_index_cache_io_ratio
+ on: postgres.table_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+ on: postgres.table_toast_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+ on: postgres.table_toast_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_bloat_size_perc
+ on: postgres.table_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table
+ to: dba
+
+ template: postgres_table_last_autovacuum_time
+ on: postgres.table_autovacuum_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+ to: dba
+
+ template: postgres_table_last_autoanalyze_time
+ on: postgres.table_autoanalyze_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+ to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+ on: postgres.index_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table index $label:index
+ to: dba