summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorIlya Mashchenko <ilya@netdata.cloud>2021-03-09 16:11:41 +0300
committerGitHub <noreply@github.com>2021-03-09 16:11:41 +0300
commitdc365288d11a010650a6bd3c8de14222cc25368d (patch)
treea97dc868e20b2aa2199a73df5b88d803263b23f0 /health
parent2e6de7e835485b719a95a38904e0903bbd6f7bb1 (diff)
health: make alarms less sensitive (#10688)
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am1
-rw-r--r--health/health.d/apps_plugin.conf15
-rw-r--r--health/health.d/bcache.conf7
-rw-r--r--health/health.d/ceph.conf11
-rw-r--r--health/health.d/disks.conf96
-rw-r--r--health/health.d/entropy.conf2
-rw-r--r--health/health.d/load.conf12
-rw-r--r--health/health.d/net.conf30
-rw-r--r--health/health.d/netfilter.conf17
-rw-r--r--health/health.d/processes.conf4
-rw-r--r--health/health.d/swap.conf8
-rw-r--r--health/health.d/tcp_resets.conf15
-rw-r--r--health/health.d/udp_errors.conf25
13 files changed, 83 insertions, 160 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 399d6df5ab..0802dc750f 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -29,7 +29,6 @@ dist_healthconfig_DATA = \
health.d/anomalies.conf \
health.d/apache.conf \
health.d/apcupsd.conf \
- health.d/apps_plugin.conf \
health.d/backend.conf \
health.d/bcache.conf \
health.d/beanstalkd.conf \
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
deleted file mode 100644
index 9a27bc6ba1..0000000000
--- a/health/health.d/apps_plugin.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# disabled due to https://github.com/netdata/netdata/issues/10327
-#
-# alarm: used_file_descriptors
-# on: apps.files
-# hosts: *
-# calc: $fdperc
-# units: %
-# every: 5s
-# warn: $this > (($status >= $WARNING) ? (75) : (80))
-# crit: $this > (($status == $CRITICAL) ? (85) : (90))
-# delay: down 5m multiplier 1.5 max 1h
-# info: Peak percentage of file descriptors used
-# to: sysadmin
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index f0da9ac5e5..cbaf18e8ae 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,13 +1,12 @@
template: bcache_cache_errors
on: disk.bcache_cache_read_races
- lookup: sum -10m unaligned absolute
+ lookup: sum -1m unaligned absolute
units: errors
every: 1m
warn: $this > 0
- crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
- delay: down 1h multiplier 1.5 max 2h
- info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+ delay: up 2m down 1h multiplier 1.5 max 2h
+ info: the number of times that, while data was being read from the cache, the bucket was reused and invalidated, during the last minute (when this occurs the data is reread from the backing device)
to: sysadmin
template: bcache_cache_dirty
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6ff..e0a55a3e72 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -2,12 +2,11 @@
template: cluster_space_usage
on: ceph.general_usage
- calc: $avail * 100 / ($avail + $used)
+ calc: $used * 100 / ($used + $avail)
units: %
- every: 10s
- warn: $this < 10
- crit: $this < 1
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 5m multiplier 1.2 max 1h
- info: ceph disk usage is almost full
+ info: current ceph disk usage
to: sysadmin
-
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 9c194ced2f..ba0cbbb5c1 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* *
# we will use it in the next template to find
# the hours remaining
-template: disk_fill_rate
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: GB/hour
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+# template: disk_fill_rate
+# on: disk.space
+# os: linux freebsd
+# hosts: *
+# families: *
+# lookup: min -10m at -50m unaligned of avail
+# calc: ($this - $avail) / (($now - $after) / 3600)
+# every: 1m
+# units: GB/hour
+# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
# calculate the hours remaining
# if the disk continues to fill
# in this rate
-template: out_of_disk_space_time
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
- to: sysadmin
+# template: out_of_disk_space_time
+# on: disk.space
+# os: linux freebsd
+# hosts: *
+# families: *
+# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+# units: hours
+# every: 10s
+# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+# delay: down 15m multiplier 1.2 max 1h
+# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+# to: sysadmin
# -----------------------------------------------------------------------------
@@ -91,34 +91,34 @@ families: *
# we will use it in the next template to find
# the hours remaining
-template: disk_inode_rate
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: inodes/hour
- info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+# template: disk_inode_rate
+# on: disk.inodes
+# os: linux freebsd
+# hosts: *
+# families: *
+# lookup: min -10m at -50m unaligned of avail
+# calc: ($this - $avail) / (($now - $after) / 3600)
+# every: 1m
+# units: inodes/hour
+# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
# calculate the hours remaining
# if the disk inodes are allocated
# in this rate
-template: out_of_disk_inodes_time
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
- to: sysadmin
+# template: out_of_disk_inodes_time
+# on: disk.inodes
+# os: linux freebsd
+# hosts: *
+# families: *
+# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+# units: hours
+# every: 10s
+# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+# delay: down 15m multiplier 1.2 max 1h
+# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+# to: sysadmin
# -----------------------------------------------------------------------------
@@ -142,7 +142,7 @@ families: *
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: the percentage of time the disk was busy, during the last 10 minutes
- to: sysadmin
+ to: silent
# raise an alarm if the disk backlog
@@ -164,4 +164,4 @@ families: *
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average of the kernel estimated disk backlog, for the last 10 minutes
- to: sysadmin
+ to: silent
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec139..d67c753af6 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -7,7 +7,7 @@
on: system.entropy
os: linux
hosts: *
- lookup: min -10m unaligned
+ lookup: min -5m unaligned
units: entries
every: 5m
warn: $this < (($status >= $WARNING) ? (200) : (100))
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index ee0c54b8e7..ce7f77551b 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -4,7 +4,7 @@
# Calculate the base trigger point for the load average alarms.
# This is the maximum number of CPUs in the system over the past 1
# minute, with a special case for a single CPU of setting the trigger at 2.
- alarm: load_trigger
+ alarm: load_cpu_number
on: system.load
os: linux
hosts: *
@@ -16,6 +16,7 @@
# Send alarms if the load average is unusually high.
# These intentionally _do not_ calculate the average over the sampled
# time period because the values being checked already are averages.
+
alarm: load_average_15
on: system.load
os: linux
@@ -23,8 +24,7 @@
lookup: max -1m unaligned of load15
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
delay: down 15m multiplier 1.5 max 1h
info: fifteen-minute load average
to: sysadmin
@@ -36,8 +36,7 @@
lookup: max -1m unaligned of load5
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
delay: down 15m multiplier 1.5 max 1h
info: five-minute load average
to: sysadmin
@@ -49,8 +48,7 @@
lookup: max -1m unaligned of load1
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
delay: down 15m multiplier 1.5 max 1h
info: one-minute load average
to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 261290e513..96c0f3cde6 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -23,9 +23,8 @@
calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
- delay: down 1m multiplier 1.5 max 1h
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
info: interface received bandwidth usage over net device speed max
to: sysadmin
@@ -38,9 +37,8 @@
calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
- delay: down 1m multiplier 1.5 max 1h
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
info: interface sent bandwidth usage over net device speed max
to: sysadmin
@@ -62,10 +60,7 @@ families: *
lookup: sum -10m unaligned absolute of inbound
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
info: interface inbound dropped packets in the last 10 minutes
- to: sysadmin
template: outbound_packets_dropped
on: net.drops
@@ -75,10 +70,7 @@ families: *
lookup: sum -10m unaligned absolute of outbound
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
info: interface outbound dropped packets in the last 10 minutes
- to: sysadmin
template: inbound_packets_dropped_ratio
on: net.packets
@@ -86,12 +78,11 @@ template: inbound_packets_dropped_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
to: sysadmin
@@ -101,12 +92,11 @@ template: outbound_packets_dropped_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute of sent
- calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+ calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 1d07752cc2..473aea4d79 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -1,19 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: netfilter_last_collected_secs
- on: netfilter.conntrack_sockets
- os: linux
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
alarm: netfilter_conntrack_full
on: netfilter.conntrack_sockets
os: linux
@@ -22,8 +9,8 @@
calc: $this * 100 / $netfilter_conntrack_max
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index 293f1aa0d3..786a655d48 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -6,8 +6,8 @@
calc: $active * 100 / $pidmax
units: %
every: 5s
- warn: $this > (($status >= $WARNING) ? (75) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
info: the percentage of active processes
to: sysadmin
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index f920b0807d..7cf8134e30 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -10,9 +10,8 @@
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
every: 1m
- warn: $this > (($status >= $WARNING) ? (10) : (20))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: up 0 down 15m multiplier 1.5 max 1h
+ warn: $this > (($status >= $WARNING) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
to: sysadmin
@@ -23,8 +22,7 @@
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
every: 10s
- warn: $this > (($status >= $WARNING) ? (15) : (20))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
+ warn: $this > (($status >= $WARNING) ? (40) : (50))
delay: up 30s down 15m multiplier 1.5 max 1h
info: the swap memory used, as a percentage of the system RAM
to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 36a550a5db..f41eee0ea5 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -2,21 +2,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
-
- alarm: ipv4_tcphandshake_last_collected_secs
- on: ipv4.tcphandshake
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
# tcp resets this host sends
alarm: 1m_ipv4_tcp_resets_sent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 1e47b5c8bf..7cf254e754 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -2,21 +2,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
-
- alarm: ipv4_udperrors_last_collected_secs
- on: ipv4.udperrors
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
# UDP receive buffer errors
alarm: 1m_ipv4_udp_receive_buffer_errors
@@ -26,10 +11,9 @@
lookup: average -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (10))
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP receive buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
+ delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin
# -----------------------------------------------------------------------------
@@ -42,8 +26,7 @@
lookup: average -1m unaligned absolute of SndbufErrors
units: errors
every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (10))
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP send buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
+ delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin