diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2021-03-09 16:11:41 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-09 16:11:41 +0300 |
commit | dc365288d11a010650a6bd3c8de14222cc25368d (patch) | |
tree | a97dc868e20b2aa2199a73df5b88d803263b23f0 /health | |
parent | 2e6de7e835485b719a95a38904e0903bbd6f7bb1 (diff) |
health: make alarms less sensitive (#10688)
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 1 | ||||
-rw-r--r-- | health/health.d/apps_plugin.conf | 15 | ||||
-rw-r--r-- | health/health.d/bcache.conf | 7 | ||||
-rw-r--r-- | health/health.d/ceph.conf | 11 | ||||
-rw-r--r-- | health/health.d/disks.conf | 96 | ||||
-rw-r--r-- | health/health.d/entropy.conf | 2 | ||||
-rw-r--r-- | health/health.d/load.conf | 12 | ||||
-rw-r--r-- | health/health.d/net.conf | 30 | ||||
-rw-r--r-- | health/health.d/netfilter.conf | 17 | ||||
-rw-r--r-- | health/health.d/processes.conf | 4 | ||||
-rw-r--r-- | health/health.d/swap.conf | 8 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf | 15 | ||||
-rw-r--r-- | health/health.d/udp_errors.conf | 25 |
13 files changed, 83 insertions, 160 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index 399d6df5ab..0802dc750f 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -29,7 +29,6 @@ dist_healthconfig_DATA = \ health.d/anomalies.conf \ health.d/apache.conf \ health.d/apcupsd.conf \ - health.d/apps_plugin.conf \ health.d/backend.conf \ health.d/bcache.conf \ health.d/beanstalkd.conf \ diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf deleted file mode 100644 index 9a27bc6ba1..0000000000 --- a/health/health.d/apps_plugin.conf +++ /dev/null @@ -1,15 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# disabled due to https://github.com/netdata/netdata/issues/10327 -# -# alarm: used_file_descriptors -# on: apps.files -# hosts: * -# calc: $fdperc -# units: % -# every: 5s -# warn: $this > (($status >= $WARNING) ? (75) : (80)) -# crit: $this > (($status == $CRITICAL) ? (85) : (90)) -# delay: down 5m multiplier 1.5 max 1h -# info: Peak percentage of file descriptors used -# to: sysadmin diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index f0da9ac5e5..cbaf18e8ae 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,13 +1,12 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - lookup: sum -10m unaligned absolute + lookup: sum -1m unaligned absolute units: errors every: 1m warn: $this > 0 - crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) ) - delay: down 1h multiplier 1.5 max 2h - info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing) + delay: up 2m down 1h multiplier 1.5 max 2h + info: the number of times the data was being read from the cache, the bucket was reused and invalidated, during the last 10 mins (when this occurs the data is reread from the backing device) to: sysadmin template: bcache_cache_dirty diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index de16f7b6ff..e0a55a3e72 100644 --- a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ -2,12 +2,11 @@ template: cluster_space_usage on: ceph.general_usage - calc: $avail * 100 / ($avail + $used) + calc: $used * 100 / ($used + $avail) units: % - every: 10s - warn: $this < 10 - crit: $this < 1 + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 5m multiplier 1.2 max 1h - info: ceph disk usage is almost full + info: current ceph disk usage to: sysadmin - diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 9c194ced2f..ba0cbbb5c1 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour +# template: disk_fill_rate +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: GB/hour +# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * -families: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour - to: sysadmin +# template: out_of_disk_space_time +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -91,34 +91,34 @@ families: * # we will use it in the next template to find # the hours remaining -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour +# template: disk_inode_rate +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: inodes/hour +# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour # calculate the hours remaining # if the disk inodes are allocated # in this rate -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * -families: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: sysadmin +# template: out_of_disk_inodes_time +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -142,7 +142,7 @@ families: * crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: the percentage of time the disk was busy, during the last 10 minutes - to: sysadmin + to: silent # raise an alarm if the disk backlog @@ -164,4 +164,4 @@ families: * crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average of the kernel estimated disk backlog, for the last 10 minutes - to: sysadmin + to: silent diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 66d44ec139..d67c753af6 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -7,7 +7,7 @@ on: system.entropy os: linux hosts: * - lookup: min -10m unaligned + lookup: min -5m unaligned units: entries every: 5m warn: $this < (($status >= $WARNING) ? (200) : (100)) diff --git a/health/health.d/load.conf b/health/health.d/load.conf index ee0c54b8e7..ce7f77551b 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -4,7 +4,7 @@ # Calculate the base trigger point for the load average alarms. # This is the maximum number of CPU's in the system over the past 1 # minute, with a special case for a single CPU of setting the trigger at 2. - alarm: load_trigger + alarm: load_cpu_number on: system.load os: linux hosts: * @@ -16,6 +16,7 @@ # Send alarms if the load average is unusually high. # These intentionally _do not_ calculate the average over the sampled # time period because the values being checked already are averages. + alarm: load_average_15 on: system.load os: linux @@ -23,8 +24,7 @@ lookup: max -1m unaligned of load15 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) delay: down 15m multiplier 1.5 max 1h info: fifteen-minute load average to: sysadmin @@ -36,8 +36,7 @@ lookup: max -1m unaligned of load5 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) delay: down 15m multiplier 1.5 max 1h info: five-minute load average to: sysadmin @@ -49,8 +48,7 @@ lookup: max -1m unaligned of load1 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) delay: down 15m multiplier 1.5 max 1h info: one-minute load average to: sysadmin diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 261290e513..96c0f3cde6 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -23,9 +23,8 @@ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h info: interface received bandwidth usage over net device speed max to: sysadmin @@ -38,9 +37,8 @@ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h info: interface sent bandwidth usage over net device speed max to: sysadmin @@ -62,10 +60,7 @@ families: * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h info: interface inbound dropped packets in the last 10 minutes - to: sysadmin template: outbound_packets_dropped on: net.drops @@ -75,10 +70,7 @@ families: * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h info: interface outbound dropped packets in the last 10 minutes - to: sysadmin template: inbound_packets_dropped_ratio on: net.packets @@ -86,12 +78,11 @@ template: inbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes to: sysadmin @@ -101,12 +92,11 @@ template: outbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute of sent - calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) + calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes to: sysadmin diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 1d07752cc2..473aea4d79 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -1,19 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: netfilter_last_collected_secs - on: netfilter.conntrack_sockets - os: linux - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets os: linux @@ -22,8 +9,8 @@ calc: $this * 100 / $netfilter_conntrack_max units: % every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) delay: down 5m multiplier 1.5 max 1h info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index 293f1aa0d3..786a655d48 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -6,8 +6,8 @@ calc: $active * 100 / $pidmax units: % every: 5s - warn: $this > (($status >= $WARNING) ? (75) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) delay: down 5m multiplier 1.5 max 1h info: the percentage of active processes to: sysadmin diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index f920b0807d..7cf8134e30 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -10,9 +10,8 @@ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 1m - warn: $this > (($status >= $WARNING) ? (10) : (20)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) - delay: up 0 down 15m multiplier 1.5 max 1h + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM to: sysadmin @@ -23,8 +22,7 @@ calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 10s - warn: $this > (($status >= $WARNING) ? (15) : (20)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) + warn: $this > (($status >= $WARNING) ? (40) : (50)) delay: up 30s down 15m multiplier 1.5 max 1h info: the swap memory used, as a percentage of the system RAM to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 36a550a5db..f41eee0ea5 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -2,21 +2,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent # ----------------------------------------------------------------------------- - - alarm: ipv4_tcphandshake_last_collected_secs - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# ----------------------------------------------------------------------------- # tcp resets this host sends alarm: 1m_ipv4_tcp_resets_sent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 1e47b5c8bf..7cf254e754 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -2,21 +2,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent # ----------------------------------------------------------------------------- - - alarm: ipv4_udperrors_last_collected_secs - on: ipv4.udperrors - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - -# ----------------------------------------------------------------------------- # UDP receive buffer errors alarm: 1m_ipv4_udp_receive_buffer_errors @@ -26,10 +11,9 @@ lookup: average -1m unaligned absolute of RcvbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) + warn: $this > (($status >= $WARNING) ? (0) : (10)) info: average number of UDP receive buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin # ----------------------------------------------------------------------------- @@ -42,8 +26,7 @@ lookup: average -1m unaligned absolute of SndbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) + warn: $this > (($status >= $WARNING) ? (0) : (10)) info: number of UDP send buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin |