health: make alarms less sensitive (#10688)

author: Ilya Mashchenko <ilya@netdata.cloud> 2021-03-09 16:11:41 +0300
committer: GitHub <noreply@github.com> 2021-03-09 16:11:41 +0300
commit: dc365288d11a010650a6bd3c8de14222cc25368d (patch)
tree: a97dc868e20b2aa2199a73df5b88d803263b23f0 /health
parent: 2e6de7e835485b719a95a38904e0903bbd6f7bb1 (diff)
13 files changed, 83 insertions, 160 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 399d6df5ab..0802dc750f 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -29,7 +29,6 @@ dist_healthconfig_DATA = \
     health.d/anomalies.conf \
     health.d/apache.conf \
     health.d/apcupsd.conf \
-    health.d/apps_plugin.conf \
     health.d/backend.conf \
     health.d/bcache.conf \
     health.d/beanstalkd.conf \
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
deleted file mode 100644
index 9a27bc6ba1..0000000000
--- a/health/health.d/apps_plugin.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-#  disabled due to https://github.com/netdata/netdata/issues/10327
-#
-#   alarm: used_file_descriptors
-#      on: apps.files
-#   hosts: *
-#    calc: $fdperc
-#   units: %
-#   every: 5s
-#    warn: $this > (($status >= $WARNING)  ? (75) : (80))
-#    crit: $this > (($status == $CRITICAL) ? (85) : (90))
-#   delay: down 5m multiplier 1.5 max 1h
-#    info: Peak percentage of file descriptors used
-#      to: sysadmin
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index f0da9ac5e5..cbaf18e8ae 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,13 +1,12 @@
 
 template: bcache_cache_errors
       on: disk.bcache_cache_read_races
-  lookup: sum -10m unaligned absolute
+  lookup: sum -1m unaligned absolute
    units: errors
    every: 1m
     warn: $this > 0
-    crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
-   delay: down 1h multiplier 1.5 max 2h
-    info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+   delay: up 2m down 1h multiplier 1.5 max 2h
+    info: the number of times that, while data was being read from the cache, the bucket was reused and invalidated, during the last minute (when this occurs the data is reread from the backing device)
       to: sysadmin
 
 template: bcache_cache_dirty
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6ff..e0a55a3e72 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -2,12 +2,11 @@
 
 template: cluster_space_usage
       on: ceph.general_usage
-    calc: $avail * 100 / ($avail + $used)
+    calc: $used * 100 / ($used + $avail)
    units: %
-   every: 10s
-    warn: $this < 10
-    crit: $this < 1
+   every: 1m
+    warn: $this > (($status >= $WARNING ) ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 5m multiplier 1.2 max 1h
-    info: ceph disk usage is almost full
+    info: current ceph disk usage
       to: sysadmin
-
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 9c194ced2f..ba0cbbb5c1 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* *
 # we will use it in the next template to find
 # the hours remaining
 
-template: disk_fill_rate
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: GB/hour
-    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+# template: disk_fill_rate
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: GB/hour
+#     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
 
 
 # calculate the hours remaining
 # if the disk continues to fill
 # in this rate
 
-template: out_of_disk_space_time
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
-      to: sysadmin
+# template: out_of_disk_space_time
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+#       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -91,34 +91,34 @@ families: *
 # we will use it in the next template to find
 # the hours remaining
 
-template: disk_inode_rate
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: inodes/hour
-    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+# template: disk_inode_rate
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: inodes/hour
+#     info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
 
 # calculate the hours remaining
 # if the disk inodes are allocated
 # in this rate
 
-template: out_of_disk_inodes_time
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
-      to: sysadmin
+# template: out_of_disk_inodes_time
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+#       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -142,7 +142,7 @@ families: *
     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
     info: the percentage of time the disk was busy, during the last 10 minutes
-      to: sysadmin
+      to: silent
 
 
 # raise an alarm if the disk backlog
@@ -164,4 +164,4 @@ families: *
     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
     info: average of the kernel estimated disk backlog, for the last 10 minutes
-      to: sysadmin
+      to: silent
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec139..d67c753af6 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -7,7 +7,7 @@
       on: system.entropy
       os: linux
    hosts: *
-  lookup: min -10m unaligned
+  lookup: min -5m unaligned
    units: entries
    every: 5m
     warn: $this < (($status >= $WARNING) ? (200) : (100))
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index ee0c54b8e7..ce7f77551b 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -4,7 +4,7 @@
 # Calculate the base trigger point for the load average alarms.
 # This is the maximum number of CPU's in the system over the past 1
 # minute, with a special case for a single CPU of setting the trigger at 2.
-   alarm: load_trigger
+   alarm: load_cpu_number
       on: system.load
       os: linux
    hosts: *
@@ -16,6 +16,7 @@
 # Send alarms if the load average is unusually high.
 # These intentionally _do not_ calculate the average over the sampled
 # time period because the values being checked already are averages.
+
    alarm: load_average_15
       on: system.load
       os: linux
@@ -23,8 +24,7 @@
   lookup: max -1m unaligned of load15
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (1.75 * $load_trigger) : (2 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
    delay: down 15m multiplier 1.5 max 1h
     info: fifteen-minute load average
       to: sysadmin
@@ -36,8 +36,7 @@
   lookup: max -1m unaligned of load5
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (3.5 * $load_trigger) : (4 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
    delay: down 15m multiplier 1.5 max 1h
     info: five-minute load average
       to: sysadmin
@@ -49,8 +48,7 @@
   lookup: max -1m unaligned of load1
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (7 * $load_trigger) : (8 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
    delay: down 15m multiplier 1.5 max 1h
     info: one-minute load average
       to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 261290e513..96c0f3cde6 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -23,9 +23,8 @@
      calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (90))
-    delay: down 1m multiplier 1.5 max 1h
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
      info: interface received bandwidth usage over net device speed max
        to: sysadmin
 
@@ -38,9 +37,8 @@
      calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (90))
-    delay: down 1m multiplier 1.5 max 1h
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
      info: interface sent bandwidth usage over net device speed max
        to: sysadmin
 
@@ -62,10 +60,7 @@ families: *
   lookup: sum -10m unaligned absolute of inbound
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
     info: interface inbound dropped packets in the last 10 minutes
-      to: sysadmin
 
 template: outbound_packets_dropped
       on: net.drops
@@ -75,10 +70,7 @@ families: *
   lookup: sum -10m unaligned absolute of outbound
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
     info: interface outbound dropped packets in the last 10 minutes
-      to: sysadmin
 
 template: inbound_packets_dropped_ratio
       on: net.packets
@@ -86,12 +78,11 @@ template: inbound_packets_dropped_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute of received
-    calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
     info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
       to: sysadmin
 
@@ -101,12 +92,11 @@ template: outbound_packets_dropped_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute of sent
-    calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
     info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
       to: sysadmin
 
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 1d07752cc2..473aea4d79 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -1,19 +1,6 @@
 
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-   alarm: netfilter_last_collected_secs
-      on: netfilter.conntrack_sockets
-      os: linux
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
    alarm: netfilter_conntrack_full
       on: netfilter.conntrack_sockets
       os: linux
@@ -22,8 +9,8 @@
     calc: $this * 100 / $netfilter_conntrack_max
    units: %
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
     info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
       to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index 293f1aa0d3..786a655d48 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -6,8 +6,8 @@
     calc: $active * 100 / $pidmax
    units: %
    every: 5s
-    warn: $this > (($status >= $WARNING)  ? (75) : (80))
-    crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
     info: the percentage of active processes
       to: sysadmin
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index f920b0807d..7cf8134e30 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -10,9 +10,8 @@
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (10) : (20))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: up 0 down 15m multiplier 1.5 max 1h
+    warn: $this > (($status >= $WARNING)  ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
     info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
       to: sysadmin
 
@@ -23,8 +22,7 @@
     calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (15) : (20))
-    crit: $this > (($status == $CRITICAL) ? (40) : (50))
+    warn: $this > (($status >= $WARNING)  ? (40) : (50))
    delay: up 30s down 15m multiplier 1.5 max 1h
     info: the swap memory used, as a percentage of the system RAM
       to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 36a550a5db..f41eee0ea5 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -2,21 +2,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
 # -----------------------------------------------------------------------------
-
-   alarm: ipv4_tcphandshake_last_collected_secs
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
 # tcp resets this host sends
 
    alarm: 1m_ipv4_tcp_resets_sent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 1e47b5c8bf..7cf254e754 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -2,21 +2,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
 # -----------------------------------------------------------------------------
-
-   alarm: ipv4_udperrors_last_collected_secs
-      on: ipv4.udperrors
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
 # UDP receive buffer errors
 
    alarm: 1m_ipv4_udp_receive_buffer_errors
@@ -26,10 +11,9 @@
   lookup: average -1m unaligned absolute of RcvbufErrors
    units: errors
    every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
     info: average number of UDP receive buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
+   delay: up 1m down 60m multiplier 1.2 max 2h
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -42,8 +26,7 @@
   lookup: average -1m unaligned absolute of SndbufErrors
    units: errors
    every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
     info: number of UDP send buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
+   delay: up 1m down 60m multiplier 1.2 max 2h
       to: sysadmin
author	Ilya Mashchenko <ilya@netdata.cloud>	2021-03-09 16:11:41 +0300
committer	GitHub <noreply@github.com>	2021-03-09 16:11:41 +0300
commit	dc365288d11a010650a6bd3c8de14222cc25368d (patch)
tree	a97dc868e20b2aa2199a73df5b88d803263b23f0 /health
parent	2e6de7e835485b719a95a38904e0903bbd6f7bb1 (diff)