Fine tune various alarm values. (#7322)

* Fix formatting in alarm configurations. This makes sure everything is lined up properly so that the alarm definitions are easier to read.
* Make TCP Accept Queue alarms much less aggressive. This switches the alarms to use averages instead of sums, and raises the trip points so they are more lenient, as both of these values may be non-zero even in normal operation of a system.
* Make softnet alarms less aggressive. This decreases the sampling window from 10 minutes to 1 minute, switches to using an average instead of a sum, and raises the trigger thresholds so they are more lenient. This one will need to be watched, as the resultant values may be too lenient for some systems.
* Tweak UDP alarms to work like the TCP alarms. Just to ensure consistency.
author: Austin S. Hemmelgarn <ahferroin7@gmail.com> 2019-11-15 13:14:43 -0500
committer: Ilya Mashchenko <ilya@netdata.cloud> 2019-11-15 21:14:43 +0300
commit: ef721290f03ec9d0292f062672feaad78d7fe3a9 (patch)
tree: 3a8e57a0117bfe166317dfcdd407ee861eb90ef1 /health
parent: ac90f27a580aad0c4d20c5354657b34e5ceed29e (diff)
8 files changed, 120 insertions, 121 deletions
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3e..ecf3b84a82 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
 
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
       on: dnsmasq_dhcp.dhcp_range_utilization
    every: 10s
    units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be14..73b87dcc03 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
    alarm: adapter_state
       on: megacli.adapter_degraded
    units: is degraded
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     crit: $this > 0
     info: adapter state
       to: sysadmin
 
-   template: bbu_relative_charge
+template: bbu_relative_charge
       on: megacli.bbu_relative_charge
    units: percent
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
     info: BBU relative state of charge
       to: sysadmin
 
-   template: bbu_cycle_count
+template: bbu_cycle_count
       on: megacli.bbu_cycle_count
    units: cycle count
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this >= 100
     crit: $this >= 500
     info: BBU cycle count
       to: sysadmin
 
-    alarm: pd_media_errors
+   alarm: pd_media_errors
       on: megacli.pd_media_error
    units: media errors
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive media errors
       to: sysadmin
 
-    alarm: pd_predictive_failures
+   alarm: pd_predictive_failures
       on: megacli.pd_predictive_failure
    units: predictive failures
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive predictive failures
       to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982b..e43cb16912 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
    every: 10s
    units: %
-   warn: $this > (($status >= $WARNING)?(200):(5000))
-   crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
-   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
-     to: sysadmin
+    warn: $this > (($status >= $WARNING)?(200):(5000))
+    crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+      to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239d..b255d35f90 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
 
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
 
 template: pihole_last_collected_secs
       on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
     info: number of seconds since the last successful data collection
       to: webmaster
 
-  # Blocked DNS queries.
+# Blocked DNS queries.
 
- template: pihole_blocked_queries
-       on: pihole.dns_queries_percentage
-    every: 10s
-    units: %
-     calc: $blocked
-     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-     crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
-    delay: up 2m down 5m
-     info: percentage of blocked dns queries for the last 24 hour
-       to: sysadmin
-
-
-  # Blocklist last update time.
-  # Default update interval is a week.
+template: pihole_blocked_queries
+      on: pihole.dns_queries_percentage
+   every: 10s
+   units: %
+    calc: $blocked
+    warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+   delay: up 2m down 5m
+    info: percentage of blocked dns queries for the last 24 hour
+      to: sysadmin
 
- template: pihole_blocklist_last_update
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: seconds
-     calc: $ago
-     warn: $this > 60 * 60 * 24 * 8
-     crit: $this > 60 * 60 * 24 * 8 * 2
-     info: blocklist last update time
-       to: sysadmin
 
+# Blocklist last update time.
+# Default update interval is a week.
 
-  # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: seconds
+    calc: $ago
+    warn: $this > 60 * 60 * 24 * 8
+    crit: $this > 60 * 60 * 24 * 8 * 2
+    info: blocklist last update time
+      to: sysadmin
 
- template: pihole_blocklist_gravity_file
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: boolean
-     calc: $file_exists
-     crit: $this != 1
-    delay: up 2m down 5m
-     info: gravity file existence
-       to: sysadmin
+# Gravity file check (gravity.list).
 
+template: pihole_blocklist_gravity_file
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: boolean
+    calc: $file_exists
+    crit: $this != 1
+   delay: up 2m down 5m
+    info: gravity file existence
+      to: sysadmin
 
-  # Pi-hole's ability to block unwanted domains.
-  # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
 
- template: pihole_status
-       on: pihole.unwanted_domains_blocking_status
-    every: 10s
-    units: boolean
-     calc: $enabled
-     warn: $this != 1
-    delay: up 2m down 5m
-     info: unwanted domains blocking status
-       to: sysadmin
+template: pihole_status
+      on: pihole.unwanted_domains_blocking_status
+   every: 10s
+   units: boolean
+    calc: $enabled
+    warn: $this != 1
+   delay: up 2m down 5m
+    info: unwanted domains blocking status
+      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496d..15e8e84640 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
       to: sysadmin
 
 ## FreeBSD
-alarm: ram_in_use
-   on: system.ram
-   os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING)  ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
-   to: sysadmin
+   alarm: ram_in_use
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: system RAM usage
+      to: sysadmin
 
- alarm: ram_available
-    on: system.ram
-    os: freebsd
- hosts: *
-  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
-  warn: $this < (($status >= $WARNING)  ? (15) : (10))
-  crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
-  info: estimated amount of RAM available for userspace processes, without causing swapping
-    to: sysadmin
+   alarm: ram_available
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+   units: %
+   every: 10s
+    warn: $this < (($status >= $WARNING)  ? (15) : (10))
+    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated amount of RAM available for userspace processes, without causing swapping
+      to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd6..ff36486261 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
 
 # check for common /proc/net/softnet_stat errors
 
-   alarm: 10min_netdev_backlog_exceeded
+   alarm: 1min_netdev_backlog_exceeded
       on: system.softnet_stat
       os: linux
    hosts: *
-  lookup: sum -10m unaligned absolute of dropped
+  lookup: average -1m unaligned absolute of dropped
    units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+    info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
       to: sysadmin
 
-   alarm: 10min_netdev_budget_ran_outs
+   alarm: 1min_netdev_budget_ran_outs
       on: system.softnet_stat
       os: linux
    hosts: *
-  lookup: sum -10m unaligned absolute of squeezed
+  lookup: average -1m unaligned absolute of squeezed
    units: events
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (0) : (10))
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+    info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
       to: silent
 
    alarm: 10min_netisr_backlog_exceeded
       on: system.softnet_stat
       os: freebsd
    hosts: *
-   lookup: sum -10m unaligned absolute of qdrops
+  lookup: average -1m unaligned absolute of qdrops
    units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+    info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 5ac1aa8e00..3b30725772 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenOverflows
+  lookup: average -60s unaligned absolute of ListenOverflows
    units: overflows
    every: 10s
-    crit: $this > 0
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+    info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
       to: sysadmin
 
 # THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenDrops
+  lookup: average -60s unaligned absolute of ListenDrops
    units: drops
    every: 10s
-#    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (150))
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+    info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
       to: sysadmin
 
 
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f56..1e47b5c8bf 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
       on: ipv4.udperrors
       os: linux freebsd
    hosts: *
-  lookup: sum -1m unaligned absolute of RcvbufErrors
+  lookup: average -1m unaligned absolute of RcvbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
-    info: number of UDP receive buffer errors during the last minute
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    info: average number of UDP receive buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
 
@@ -39,11 +39,11 @@
       on: ipv4.udperrors
       os: linux
    hosts: *
-  lookup: sum -1m unaligned absolute of SndbufErrors
+  lookup: average -1m unaligned absolute of SndbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
-    info: number of UDP send buffer errors during the last minute
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    info: average number of UDP send buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
author	Austin S. Hemmelgarn <ahferroin7@gmail.com>	2019-11-15 13:14:43 -0500
committer	Ilya Mashchenko <ilya@netdata.cloud>	2019-11-15 21:14:43 +0300
commit	ef721290f03ec9d0292f062672feaad78d7fe3a9 (patch)
tree	3a8e57a0117bfe166317dfcdd407ee861eb90ef1 /health
parent	ac90f27a580aad0c4d20c5354657b34e5ceed29e (diff)