8 files changed, 120 insertions, 121 deletions
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3e..ecf3b84a82 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
 
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
       on: dnsmasq_dhcp.dhcp_range_utilization
    every: 10s
    units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be14..73b87dcc03 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
    alarm: adapter_state
       on: megacli.adapter_degraded
    units: is degraded
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     crit: $this > 0
     info: adapter state
       to: sysadmin
 
-   template: bbu_relative_charge
+template: bbu_relative_charge
       on: megacli.bbu_relative_charge
    units: percent
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
     info: BBU relative state of charge
       to: sysadmin
 
-   template: bbu_cycle_count
+template: bbu_cycle_count
       on: megacli.bbu_cycle_count
    units: cycle count
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this >= 100
     crit: $this >= 500
     info: BBU cycle count
       to: sysadmin
 
-    alarm: pd_media_errors
+   alarm: pd_media_errors
       on: megacli.pd_media_error
    units: media errors
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive media errors
       to: sysadmin
 
-    alarm: pd_predictive_failures
+   alarm: pd_predictive_failures
       on: megacli.pd_predictive_failure
    units: predictive failures
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive predictive failures
       to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982b..e43cb16912 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
    every: 10s
    units: %
-   warn: $this > (($status >= $WARNING)?(200):(5000))
-   crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
-   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
-     to: sysadmin
+    warn: $this > (($status >= $WARNING)?(200):(5000))
+    crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+      to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239d..b255d35f90 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
 
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
 
 template: pihole_last_collected_secs
       on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
     info: number of seconds since the last successful data collection
       to: webmaster
 
-  # Blocked DNS queries.
+# Blocked DNS queries.
 
- template: pihole_blocked_queries
-       on: pihole.dns_queries_percentage
-    every: 10s
-    units: %
-     calc: $blocked
-     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-     crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
-    delay: up 2m down 5m
-     info: percentage of blocked dns queries for the last 24 hour
-       to: sysadmin
-
-
-  # Blocklist last update time.
-  # Default update interval is a week.
+template: pihole_blocked_queries
+      on: pihole.dns_queries_percentage
+   every: 10s
+   units: %
+    calc: $blocked
+    warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+   delay: up 2m down 5m
+    info: percentage of blocked dns queries for the last 24 hour
+      to: sysadmin
 
- template: pihole_blocklist_last_update
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: seconds
-     calc: $ago
-     warn: $this > 60 * 60 * 24 * 8
-     crit: $this > 60 * 60 * 24 * 8 * 2
-     info: blocklist last update time
-       to: sysadmin
 
+# Blocklist last update time.
+# Default update interval is a week.
 
-  # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: seconds
+    calc: $ago
+    warn: $this > 60 * 60 * 24 * 8
+    crit: $this > 60 * 60 * 24 * 8 * 2
+    info: blocklist last update time
+      to: sysadmin
 
- template: pihole_blocklist_gravity_file
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: boolean
-     calc: $file_exists
-     crit: $this != 1
-    delay: up 2m down 5m
-     info: gravity file existence
-       to: sysadmin
+# Gravity file check (gravity.list).
 
+template: pihole_blocklist_gravity_file
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: boolean
+    calc: $file_exists
+    crit: $this != 1
+   delay: up 2m down 5m
+    info: gravity file existence
+      to: sysadmin
 
-  # Pi-hole's ability to block unwanted domains.
-  # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
 
- template: pihole_status
-       on: pihole.unwanted_domains_blocking_status
-    every: 10s
-    units: boolean
-     calc: $enabled
-     warn: $this != 1
-    delay: up 2m down 5m
-     info: unwanted domains blocking status
-       to: sysadmin
+template: pihole_status
+      on: pihole.unwanted_domains_blocking_status
+   every: 10s
+   units: boolean
+    calc: $enabled
+    warn: $this != 1
+   delay: up 2m down 5m
+    info: unwanted domains blocking status
+      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496d..15e8e84640 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
       to: sysadmin
 
 ## FreeBSD
-alarm: ram_in_use
-   on: system.ram
-   os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING)  ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
-   to: sysadmin
+   alarm: ram_in_use
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: system RAM usage
+      to: sysadmin
 
- alarm: ram_available
-    on: system.ram
-    os: freebsd
- hosts: *
-  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
-  warn: $this < (($status >= $WARNING)  ? (15) : (10))
-  crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
-  info: estimated amount of RAM available for userspace processes, without causing swapping
-    to: sysadmin
+   alarm: ram_available
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+   units: %
+   every: 10s
+    warn: $this < (($status >= $WARNING)  ? (15) : (10))
+    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated amount of RAM available for userspace processes, without causing swapping
+      to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd6..ff36486261 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
 
 # check for common /proc/net/softnet_stat errors
 
-   alarm: 10min_netdev_backlog_exceeded
+   alarm: 1min_netdev_backlog_exceeded
      on: system.softnet_stat
      os: linux
   hosts: *
-  lookup: sum -10m unaligned absolute of dropped
+  lookup: average -1m unaligned absolute of dropped
   units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
   delay: down 1h multiplier 1.5 max 2h
-    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+    info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
      to: sysadmin
 
-   alarm: 10min_netdev_budget_ran_outs
+   alarm: 1min_netdev_budget_ran_outs
       on: system.softnet_stat
       os: linux
    hosts: *
-  lookup: sum -10m unaligned absolute of squeezed
+  lookup: average -1m unaligned absolute of squeezed
    units: events
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (0) : (10))
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+    info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
       to: silent
 
    alarm: 10min_netisr_backlog_exceeded
       on: system.softnet_stat
       os: freebsd
    hosts: *
-   lookup: sum -10m unaligned absolute of qdrops
+  lookup: average -1m unaligned absolute of qdrops
    units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >+ $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+    info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 5ac1aa8e00..3b30725772 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenOverflows
+  lookup: average -60s unaligned absolute of ListenOverflows
    units: overflows
    every: 10s
-    crit: $this > 0
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+    info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
       to: sysadmin
 
 # THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenDrops
+  lookup: average -60s unaligned absolute of ListenDrops
    units: drops
    every: 10s
-#    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (150))
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+    info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
       to: sysadmin
 
 
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f56..1e47b5c8bf 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
       on: ipv4.udperrors
       os: linux freebsd
    hosts: *
-  lookup: sum -1m unaligned absolute of RcvbufErrors
+  lookup: average -1m unaligned absolute of RcvbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
-    info: number of UDP receive buffer errors during the last minute
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    info: average number of UDP receive buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
 
@@ -39,11 +39,11 @@
       on: ipv4.udperrors
       os: linux
    hosts: *
-  lookup: sum -1m unaligned absolute of SndbufErrors
+  lookup: average -1m unaligned absolute of SndbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
-    info: number of UDP send buffer errors during the last minute
+    info: average number of UDP send buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin