health/: fix the hysteresis of critical and warning thresholds in various alarms (#10779)

author: Ilya Mashchenko <ilya@netdata.cloud> 2021-03-16 18:30:22 +0300
committer: GitHub <noreply@github.com> 2021-03-16 18:30:22 +0300
commit: e186b45a0159ee72e5a7075e53b53151f2bc8373 (patch)
tree: 106a78d7015c6e906d10748d9de58ac88cee7d97
parent: d9354c86b51bdeb9b6341660ad05a10b70a2640d (diff)
8 files changed, 8 insertions, 8 deletions
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index cbaf18e8ae..f737129bf6 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -15,7 +15,7 @@ template: bcache_cache_dirty
    units: %
    every: 1m
     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
    delay: up 1m down 1h multiplier 1.5 max 2h
     info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
       to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index ecf3b84a82..51f7ec98d7 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization
    units: %
     calc: $used
     warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
    delay: down 5m
     info: dhcp-range utilization above threshold!
       to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 85e194bb98..29556befc8 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -179,7 +179,7 @@ families: *
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(5000))
-    crit: $this > (($status >= $WARNING)?(5000):(6000))
+    crit: $this > (($status == $CRITICAL)?(5000):(6000))
  options: no-clear-notification
     info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
       to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index b255d35f90..20b9322302 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -20,7 +20,7 @@ template: pihole_blocked_queries
    units: %
     calc: $blocked
     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
    delay: up 2m down 5m
     info: percentage of blocked dns queries for the last 24 hour
       to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index f835f2aee1..fb2a1f5c0b 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -34,7 +34,7 @@
   lookup: average -1m unaligned absolute of qdrops
    units: packets
    every: 10s
-    warn: $this > (($status >+ $WARNING) ? (0) : (10))
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
     info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a98001..7dee6aaf7f 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -13,7 +13,7 @@
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+    crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
     info: the percentage of IPv4 TCP connections over the max allowed
       to: sysadmin
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d57652..71f316634b 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -14,7 +14,7 @@
    units: %
    every: 10s
     warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
-    crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+    crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
     info: the amount of TCP memory as a percentage of its max memory limit
       to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590fc..084e969d5a 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -15,7 +15,7 @@
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+    crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
     info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
       to: sysadmin
author	Ilya Mashchenko <ilya@netdata.cloud>	2021-03-16 18:30:22 +0300
committer	GitHub <noreply@github.com>	2021-03-16 18:30:22 +0300
commit	e186b45a0159ee72e5a7075e53b53151f2bc8373 (patch)
tree	106a78d7015c6e906d10748d9de58ac88cee7d97
parent	d9354c86b51bdeb9b6341660ad05a10b70a2640d (diff)