summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ilya Mashchenko <ilya@netdata.cloud> 2021-03-16 18:30:22 +0300
committer GitHub <noreply@github.com> 2021-03-16 18:30:22 +0300
commit e186b45a0159ee72e5a7075e53b53151f2bc8373 (patch)
tree 106a78d7015c6e906d10748d9de58ac88cee7d97
parent d9354c86b51bdeb9b6341660ad05a10b70a2640d (diff)
health/: fix various alarms critical and warning thresholds hysteresis (#10779)
-rw-r--r-- health/health.d/bcache.conf 2
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 2
-rw-r--r-- health/health.d/net.conf 2
-rw-r--r-- health/health.d/pihole.conf 2
-rw-r--r-- health/health.d/softnet.conf 2
-rw-r--r-- health/health.d/tcp_conn.conf 2
-rw-r--r-- health/health.d/tcp_mem.conf 2
-rw-r--r-- health/health.d/tcp_orphans.conf 2
8 files changed, 8 insertions, 8 deletions
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index cbaf18e8ae..f737129bf6 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -15,7 +15,7 @@ template: bcache_cache_dirty
units: %
every: 1m
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: up 1m down 1h multiplier 1.5 max 2h
info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index ecf3b84a82..51f7ec98d7 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization
units: %
calc: $used
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
info: dhcp-range utilization above threshold!
to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 85e194bb98..29556befc8 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -179,7 +179,7 @@ families: *
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
+ crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index b255d35f90..20b9322302 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -20,7 +20,7 @@ template: pihole_blocked_queries
units: %
calc: $blocked
warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
delay: up 2m down 5m
info: percentage of blocked dns queries for the last 24 hour
to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index f835f2aee1..fb2a1f5c0b 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -34,7 +34,7 @@
lookup: average -1m unaligned absolute of qdrops
units: packets
every: 10s
- warn: $this > (($status >+ $WARNING) ? (0) : (10))
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a98001..7dee6aaf7f 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -13,7 +13,7 @@
units: %
every: 10s
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
- crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+ crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: the percentage of IPv4 TCP connections over the max allowed
to: sysadmin
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d57652..71f316634b 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -14,7 +14,7 @@
units: %
every: 10s
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
- crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
+ crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: the amount of TCP memory as a percentage of its max memory limit
to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590fc..084e969d5a 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -15,7 +15,7 @@
units: %
every: 10s
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
- crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+ crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
to: sysadmin