From e186b45a0159ee72e5a7075e53b53151f2bc8373 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Tue, 16 Mar 2021 18:30:22 +0300 Subject: health/: fix various alarms critical and warning thresholds hysteresis (#10779) --- health/health.d/bcache.conf | 2 +- health/health.d/dnsmasq_dhcp.conf | 2 +- health/health.d/net.conf | 2 +- health/health.d/pihole.conf | 2 +- health/health.d/softnet.conf | 2 +- health/health.d/tcp_conn.conf | 2 +- health/health.d/tcp_mem.conf | 2 +- health/health.d/tcp_orphans.conf | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index cbaf18e8ae..f737129bf6 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -15,7 +15,7 @@ template: bcache_cache_dirty units: % every: 1m warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: up 1m down 1h multiplier 1.5 max 2h info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small) to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index ecf3b84a82..51f7ec98d7 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization units: % calc: $used warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: down 5m info: dhcp-range utilization above threshold! to: sysadmin diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 85e194bb98..29556befc8 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -179,7 +179,7 @@ families: * every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status >= $WARNING)?(5000):(6000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index b255d35f90..20b9322302 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -20,7 +20,7 @@ template: pihole_blocked_queries units: % calc: $blocked warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) ) delay: up 2m down 5m info: percentage of blocked dns queries for the last 24 hour to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index f835f2aee1..fb2a1f5c0b 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -34,7 +34,7 @@ lookup: average -1m unaligned absolute of qdrops units: packets every: 10s - warn: $this > (($status >+ $WARNING) ? (0) : (10)) + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index 7aa9a98001..7dee6aaf7f 100644 --- a/health/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf @@ -13,7 +13,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) + crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) delay: up 0 down 5m multiplier 1.5 max 1h info: the percentage of IPv4 TCP connections over the max allowed to: sysadmin diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 6927d57652..71f316634b 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -14,7 +14,7 @@ units: % every: 10s warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) delay: up 0 down 5m multiplier 1.5 max 1h info: the amount of TCP memory as a percentage of its max memory limit to: sysadmin diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 280d6590fc..084e969d5a 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -15,7 +15,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) + crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 )) delay: up 0 down 5m multiplier 1.5 max 1h info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) to: sysadmin -- cgit v1.2.3