From dfc4915ffb73b5b12ce50bcf6c1981afa9928f0a Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Mon, 24 May 2021 14:36:12 +0300 Subject: health: make stocks alarms less sensitive (2) (#11153) --- health/health.d/disks.conf | 10 ++-------- health/health.d/net.conf | 4 ++-- health/health.d/ram.conf | 8 ++++---- health/health.d/web_log.conf | 4 ---- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 60f8faed99..0b88106181 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -145,10 +145,7 @@ component: Disk lookup: average -10m unaligned units: % every: 1m - green: 90 - red: 98 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average percentage of time $family disk was busy over the last 10 minutes to: silent @@ -170,10 +167,7 @@ component: Disk lookup: average -10m unaligned units: ms every: 1m - green: 2000 - red: 5000 - warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) - crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h info: average backlog size of the $family disk over the last 10 minutes to: silent diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 04219e163a..db480031ce 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -96,7 +96,7 @@ component: Network hosts: * families: !net* !wl* * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 2 @@ -130,7 +130,7 @@ component: Network hosts: * families: wl* lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m warn: $this >= 10 diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 0e3cc29faf..450c548171 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -50,12 +50,12 @@ component: Memory on: mem.oom_kill os: linux hosts: * - lookup: sum -1m unaligned + lookup: sum -30m unaligned units: kills - every: 10s + every: 5m warn: $this > 0 - delay: down 5m - info: number of out of memory kills in the last minute + delay: down 10m + info: number of out of memory kills in the last 30 minutes to: sysadmin ## FreeBSD diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 127c9a9c6a..c1237fa06d 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -66,7 +66,6 @@ component: Web log units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster @@ -82,7 +81,6 @@ component: Web log units: % every: 10s warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster @@ -335,7 +333,6 @@ component: Web log units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster @@ -351,7 +348,6 @@ component: Web log units: % every: 10s warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster -- cgit v1.2.3