summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud>, 2021-05-24 14:36:12 +0300
committer: GitHub <noreply@github.com>, 2021-05-24 14:36:12 +0300
commit: dfc4915ffb73b5b12ce50bcf6c1981afa9928f0a (patch)
tree: c62bae5b14fa741e48c6494393a2423a36ee2f54
parent: ca571a05f797009c87ae9b5819c1aab35c0aecb4 (diff)
health: make stocks alarms less sensitive (2) (#11153)
-rw-r--r-- health/health.d/disks.conf 10
-rw-r--r-- health/health.d/net.conf 4
-rw-r--r-- health/health.d/ram.conf 8
-rw-r--r-- health/health.d/web_log.conf 4
4 files changed, 8 insertions, 18 deletions
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 60f8faed99..0b88106181 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -145,10 +145,7 @@ component: Disk
lookup: average -10m unaligned
units: %
every: 1m
- green: 90
- red: 98
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average percentage of time $family disk was busy over the last 10 minutes
to: silent
@@ -170,10 +167,7 @@ component: Disk
lookup: average -10m unaligned
units: ms
every: 1m
- green: 2000
- red: 5000
- warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
- crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
info: average backlog size of the $family disk over the last 10 minutes
to: silent
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 04219e163a..db480031ce 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -96,7 +96,7 @@ component: Network
hosts: *
families: !net* !wl* *
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 2
@@ -130,7 +130,7 @@ component: Network
hosts: *
families: wl*
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this >= 10
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 0e3cc29faf..450c548171 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -50,12 +50,12 @@ component: Memory
on: mem.oom_kill
os: linux
hosts: *
- lookup: sum -1m unaligned
+ lookup: sum -30m unaligned
units: kills
- every: 10s
+ every: 5m
warn: $this > 0
- delay: down 5m
- info: number of out of memory kills in the last minute
+ delay: down 10m
+ info: number of out of memory kills in the last 30 minutes
to: sysadmin
## FreeBSD
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 127c9a9c6a..c1237fa06d 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -66,7 +66,6 @@ component: Web log
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
@@ -82,7 +81,6 @@ component: Web log
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
@@ -335,7 +333,6 @@ component: Web log
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
@@ -351,7 +348,6 @@ component: Web log
units: %
every: 10s
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster