summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
author: Austin S. Hemmelgarn <ahferroin7@gmail.com> 2019-11-15 13:14:43 -0500
committer: Ilya Mashchenko <ilya@netdata.cloud> 2019-11-15 21:14:43 +0300
commit: ef721290f03ec9d0292f062672feaad78d7fe3a9 (patch)
tree: 3a8e57a0117bfe166317dfcdd407ee861eb90ef1 /health
parent: ac90f27a580aad0c4d20c5354657b34e5ceed29e (diff)
Fine tune various alarm values. (#7322)
* Fix formatting in alarm configurations. This makes sure everything is lined up properly so that the alarm definitions are easier to read. * Make TCP Accept Queue alarms much less aggressive. This switches the alarms to use averages instead of sums, and adjusts the trip points to be more lenient, as both of these may be non-zero even in normal operation of a system. * Make softnet alarms less aggressive. This decreases the sampling window from 10 minutes to 1 minute, switches to using an average instead of a sum, and adjusts the trigger thresholds to be more lenient. This one will need to be watched, as the resultant values may be too lenient for some systems. * Tweak UDP alarms to work like the TCP alarms. Just to ensure consistency.
Diffstat (limited to 'health')
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 4
-rw-r--r-- health/health.d/megacli.conf 32
-rw-r--r-- health/health.d/net.conf 10
-rw-r--r-- health/health.d/pihole.conf 90
-rw-r--r-- health/health.d/ram.conf 48
-rw-r--r-- health/health.d/softnet.conf 28
-rw-r--r-- health/health.d/tcp_listen.conf 15
-rw-r--r-- health/health.d/udp_errors.conf 14
8 files changed, 120 insertions, 121 deletions
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3e..ecf3b84a82 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
on: dnsmasq_dhcp.dhcp_range_utilization
every: 10s
units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be14..73b87dcc03 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
alarm: adapter_state
on: megacli.adapter_degraded
units: is degraded
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
crit: $this > 0
info: adapter state
to: sysadmin
- template: bbu_relative_charge
+template: bbu_relative_charge
on: megacli.bbu_relative_charge
units: percent
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
info: BBU relative state of charge
to: sysadmin
- template: bbu_cycle_count
+template: bbu_cycle_count
on: megacli.bbu_cycle_count
units: cycle count
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this >= 100
crit: $this >= 500
info: BBU cycle count
to: sysadmin
- alarm: pd_media_errors
+ alarm: pd_media_errors
on: megacli.pd_media_error
units: media errors
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive media errors
to: sysadmin
- alarm: pd_predictive_failures
+ alarm: pd_predictive_failures
on: megacli.pd_predictive_failure
units: predictive failures
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive predictive failures
to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982b..e43cb16912 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
- to: sysadmin
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239d..b255d35f90 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
template: pihole_last_collected_secs
on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
info: number of seconds since the last successful data collection
to: webmaster
- # Blocked DNS queries.
+# Blocked DNS queries.
- template: pihole_blocked_queries
- on: pihole.dns_queries_percentage
- every: 10s
- units: %
- calc: $blocked
- warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
- delay: up 2m down 5m
- info: percentage of blocked dns queries for the last 24 hour
- to: sysadmin
-
-
- # Blocklist last update time.
- # Default update interval is a week.
+template: pihole_blocked_queries
+ on: pihole.dns_queries_percentage
+ every: 10s
+ units: %
+ calc: $blocked
+ warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ delay: up 2m down 5m
+ info: percentage of blocked dns queries for the last 24 hour
+ to: sysadmin
- template: pihole_blocklist_last_update
- on: pihole.blocklist_last_update
- every: 10s
- units: seconds
- calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
- info: blocklist last update time
- to: sysadmin
+# Blocklist last update time.
+# Default update interval is a week.
- # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 8
+ crit: $this > 60 * 60 * 24 * 8 * 2
+ info: blocklist last update time
+ to: sysadmin
- template: pihole_blocklist_gravity_file
- on: pihole.blocklist_last_update
- every: 10s
- units: boolean
- calc: $file_exists
- crit: $this != 1
- delay: up 2m down 5m
- info: gravity file existence
- to: sysadmin
+# Gravity file check (gravity.list).
+template: pihole_blocklist_gravity_file
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: boolean
+ calc: $file_exists
+ crit: $this != 1
+ delay: up 2m down 5m
+ info: gravity file existence
+ to: sysadmin
- # Pi-hole's ability to block unwanted domains.
- # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
- template: pihole_status
- on: pihole.unwanted_domains_blocking_status
- every: 10s
- units: boolean
- calc: $enabled
- warn: $this != 1
- delay: up 2m down 5m
- info: unwanted domains blocking status
- to: sysadmin
+template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ every: 10s
+ units: boolean
+ calc: $enabled
+ warn: $this != 1
+ delay: up 2m down 5m
+ info: unwanted domains blocking status
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496d..15e8e84640 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
to: sysadmin
## FreeBSD
-alarm: ram_in_use
- on: system.ram
- os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
- to: sysadmin
+ alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin
- alarm: ram_available
- on: system.ram
- os: freebsd
- hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd6..ff36486261 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
# check for common /proc/net/softnet_stat errors
- alarm: 10min_netdev_backlog_exceeded
+ alarm: 1min_netdev_backlog_exceeded
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of dropped
+ lookup: average -1m unaligned absolute of dropped
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
- alarm: 10min_netdev_budget_ran_outs
+ alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of squeezed
+ lookup: average -1m unaligned absolute of squeezed
units: events
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (10))
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+ info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
to: silent
alarm: 10min_netisr_backlog_exceeded
on: system.softnet_stat
os: freebsd
hosts: *
- lookup: sum -10m unaligned absolute of qdrops
+ lookup: average -1m unaligned absolute of qdrops
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+ info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 5ac1aa8e00..3b30725772 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenOverflows
+ lookup: average -60s unaligned absolute of ListenOverflows
units: overflows
every: 10s
- crit: $this > 0
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+ info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
to: sysadmin
# THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenDrops
+ lookup: average -60s unaligned absolute of ListenDrops
units: drops
every: 10s
-# warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (150))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f56..1e47b5c8bf 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
on: ipv4.udperrors
os: linux freebsd
hosts: *
- lookup: sum -1m unaligned absolute of RcvbufErrors
+ lookup: average -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
- info: number of UDP receive buffer errors during the last minute
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
+ info: average number of UDP receive buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
@@ -39,11 +39,11 @@
on: ipv4.udperrors
os: linux
hosts: *
- lookup: sum -1m unaligned absolute of SndbufErrors
+ lookup: average -1m unaligned absolute of SndbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
info: number of UDP send buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin