summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
Diffstat (limited to 'health')
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 4
-rw-r--r-- health/health.d/megacli.conf 32
-rw-r--r-- health/health.d/net.conf 10
-rw-r--r-- health/health.d/pihole.conf 90
-rw-r--r-- health/health.d/ram.conf 48
-rw-r--r-- health/health.d/softnet.conf 28
-rw-r--r-- health/health.d/tcp_listen.conf 15
-rw-r--r-- health/health.d/udp_errors.conf 14
8 files changed, 120 insertions, 121 deletions
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3e..ecf3b84a82 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
on: dnsmasq_dhcp.dhcp_range_utilization
every: 10s
units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be14..73b87dcc03 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
alarm: adapter_state
on: megacli.adapter_degraded
units: is degraded
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
crit: $this > 0
info: adapter state
to: sysadmin
- template: bbu_relative_charge
+template: bbu_relative_charge
on: megacli.bbu_relative_charge
units: percent
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
info: BBU relative state of charge
to: sysadmin
- template: bbu_cycle_count
+template: bbu_cycle_count
on: megacli.bbu_cycle_count
units: cycle count
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this >= 100
crit: $this >= 500
info: BBU cycle count
to: sysadmin
- alarm: pd_media_errors
+ alarm: pd_media_errors
on: megacli.pd_media_error
units: media errors
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive media errors
to: sysadmin
- alarm: pd_predictive_failures
+ alarm: pd_predictive_failures
on: megacli.pd_predictive_failure
units: predictive failures
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive predictive failures
to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982b..e43cb16912 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
- to: sysadmin
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239d..b255d35f90 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
template: pihole_last_collected_secs
on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
info: number of seconds since the last successful data collection
to: webmaster
- # Blocked DNS queries.
+# Blocked DNS queries.
- template: pihole_blocked_queries
- on: pihole.dns_queries_percentage
- every: 10s
- units: %
- calc: $blocked
- warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
- delay: up 2m down 5m
- info: percentage of blocked dns queries for the last 24 hour
- to: sysadmin
-
-
- # Blocklist last update time.
- # Default update interval is a week.
+template: pihole_blocked_queries
+ on: pihole.dns_queries_percentage
+ every: 10s
+ units: %
+ calc: $blocked
+ warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ delay: up 2m down 5m
+ info: percentage of blocked dns queries for the last 24 hour
+ to: sysadmin
- template: pihole_blocklist_last_update
- on: pihole.blocklist_last_update
- every: 10s
- units: seconds
- calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
- info: blocklist last update time
- to: sysadmin
+# Blocklist last update time.
+# Default update interval is a week.
- # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 8
+ crit: $this > 60 * 60 * 24 * 8 * 2
+ info: blocklist last update time
+ to: sysadmin
- template: pihole_blocklist_gravity_file
- on: pihole.blocklist_last_update
- every: 10s
- units: boolean
- calc: $file_exists
- crit: $this != 1
- delay: up 2m down 5m
- info: gravity file existence
- to: sysadmin
+# Gravity file check (gravity.list).
+template: pihole_blocklist_gravity_file
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: boolean
+ calc: $file_exists
+ crit: $this != 1
+ delay: up 2m down 5m
+ info: gravity file existence
+ to: sysadmin
- # Pi-hole's ability to block unwanted domains.
- # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
- template: pihole_status
- on: pihole.unwanted_domains_blocking_status
- every: 10s
- units: boolean
- calc: $enabled
- warn: $this != 1
- delay: up 2m down 5m
- info: unwanted domains blocking status
- to: sysadmin
+template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ every: 10s
+ units: boolean
+ calc: $enabled
+ warn: $this != 1
+ delay: up 2m down 5m
+ info: unwanted domains blocking status
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496d..15e8e84640 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
to: sysadmin
## FreeBSD
-alarm: ram_in_use
- on: system.ram
- os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
- to: sysadmin
+ alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin
- alarm: ram_available
- on: system.ram
- os: freebsd
- hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd6..ff36486261 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
# check for common /proc/net/softnet_stat errors
- alarm: 10min_netdev_backlog_exceeded
+ alarm: 1min_netdev_backlog_exceeded
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of dropped
+ lookup: average -1m unaligned absolute of dropped
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
- alarm: 10min_netdev_budget_ran_outs
+ alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of squeezed
+ lookup: average -1m unaligned absolute of squeezed
units: events
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (10))
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+ info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
to: silent
alarm: 10min_netisr_backlog_exceeded
on: system.softnet_stat
os: freebsd
hosts: *
- lookup: sum -10m unaligned absolute of qdrops
+ lookup: average -1m unaligned absolute of qdrops
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+ info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 5ac1aa8e00..3b30725772 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenOverflows
+ lookup: average -60s unaligned absolute of ListenOverflows
units: overflows
every: 10s
- crit: $this > 0
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+ info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
to: sysadmin
# THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenDrops
+ lookup: average -60s unaligned absolute of ListenDrops
units: drops
every: 10s
-# warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (150))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f56..1e47b5c8bf 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
on: ipv4.udperrors
os: linux freebsd
hosts: *
- lookup: sum -1m unaligned absolute of RcvbufErrors
+ lookup: average -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
- info: number of UDP receive buffer errors during the last minute
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
+ info: average number of UDP receive buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
@@ -39,11 +39,11 @@
on: ipv4.udperrors
os: linux
hosts: *
- lookup: sum -1m unaligned absolute of SndbufErrors
+ lookup: average -1m unaligned absolute of SndbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
info: number of UDP send buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin