summary | refs | log | tree | commit | diff | stats
path: root/conf.d/health.d
diff options
context:
space:
mode:
author Costa Tsaousis (ktsaou) <costa@tsaousis.gr> 2017-01-22 02:01:47 +0200
committer Costa Tsaousis (ktsaou) <costa@tsaousis.gr> 2017-01-22 02:01:47 +0200
commit 35fd21781a71ad173ce3ddc757f49c57b8ad448d (patch)
tree f2f44fbea9bc937dc5c80cfe93974825be17f627 /conf.d/health.d
parent c516b0aec2fd3d107a40b410ca2ef4242f9ebbdf (diff)
alarms reworked to lower CPU pressure on large systems
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- conf.d/health.d/cpu.conf 2
-rw-r--r-- conf.d/health.d/entropy.conf 8
-rw-r--r-- conf.d/health.d/net.conf 48
-rw-r--r-- conf.d/health.d/softnet.conf 16
4 files changed, 37 insertions, 37 deletions
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 2b04c794d0..60f494d70c 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -28,6 +28,6 @@ template: 20min_steal_cpu
every: 5m
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: down 15m multiplier 1.5 max 1h
+ delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time for the last 20 minutes
to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index d0eca8a6c8..5dd8af502e 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -3,12 +3,12 @@
# the alarm is checked every 1 minute
# and examines the last hour of data
- alarm: 1hour_lowest_entropy
+ alarm: lowest_entropy
on: system.entropy
- lookup: min -1h unaligned
+ lookup: min -10m unaligned
units: entries
every: 5m
warn: $this < (($status >= $WARNING) ? (200) : (100))
- delay: down 1h multiplier 1.5 max 1h
- info: minimum entries in the random numbers pool in the last 30 minutes
+ delay: down 1h multiplier 1.5 max 2h
+ info: minimum entries in the random numbers pool in the last 10 minutes
to: silent
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index 11f7c43e7b..924acccc3d 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -19,54 +19,54 @@ families: *
# check if an interface is dropping packets
# the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
-template: 1hour_packet_drops_inbound
+template: inbound_packets_dropped
on: net.drops
families: *
- lookup: sum -1h unaligned absolute of inbound
+ lookup: sum -10m unaligned absolute of inbound
units: packets
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: interface inbound dropped packets in the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound dropped packets in the last 10 minutes
to: sysadmin
-template: 1hour_packet_drops_outbound
+template: outbound_packets_dropped
on: net.drops
families: *
- lookup: sum -1h unaligned absolute of outbound
+ lookup: sum -10m unaligned absolute of outbound
units: packets
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: interface outbound dropped packets in the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound dropped packets in the last 10 minutes
to: sysadmin
-template: 1hour_packet_drops_ratio_inbound
+template: inbound_packets_dropped_ratio
on: net.packets
families: *
- lookup: sum -1h unaligned absolute of received
- calc: (($1hour_packet_drops_inbound != nan AND $this > 0) ? ($1hour_packet_drops_inbound * 100 / $this) : (0))
+ lookup: sum -10m unaligned absolute of received
+ calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this > 0.5
crit: $this > 3
- delay: down 30m multiplier 1.5 max 1h
- info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
to: sysadmin
-template: 1hour_packet_drops_ratio_outbound
+template: outbound_packets_dropped_ratio
on: net.packets
families: *
- lookup: sum -1h unaligned absolute of sent
- calc: (($1hour_packet_drops_outbound != nan AND $this > 0) ? ($1hour_packet_drops_outbound * 100 / $this) : (0))
+ lookup: sum -10m unaligned absolute of sent
+ calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
warn: $this > 0.5
crit: $this > 3
- delay: down 30m multiplier 1.5 max 1h
- info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
to: sysadmin
@@ -76,17 +76,17 @@ families: *
# check if an interface is having FIFO
# buffer errors
# the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
-template: 1hour_fifo_errors
+template: 10min_fifo_errors
on: net.fifo
families: *
- lookup: sum -1h unaligned absolute
+ lookup: sum -10m unaligned absolute
units: errors
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: interface fifo errors in the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface fifo errors in the last 10 minutes
to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
index 420a45448a..5faf9a9ee5 100644
--- a/conf.d/health.d/softnet.conf
+++ b/conf.d/health.d/softnet.conf
@@ -1,21 +1,21 @@
# check for common /proc/net/softnet_stat errors
- alarm: 1hour_netdev_backlog_exceeded
+ alarm: 10min_netdev_backlog_exceeded
on: system.softnet_stat
- lookup: sum -1h unaligned absolute of dropped
+ lookup: sum -10m unaligned absolute of dropped
units: packets
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
- alarm: 1hour_netdev_budget_ran_outs
+ alarm: 10min_netdev_budget_ran_outs
on: system.softnet_stat
- lookup: sum -1h unaligned absolute of squeezed
+ lookup: sum -10m unaligned absolute of squeezed
units: events
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (10))
- delay: down 30m multiplier 1.5 max 1h
- info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
to: silent