diff options
author | Costa Tsaousis (ktsaou) <costa@tsaousis.gr> | 2017-09-05 23:33:40 +0300 |
---|---|---|
committer | Costa Tsaousis (ktsaou) <costa@tsaousis.gr> | 2017-09-05 23:33:40 +0300 |
commit | 344cb4c893da7c52730d546749cd300481b93829 (patch) | |
tree | 70f1cb5c6f94667cbad1fe830541cd3f167344f2 /conf.d/health.d | |
parent | ed59e9b7f682e78b16bfd3697de5a5e8cacc4896 (diff) |
updated alarms to use O/S and hosts filtering based on simple patterns; #2608
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/cpu.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/disks.conf | 14 | ||||
-rw-r--r-- | conf.d/health.d/entropy.conf | 2 | ||||
-rw-r--r-- | conf.d/health.d/ipc.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/memory.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/net.conf | 16 | ||||
-rw-r--r-- | conf.d/health.d/netfilter.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/qos.conf | 4 | ||||
-rw-r--r-- | conf.d/health.d/ram.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/softnet.conf | 7 | ||||
-rw-r--r-- | conf.d/health.d/swap.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/tcp_resets.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/udp_errors.conf | 9 |
13 files changed, 107 insertions, 0 deletions
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 30a7140972..db6285561b 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + template: 10min_cpu_usage on: system.cpu + os: linux + hosts: * lookup: average -10m unaligned of user,system,softirq,irq,guest units: % every: 1m @@ -12,6 +16,8 @@ template: 10min_cpu_usage template: 10min_cpu_iowait on: system.cpu + os: linux + hosts: * lookup: average -10m unaligned of iowait units: % every: 1m @@ -23,6 +29,8 @@ template: 10min_cpu_iowait template: 20min_steal_cpu on: system.cpu + os: linux + hosts: * lookup: average -20m unaligned of steal units: % every: 5m diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index 9548f9ee01..66dc78df37 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -1,3 +1,7 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + # ----------------------------------------------------------------------------- # low disk space @@ -7,6 +11,8 @@ template: disk_space_usage on: disk.space + os: linux + hosts: * families: * calc: $used * 100 / ($avail + $used) units: % @@ -43,6 +49,8 @@ families: * template: disk_fill_rate on: disk.space + os: linux + hosts: * families: * lookup: min -10m at -50m unaligned of avail calc: ($this - $avail) / (($now - $after) / 3600) @@ -57,6 +65,8 @@ families: * template: out_of_disk_space_time on: disk.space + os: linux + hosts: * families: * calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) units: hours @@ -77,6 +87,8 @@ families: * template: 10min_disk_utilization on: disk.util + os: linux + hosts: * families: * lookup: average -10m unaligned units: % @@ -97,6 +109,8 @@ families: * template: 10min_disk_backlog on: disk.backlog + os: linux + hosts: * families: * lookup: average -10m unaligned units: ms diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index 5dd8af502e..66d44ec139 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -5,6 +5,8 @@ alarm: lowest_entropy on: system.entropy + os: linux + hosts: * lookup: min -10m unaligned units: entries every: 5m diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf index ee7c4badd9..03cf264d8c 100644 --- a/conf.d/health.d/ipc.conf +++ b/conf.d/health.d/ipc.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: semaphores_used on: system.ipc_semaphores + os: linux + hosts: * calc: $semaphores * 100 / $ipc.semaphores.max units: % every: 10s @@ -12,6 +16,8 @@ alarm: semaphore_arrays_used on: system.ipc_semaphore_arrays + os: linux + hosts: * calc: $arrays * 100 / $ipc.semaphores.arrays.max units: % every: 10s diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf index 3c904f6b1e..4a0e6e5222 100644 --- a/conf.d/health.d/memory.conf +++ b/conf.d/health.d/memory.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: 1hour_ecc_memory_correctable on: mem.ecc_ce + os: linux + hosts: * lookup: sum -10m unaligned units: errors every: 1m @@ -11,6 +15,8 @@ alarm: 1hour_ecc_memory_uncorrectable on: mem.ecc_ue + os: linux + hosts: * lookup: sum -10m unaligned units: errors every: 1m @@ -21,6 +27,8 @@ alarm: 1hour_memory_hw_corrupted on: mem.hwcorrupt + os: linux + hosts: * calc: $HardwareCorrupted units: MB every: 10s diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index 846ca9113d..00a1986121 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -1,4 +1,6 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- # dropped packets @@ -8,6 +10,8 @@ template: inbound_packets_dropped on: net.drops + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of inbound units: packets @@ -19,6 +23,8 @@ families: * template: outbound_packets_dropped on: net.drops + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of outbound units: packets @@ -30,6 +36,8 @@ families: * template: inbound_packets_dropped_ratio on: net.packets + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of received calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) @@ -43,6 +51,8 @@ families: * template: outbound_packets_dropped_ratio on: net.packets + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of sent calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) @@ -65,6 +75,8 @@ families: * template: 10min_fifo_errors on: net.fifo + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute units: errors @@ -86,6 +98,8 @@ families: * template: 1m_received_packets_rate on: net.packets + os: linux + hosts: * families: * lookup: average -1m of received units: packets @@ -94,6 +108,8 @@ families: * template: 10s_received_packets_storm on: net.packets + os: linux + hosts: * families: * lookup: average -10s of received calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf index 3dd6a67b34..fa1732b33d 100644 --- a/conf.d/health.d/netfilter.conf +++ b/conf.d/health.d/netfilter.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: netfilter_last_collected_secs on: netfilter.conntrack_sockets + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -12,6 +16,8 @@ alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets + os: linux + hosts: * lookup: max -10s unaligned of connections calc: $this * 100 / $netfilter.conntrack.max units: % diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf index 9e5939fdc1..7290d15ff8 100644 --- a/conf.d/health.d/qos.conf +++ b/conf.d/health.d/qos.conf @@ -1,10 +1,14 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + # check if a QoS class is dropping packets # the alarm is checked every 10 seconds # and examines the last minute of data #template: 10min_qos_packet_drops # on: tc.qos_dropped +# os: linux +# hosts: * # lookup: sum -10m unaligned absolute # every: 30s # warn: $this > 0 diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index b99e5e226c..8d0e8838d6 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -1,12 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: used_ram_to_ignore on: system.ram + os: linux + hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) every: 10s info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) alarm: ram_in_use on: system.ram + os: linux + hosts: * # calc: $used * 100 / ($used + $cached + $free) calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) units: % diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf index 5faf9a9ee5..64e1c67842 100644 --- a/conf.d/health.d/softnet.conf +++ b/conf.d/health.d/softnet.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # check for common /proc/net/softnet_stat errors alarm: 10min_netdev_backlog_exceeded on: system.softnet_stat + os: linux + hosts: * lookup: sum -10m unaligned absolute of dropped units: packets every: 1m @@ -12,6 +17,8 @@ alarm: 10min_netdev_budget_ran_outs on: system.softnet_stat + os: linux + hosts: * lookup: sum -10m unaligned absolute of squeezed units: events every: 1m diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 7f57560e24..830a9af956 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: 30min_ram_swapped_out on: system.swapio + os: linux + hosts: * lookup: sum -30m unaligned absolute of out # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) @@ -14,6 +18,8 @@ alarm: ram_in_swap on: system.swap + os: linux + hosts: * calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 10s @@ -25,6 +31,8 @@ alarm: used_swap on: system.swap + os: linux + hosts: * calc: $used * 100 / ( $used + $free ) units: % every: 10s diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index 803c88a81c..fec124ac72 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- alarm: ipv4_tcphandshake_last_collected_secs on: ipv4.tcphandshake + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -16,6 +21,8 @@ alarm: 1m_ipv4_tcp_resets_sent on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -1m at -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s @@ -23,6 +30,8 @@ alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s @@ -37,6 +46,8 @@ options: no-clear-notification alarm: 1m_ipv4_tcp_resets_received on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -1m at -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s @@ -44,6 +55,8 @@ options: no-clear-notification alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf index 98e955c02b..33338b83e5 100644 --- a/conf.d/health.d/udp_errors.conf +++ b/conf.d/health.d/udp_errors.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- alarm: ipv4_udperrors_last_collected_secs on: ipv4.udperrors + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -16,6 +21,8 @@ alarm: 1m_ipv4_udp_receive_buffer_errors on: ipv4.udperrors + os: linux + hosts: * lookup: sum -1m unaligned absolute of RcvbufErrors units: errors every: 10s @@ -30,6 +37,8 @@ alarm: 1m_ipv4_udp_send_buffer_errors on: ipv4.udperrors + os: linux + hosts: * lookup: sum -1m unaligned absolute of SndbufErrors units: errors every: 10s |