summary | refs | log | tree | commit | diff | stats
path: root/health
diff options
context:
space:
mode:
author: Mateusz Bularz <60339703+M4itee@users.noreply.github.com> 2023-07-06 11:43:00 +0200
committer: GitHub <noreply@github.com> 2023-07-06 12:43:00 +0300
commit: 94ec32356117cf09ef6e4f79329dc00c911b8015 (patch)
tree: f7d1b67c7f044ecd0fc97f916b0d145ee0d32bb9 /health
parent: d50b0a7985a15e20d0c976472d3037bfce4914f9 (diff)
fix(alerting): removing some of criticals (#15124)
Diffstat (limited to 'health')
-rw-r--r-- health/health.d/apcupsd.conf 4
-rw-r--r-- health/health.d/bcache.conf 3
-rw-r--r-- health/health.d/beanstalkd.conf 3
-rw-r--r-- health/health.d/bind_rndc.conf 1
-rw-r--r-- health/health.d/boinc.conf 4
-rw-r--r-- health/health.d/cgroups.conf 3
-rw-r--r-- health/health.d/cpu.conf 2
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 1
-rw-r--r-- health/health.d/docker.conf 2
-rw-r--r-- health/health.d/elasticsearch.conf 2
-rw-r--r-- health/health.d/gearman.conf 1
-rw-r--r-- health/health.d/geth.conf 1
-rw-r--r-- health/health.d/ioping.conf 4
-rw-r--r-- health/health.d/ipc.conf 2
-rw-r--r-- health/health.d/linux_power_supply.conf 1
-rw-r--r-- health/health.d/nut.conf 4
-rw-r--r-- health/health.d/pihole.conf 3
-rw-r--r-- health/health.d/ram.conf 2
-rw-r--r-- health/health.d/scaleio.conf 4
-rw-r--r-- health/health.d/vcsa.conf 3
-rw-r--r-- health/health.d/windows.conf 16
21 files changed, 12 insertions, 54 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab9..7a0afcd18a 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -11,7 +11,6 @@ component: UPS
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
info: average UPS load over the last 10 minutes
to: sitemgr
@@ -29,7 +28,7 @@ component: UPS
units: %
every: 60s
warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
@@ -43,7 +42,6 @@ component: UPS device
every: 10s
units: seconds ago
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f6..3f92e80df5 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -22,8 +22,7 @@ component: Disk
calc: $dirty + $metadata + $undefined
units: %
every: 1m
- warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+ warn: $this > 75
delay: up 1m down 1h multiplier 1.5 max 2h
info: percentage of cache space used for dirty data and metadata \
(this usually means your SSD cache is too small)
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c1825..4ee8bc0bd6 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,8 +8,7 @@ component: Beanstalk
calc: $buried
units: jobs
every: 10s
- warn: $this > 0
- crit: $this > 10
+ warn: $this > 3
delay: up 0 down 5m multiplier 1.2 max 1h
info: number of buried jobs across all tubes. \
You need to manually kick them so they can be processed. \
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 7c09225ff6..b3e75a239c 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -7,6 +7,5 @@ component: BIND
every: 60
calc: $stats_size
warn: $this > 512
- crit: $this > 1024
info: BIND statistics-file size
to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 6f37787d71..b7dcbe3167 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of compute errors over the last 10 minutes
to: sysadmin
@@ -29,7 +28,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of failed uploads over the last 10 minutes
to: sysadmin
@@ -46,7 +44,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of total tasks over the last 10 minutes
to: sysadmin
@@ -64,7 +61,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of active tasks over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 08260ff6d9..f625e5455d 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -64,7 +64,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
@@ -83,7 +82,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
@@ -134,7 +132,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad69528253..907d6ff8a2 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -28,7 +28,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (20) : (40))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
info: average CPU iowait time over the last 10 minutes
to: sysadmin
@@ -44,7 +43,6 @@ component: CPU
units: %
every: 5m
warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time over the last 20 minutes
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b945992..81d37df64d 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -9,7 +9,6 @@ component: Dnsmasq
units: %
calc: $used
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
info: DHCP range utilization
to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
index f17028472e..01919dc0d7 100644
--- a/health/health.d/docker.conf
+++ b/health/health.d/docker.conf
@@ -6,6 +6,6 @@ component: Docker
units: status
every: 10s
lookup: average -10s of unhealthy
- crit: $this > 0
+ warn: $this > 0
info: ${label:container_name} docker container health status is unhealthy
to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index 47f8e1eb90..29f1e9b272 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -10,7 +10,7 @@ component: Elasticsearch
lookup: average -5s unaligned of *ed
every: 10s
units: status
- warn: $this == 1
+ crit: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: cluster health status is red.
to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index 14010d445b..580d114f82 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -8,7 +8,6 @@ component: Gearman
units: workers
every: 10s
warn: $this > 30000
- crit: $this > 100000
delay: down 5m multiplier 1.5 max 1h
info: average number of queued jobs over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
index dd1eb4701e..361b6b41f0 100644
--- a/health/health.d/geth.conf
+++ b/health/health.d/geth.conf
@@ -8,5 +8,4 @@ component: geth
calc: $chain_head_block - $chain_head_header
units: blocks
warn: $this != 0
- crit: $this > 5
delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 2786cbd625..47ac4453c9 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -6,10 +6,8 @@ component: Disk
lookup: average -10s unaligned of latency
units: microseconds
every: 10s
- green: 5000
- red: 10000
+ green: 10000
warn: $this > $green
- crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: average I/O latency over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index c178a410aa..3d1b46c02d 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -12,7 +12,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore utilization
to: sysadmin
@@ -28,7 +27,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore arrays utilization
to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index c0bc6de8a1..4562122ca5 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -9,7 +9,6 @@ component: Battery
units: %
every: 10s
warn: $this < 10
- crit: $this < 5
delay: up 30s down 5m multiplier 1.2 max 1h
info: percentage of remaining power supply capacity
to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
index 6231dd97b2..67843205c7 100644
--- a/health/health.d/nut.conf
+++ b/health/health.d/nut.conf
@@ -26,8 +26,8 @@ component: UPS
lookup: average -60s unaligned of battery_charge
units: %
every: 60s
- warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ warn: $this < 75
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index ee6c57cc57..045930ae59 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -10,8 +10,7 @@ component: Pi-hole
every: 10s
units: seconds
calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
+ warn: $this > 60 * 60 * 24 * 30
info: gravity.list (blocklist) file last update time
to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ab382c43b1..34e5431a8a 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -28,7 +28,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
@@ -74,7 +73,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index ab110bf07f..27a857fcdc 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -9,8 +9,8 @@ component: ScaleIO
calc: $used
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
delay: down 15m multiplier 1.5 max 1h
info: storage pool capacity utilization
to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index a9cc7ceeff..bff34cd39a 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -133,8 +133,7 @@ component: VMware vCenter
lookup: max -10s unaligned of software_packages
units: status
every: 10s
- warn: $this == 4
- crit: $this == 3
+ warn: ($this == 3) || ($this == 4)
delay: down 1m multiplier 1.5 max 1h
info: software updates availability status \
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index d4bc7639c6..b3b228e75c 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -36,22 +36,6 @@ component: Memory
info: memory utilization
to: sysadmin
- template: windows_swap_in_use
- on: windows.memory_swap_utilization
- class: Utilization
- type: Windows
-component: Memory
- os: linux
- hosts: *
- calc: ($used) * 100 / ($used + $available)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: swap memory utilization
- to: sysadmin
-
## Network