fix(alerting): remove some critical alerts (#15124)

author: Mateusz Bularz <60339703+M4itee@users.noreply.github.com> 2023-07-06 11:43:00 +0200
committer: GitHub <noreply@github.com> 2023-07-06 12:43:00 +0300
commit: 94ec32356117cf09ef6e4f79329dc00c911b8015 (patch)
tree: f7d1b67c7f044ecd0fc97f916b0d145ee0d32bb9 /health
parent: d50b0a7985a15e20d0c976472d3037bfce4914f9 (diff)
21 files changed, 12 insertions, 54 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab9..7a0afcd18a 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -11,7 +11,6 @@ component: UPS
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS load over the last 10 minutes
        to: sitemgr
@@ -29,7 +28,7 @@ component: UPS
     units: %
     every: 60s
      warn: $this < 100
-     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+     crit: $this < 40
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS charge over the last minute
        to: sitemgr
@@ -43,7 +42,6 @@ component: UPS device
     every: 10s
     units: seconds ago
      warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
     delay: down 5m multiplier 1.5 max 1h
      info: number of seconds since the last successful data collection
        to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f6..3f92e80df5 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -22,8 +22,7 @@ component: Disk
      calc: $dirty + $metadata + $undefined
     units: %
     every: 1m
-     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
-     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+     warn: $this > 75
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: percentage of cache space used for dirty data and metadata \
            (this usually means your SSD cache is too small)
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c1825..4ee8bc0bd6 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,8 +8,7 @@ component: Beanstalk
      calc: $buried
     units: jobs
     every: 10s
-     warn: $this > 0
-     crit: $this > 10
+     warn: $this > 3
     delay: up 0 down 5m multiplier 1.2 max 1h
      info: number of buried jobs across all tubes. \
            You need to manually kick them so they can be processed. \
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 7c09225ff6..b3e75a239c 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -7,6 +7,5 @@ component: BIND
     every: 60
      calc: $stats_size
      warn: $this > 512
-     crit: $this > 1024
      info: BIND statistics-file size
        to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 6f37787d71..b7dcbe3167 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this > 0
-     crit: $this > 1
     delay: up 1m down 5m multiplier 1.5 max 1h
      info: average number of compute errors over the last 10 minutes
        to: sysadmin
@@ -29,7 +28,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this > 0
-     crit: $this > 1
     delay: up 1m down 5m multiplier 1.5 max 1h
      info: average number of failed uploads over the last 10 minutes
        to: sysadmin
@@ -46,7 +44,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this < 1
-     crit: $this < 0.1
     delay: up 5m down 10m multiplier 1.5 max 1h
      info: average number of total tasks over the last 10 minutes
        to: sysadmin
@@ -64,7 +61,6 @@ component: BOINC
     units: tasks
     every: 1m
      warn: $this < 1
-     crit: $this < 0.1
     delay: up 5m down 10m multiplier 1.5 max 1h
      info: average number of active tasks over the last 10 minutes
        to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 08260ff6d9..f625e5455d 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -64,7 +64,6 @@ component: Network
     every: 10s
     units: %
      warn: $this > (($status >= $WARNING)?(200):(5000))
-     crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
      info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
@@ -83,7 +82,6 @@ component: CPU
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average cgroup CPU utilization over the last 10 minutes
        to: sysadmin
@@ -134,7 +132,6 @@ component: Network
     every: 10s
     units: %
      warn: $this > (($status >= $WARNING)?(200):(5000))
-     crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
      info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad69528253..907d6ff8a2 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -28,7 +28,6 @@ component: CPU
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (20) : (40))
-     crit: $this > (($status == $CRITICAL) ? (40) : (50))
     delay: down 15m multiplier 1.5 max 1h
      info: average CPU iowait time over the last 10 minutes
        to: sysadmin
@@ -44,7 +43,6 @@ component: CPU
     units: %
     every: 5m
      warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-     crit: $this > (($status == $CRITICAL) ? (20) : (30))
     delay: down 1h multiplier 1.5 max 2h
      info: average CPU steal time over the last 20 minutes
        to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b945992..81d37df64d 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -9,7 +9,6 @@ component: Dnsmasq
     units: %
      calc: $used
      warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
-     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
     delay: down 5m
      info: DHCP range utilization
        to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
index f17028472e..01919dc0d7 100644
--- a/health/health.d/docker.conf
+++ b/health/health.d/docker.conf
@@ -6,6 +6,6 @@ component: Docker
     units: status
     every: 10s
    lookup: average -10s of unhealthy
-     crit: $this > 0
+     warn: $this > 0
      info: ${label:container_name} docker container health status is unhealthy
        to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index 47f8e1eb90..29f1e9b272 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -10,7 +10,7 @@ component: Elasticsearch
    lookup: average -5s unaligned of *ed
     every: 10s
     units: status
-     warn: $this == 1
+     crit: $this == 1
     delay: down 5m multiplier 1.5 max 1h
      info: cluster health status is red.
        to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index 14010d445b..580d114f82 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -8,7 +8,6 @@ component: Gearman
     units: workers
     every: 10s
      warn: $this > 30000
-     crit: $this > 100000
     delay: down 5m multiplier 1.5 max 1h
      info: average number of queued jobs over the last 10 minutes
        to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
index dd1eb4701e..361b6b41f0 100644
--- a/health/health.d/geth.conf
+++ b/health/health.d/geth.conf
@@ -8,5 +8,4 @@ component: geth
      calc: $chain_head_block -  $chain_head_header
     units: blocks
      warn: $this != 0
-     crit: $this > 5
     delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 2786cbd625..47ac4453c9 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -6,10 +6,8 @@ component: Disk
    lookup: average -10s unaligned of latency
     units: microseconds
     every: 10s
-    green: 5000
-      red: 10000
+    green: 10000
      warn: $this > $green
-     crit: $this > $red
     delay: down 30m multiplier 1.5 max 2h
      info: average I/O latency over the last 10 seconds
        to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index c178a410aa..3d1b46c02d 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -12,7 +12,6 @@ component: IPC
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (70) : (90))
     delay: down 5m multiplier 1.5 max 1h
      info: IPC semaphore utilization
        to: sysadmin
@@ -28,7 +27,6 @@ component: IPC
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
-     crit: $this > (($status == $CRITICAL) ? (70) : (90))
     delay: down 5m multiplier 1.5 max 1h
      info: IPC semaphore arrays utilization
        to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index c0bc6de8a1..4562122ca5 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -9,7 +9,6 @@ component: Battery
     units: %
     every: 10s
      warn: $this < 10
-     crit: $this < 5
     delay: up 30s down 5m multiplier 1.2 max 1h
      info: percentage of remaining power supply capacity
        to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
index 6231dd97b2..67843205c7 100644
--- a/health/health.d/nut.conf
+++ b/health/health.d/nut.conf
@@ -26,8 +26,8 @@ component: UPS
    lookup: average -60s unaligned of battery_charge
     units: %
     every: 60s
-     warn: $this < 100
-     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+     warn: $this < 75
+     crit: $this < 40
     delay: down 10m multiplier 1.5 max 1h
      info: average UPS charge over the last minute
        to: sitemgr
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index ee6c57cc57..045930ae59 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -10,8 +10,7 @@ component: Pi-hole
     every: 10s
     units: seconds
      calc: $ago
-     warn: $this > 60 * 60 * 24 * 8
-     crit: $this > 60 * 60 * 24 * 8 * 2
+     warn: $this > 60 * 60 * 24 * 30
      info: gravity.list (blocklist) file last update time
        to: sysadmin
 
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ab382c43b1..34e5431a8a 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -28,7 +28,6 @@ component: Memory
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
-     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
        to: sysadmin
@@ -74,7 +73,6 @@ component: Memory
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
-     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
        to: sysadmin
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index ab110bf07f..27a857fcdc 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -9,8 +9,8 @@ component: ScaleIO
      calc: $used
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (90))
-     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
     delay: down 15m multiplier 1.5 max 1h
      info: storage pool capacity utilization
        to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index a9cc7ceeff..bff34cd39a 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -133,8 +133,7 @@ component: VMware vCenter
    lookup: max -10s unaligned of software_packages
     units: status
     every: 10s
-     warn: $this == 4
-     crit: $this == 3
+     warn: ($this == 3) || ($this == 4)
     delay: down 1m multiplier 1.5 max 1h
      info: software updates availability status \
            (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index d4bc7639c6..b3b228e75c 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -36,22 +36,6 @@ component: Memory
      info: memory utilization
        to: sysadmin
 
- template: windows_swap_in_use
-       on: windows.memory_swap_utilization
-    class: Utilization
-     type: Windows
-component: Memory
-       os: linux
-    hosts: *
-     calc: ($used) * 100 / ($used + $available)
-    units: %
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (90))
-     crit: $this > (($status == $CRITICAL) ? (90) : (98))
-    delay: down 15m multiplier 1.5 max 1h
-     info: swap memory utilization
-       to: sysadmin
-
 
 ## Network
author	Mateusz Bularz <60339703+M4itee@users.noreply.github.com>	2023-07-06 11:43:00 +0200
committer	GitHub <noreply@github.com>	2023-07-06 12:43:00 +0300
commit	94ec32356117cf09ef6e4f79329dc00c911b8015 (patch)
tree	f7d1b67c7f044ecd0fc97f916b0d145ee0d32bb9 /health
parent	d50b0a7985a15e20d0c976472d3037bfce4914f9 (diff)