diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2024-03-05 22:37:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 22:37:12 +0200 |
commit | 746ebfdbd20045a6e72057736d57820caed73e5b (patch) | |
tree | c08d04d4c817800fc5ea247ac0c38fffe375e2bd | |
parent | 291004e39ba61d34f3c89f2ef30143e880a0ee78 (diff) |
remove "os" "hosts" "plugin" and "module" from stock alarms (#17113)
35 files changed, 1124 insertions, 1306 deletions
diff --git a/src/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf index 90a72af192..5fd7aa1126 100644 --- a/src/health/health.d/apcupsd.conf +++ b/src/health/health.d/apcupsd.conf @@ -5,8 +5,6 @@ class: Utilization type: Power Supply component: UPS - os: * - hosts: * lookup: average -10m unaligned of percentage units: % every: 1m @@ -23,8 +21,6 @@ component: UPS class: Errors type: Power Supply component: UPS - os: * - hosts: * lookup: average -60s unaligned of charge units: % every: 60s diff --git a/src/health/health.d/boinc.conf b/src/health/health.d/boinc.conf index 092a568450..6fd987de19 100644 --- a/src/health/health.d/boinc.conf +++ b/src/health/health.d/boinc.conf @@ -1,4 +1,4 @@ -# Alarms for various BOINC issues. +# you can disable an alarm notification by setting the 'to' line to: silent # Warn on any compute errors encountered. template: boinc_compute_errors @@ -6,8 +6,6 @@ class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of comperror units: tasks every: 1m @@ -23,8 +21,6 @@ component: BOINC class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of upload_failed units: tasks every: 1m @@ -40,8 +36,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of total units: tasks every: 1m @@ -57,8 +51,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of active calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) units: tasks diff --git a/src/health/health.d/btrfs.conf b/src/health/health.d/btrfs.conf index 1557a59410..f43f600c05 100644 --- a/src/health/health.d/btrfs.conf +++ b/src/health/health.d/btrfs.conf @@ -1,11 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent template: btrfs_allocated on: btrfs.disk class: Utilization type: System component: File system - os: * - hosts: * calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s @@ -20,8 +19,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -37,8 +34,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: ($used + $reserved) * 100 / ($used + $free + $reserved) units: % every: 10s @@ -54,8 +49,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -71,8 +64,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of read_errs warn: $this > 0 @@ -86,8 +77,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of write_errs crit: $this > 0 @@ -101,8 +90,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of flush_errs crit: $this > 0 @@ -116,8 +103,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of corruption_errs warn: $this > 0 @@ -131,8 +116,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of generation_errs warn: $this > 0 diff --git a/src/health/health.d/cgroups.conf b/src/health/health.d/cgroups.conf index 9c55633efb..52ca026242 100644 --- a/src/health/health.d/cgroups.conf +++ b/src/health/health.d/cgroups.conf @@ -1,72 +1,67 @@ - # you can disable an alarm notification by setting the 'to' line to: silent - template: cgroup_10min_cpu_usage - on: cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} CPU utilization - info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes - to: silent + template: cgroup_10min_cpu_usage + on: cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} CPU utilization + info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes + to: silent - template: cgroup_ram_in_use - on: cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} memory utilization - info: Cgroup ${label:cgroup_name} memory utilization - to: silent + template: cgroup_ram_in_use + on: cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} memory utilization + info: Cgroup ${label:cgroup_name} memory utilization + to: silent # ---------------------------------K8s containers-------------------------------------------- - template: k8s_cgroup_10min_cpu_usage - on: k8s.cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization - info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - average CPU utilization over the last 10 minutes - to: silent + template: k8s_cgroup_10min_cpu_usage + on: k8s.cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization + info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent - template: k8s_cgroup_ram_in_use - on: k8s.cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization - info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - memory utilization - to: silent + template: k8s_cgroup_ram_in_use + on: k8s.cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent diff --git a/src/health/health.d/cpu.conf b/src/health/health.d/cpu.conf index 0b007d6b4b..a3a05855a3 100644 --- a/src/health/health.d/cpu.conf +++ b/src/health/health.d/cpu.conf @@ -1,69 +1,65 @@ # you can disable an alarm notification by setting the 'to' line to: silent - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of user,system,softirq,irq,guest - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) - to: silent + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) + to: silent - template: 10min_cpu_iowait - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of iowait - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (20) : (40)) - delay: up 30m down 30m multiplier 1.5 max 2h - summary: System CPU iowait time - info: Average CPU iowait time over the last 10 minutes - to: silent + template: 10min_cpu_iowait + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of iowait + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (40)) + delay: up 30m down 30m multiplier 1.5 max 2h + summary: System CPU iowait time + info: Average CPU iowait time over the last 10 minutes + to: silent - template: 20min_steal_cpu - on: system.cpu - class: Latency - type: System -component: CPU - os: linux - hosts: * - lookup: average -20m unaligned of steal - units: % - every: 5m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System CPU steal time - info: Average CPU steal time over the last 20 minutes - to: silent + template: 20min_steal_cpu + on: system.cpu + class: Latency + type: System + component: CPU +host labels: _os=linux + lookup: average -20m unaligned of steal + units: % + every: 5m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System CPU steal time + info: Average CPU steal time over the last 20 minutes + to: silent ## FreeBSD - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: freebsd - hosts: * - lookup: average -10m unaligned of user,system,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding nice) - to: silent + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=freebsd + lookup: average -10m unaligned of user,system,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding nice) + to: silent diff --git a/src/health/health.d/dbengine.conf b/src/health/health.d/dbengine.conf index 0a70d2e8f9..5585a95335 100644 --- a/src/health/health.d/dbengine.conf +++ b/src/health/health.d/dbengine.conf @@ -1,4 +1,3 @@ - # you can disable an alarm notification by setting the 'to' line to: silent alarm: 10min_dbengine_global_fs_errors @@ -6,8 +5,6 @@ class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of fs_errors units: errors every: 10s @@ -22,8 +19,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of io_errors units: errors every: 10s @@ -38,8 +33,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors every: 10s @@ -55,8 +48,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions units: pages every: 10s diff --git a/src/health/health.d/disks.conf b/src/health/health.d/disks.conf index 2e417fd4a3..fe96837fbc 100644 --- a/src/health/health.d/disks.conf +++ b/src/health/health.d/disks.conf @@ -1,7 +1,5 @@ - # you can disable an alarm notification by setting the 'to' line to: silent - # ----------------------------------------------------------------------------- # low disk space @@ -9,41 +7,39 @@ # raise an alarm if the disk is low on # available disk space - template: disk_space_usage - on: disk.space - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * + template: disk_space_usage + on: disk.space + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? (80) : (90)) - crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} space usage - info: Total space utilization of disk ${label:mount_point} - to: sysadmin - - template: disk_inode_usage - on: disk.inodes - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} space usage + info: Total space utilization of disk ${label:mount_point} + to: sysadmin + + template: disk_inode_usage + on: disk.inodes + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} inode usage - info: Total inode utilization of disk ${label:mount_point} - to: sysadmin + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} inode usage + info: Total inode utilization of disk ${label:mount_point} + to: sysadmin # ----------------------------------------------------------------------------- @@ -57,33 +53,30 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour + template: disk_fill_rate + on: disk.space +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining -# if the disk continues to fill -# in this rate - -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of space - info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour - to: silent +# if the disk continues to fill in this rate + + template: out_of_disk_space_time + on: disk.space +host labels: _os=linux freebsd + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of space + info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour + to: silent # ----------------------------------------------------------------------------- @@ -97,33 +90,31 @@ template: out_of_disk_space_time # we will use it in the next template to find # the hours remaining -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour + template: disk_inode_rate + on: disk.inodes +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour # calculate the hours remaining # if the disk inodes are allocated # in this rate -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of inodes - info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: silent + template: out_of_disk_inodes_time + on: disk.inodes +host labels: _os=linux freebsd + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of inodes + info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: silent # ----------------------------------------------------------------------------- @@ -133,21 +124,20 @@ template: out_of_disk_inodes_time # by calculating the average disk utilization # for the last 10 minutes - template: 10min_disk_utilization - on: disk.util - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} utilization - info: Average percentage of time ${label:device} disk was busy over the last 10 minutes - to: silent + template: 10min_disk_utilization + on: disk.util + class: Utilization + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} utilization + info: Average percentage of time ${label:device} disk was busy over the last 10 minutes + to: silent # raise an alarm if the disk backlog @@ -155,18 +145,17 @@ component: Disk # for 10 minutes # (i.e. the disk cannot catch up) - template: 10min_disk_backlog - on: disk.backlog - class: Latency - type: System -component: Disk - os: linux - hosts: * - lookup: average -10m unaligned - units: ms - every: 1m - warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} backlog - info: Average backlog size of the ${label:device} disk over the last 10 minutes - to: silent + template: 10min_disk_backlog + on: disk.backlog + class: Latency + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: ms + every: 1m + warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} backlog + info: Average backlog size of the ${label:device} disk over the last 10 minutes + to: silent diff --git a/src/health/health.d/entropy.conf b/src/hea |