summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud> 2021-09-27 15:31:51 +0300
committer: GitHub <noreply@github.com> 2021-09-27 15:31:51 +0300
commit: 07946d9f0a8826c841db5754cfa6a59871ea1ef7 (patch)
tree: 2f30881c0666909026f2264c4149793c2f79d40e
parent: 44cf669e8849aeb4042fb8e97f0c0cdc3741c396 (diff)
Update alarms info (#11481)
* update apcupsd alarm's info
-rw-r--r--health/health.d/apcupsd.conf10
-rw-r--r--health/health.d/backend.conf13
-rw-r--r--health/health.d/bcache.conf11
-rw-r--r--health/health.d/beanstalkd.conf8
-rw-r--r--health/health.d/btrfs.conf21
-rw-r--r--health/health.d/ceph.conf4
-rw-r--r--health/health.d/cgroups.conf10
-rw-r--r--health/health.d/cockroachdb.conf21
-rw-r--r--health/health.d/cpu.conf20
-rw-r--r--health/health.d/dbengine.conf16
-rw-r--r--health/health.d/disks.conf15
-rw-r--r--health/health.d/dns_query.conf3
-rw-r--r--health/health.d/dnsmasq_dhcp.conf5
-rw-r--r--health/health.d/dockerd.conf5
-rw-r--r--health/health.d/entropy.conf5
-rw-r--r--health/health.d/exporting.conf12
-rw-r--r--health/health.d/fping.conf12
-rw-r--r--health/health.d/gearman.conf4
-rw-r--r--health/health.d/haproxy.conf6
-rw-r--r--health/health.d/hdfs.conf27
-rw-r--r--health/health.d/ioping.conf4
-rw-r--r--health/health.d/ipc.conf10
-rw-r--r--health/health.d/ipfs.conf3
-rw-r--r--health/health.d/ipmi.conf9
-rw-r--r--health/health.d/kubelet.conf24
-rw-r--r--health/health.d/linux_power_supply.conf5
-rw-r--r--health/health.d/load.conf18
-rw-r--r--health/health.d/mdstat.conf10
-rw-r--r--health/health.d/megacli.conf25
-rw-r--r--health/health.d/memcached.conf13
-rw-r--r--health/health.d/memory.conf17
-rw-r--r--health/health.d/mysql.conf40
-rw-r--r--health/health.d/net.conf44
-rw-r--r--health/health.d/netfilter.conf5
-rw-r--r--health/health.d/pihole.conf15
-rw-r--r--health/health.d/portcheck.conf8
-rw-r--r--health/health.d/processes.conf4
-rw-r--r--health/health.d/ram.conf28
-rw-r--r--health/health.d/redis.conf9
-rw-r--r--health/health.d/retroshare.conf3
-rw-r--r--health/health.d/riakkv.conf20
-rw-r--r--health/health.d/softnet.conf23
-rw-r--r--health/health.d/swap.conf11
-rw-r--r--health/health.d/synchronization.conf5
-rw-r--r--health/health.d/systemdunits.conf50
-rw-r--r--health/health.d/tcp_conn.conf4
-rw-r--r--health/health.d/tcp_listen.conf20
-rw-r--r--health/health.d/tcp_mem.conf6
-rw-r--r--health/health.d/tcp_orphans.conf5
-rw-r--r--health/health.d/tcp_resets.conf14
-rw-r--r--health/health.d/timex.conf4
-rw-r--r--health/health.d/udp_errors.conf10
-rw-r--r--health/health.d/unbound.conf10
-rw-r--r--health/health.d/vcsa.conf48
-rw-r--r--health/health.d/vernemq.conf22
-rw-r--r--health/health.d/whoisquery.conf5
-rw-r--r--health/health.d/wmi.conf24
-rw-r--r--health/health.d/x509check.conf7
58 files changed, 599 insertions, 211 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab9..bab4e1ae02 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -13,7 +13,11 @@ component: UPS
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
- info: average UPS load over the last 10 minutes
+ info: Average UPS load over the last 10 minutes. \
+ High UPS load. \
+ It may result in either your UPS transferring to bypass or shutting down \
+ as a self-protection and safety measure due to overload. \
+ You should remove some attached equipment from the UPS.
to: sitemgr
# Discussion in https://github.com/netdata/netdata/pull/3928:
@@ -31,7 +35,9 @@ component: UPS
warn: $this < 100
crit: $this < (($status == $CRITICAL) ? (60) : (50))
delay: down 10m multiplier 1.5 max 1h
- info: average UPS charge over the last minute
+ info: Average UPS charge over the last minute. \
+ The UPS is running on battery power. It will shut down if external power is not restored. \
+ You should prepare any attached equipment for the shutdown.
to: sitemgr
template: apcupsd_last_collected_secs
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 91d469395e..14fe255242 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -9,7 +9,8 @@ component: Exporting engine
every: 1m
warn: $this > 0
delay: down 5m multiplier 1.5 max 1h
- info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+ info: The backends subsystem is deprecated and will be removed soon. \
+ Migrate your configuration to exporting.conf.
to: sysadmin
# make sure we are sending data to backend
@@ -25,7 +26,10 @@ component: Exporting engine
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful buffering of backend data
+ info: Number of seconds since the last successful buffering of backend data. \
+ The backends subsystem failed to buffer metrics for a while. Some metrics are lost while exporting. \
+ It indicates that the backend destination is down or unreachable. \
+ Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
to: dba
alarm: backend_metrics_sent
@@ -38,5 +42,8 @@ component: Exporting engine
every: 10s
warn: $this != 100
delay: down 5m multiplier 1.5 max 1h
- info: percentage of metrics sent to the backend server
+ info: Percentage of metrics sent to the backend server. \
+ The backends subsystem failed to send all metrics. Some metrics are lost while exporting. \
+ It indicates that the backend destination is down or unreachable. \
+ Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f6..df030c497e 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -9,9 +9,9 @@ component: Disk
every: 1m
warn: $this > 0
delay: up 2m down 1h multiplier 1.5 max 2h
- info: number of times data was read from the cache, \
- the bucket was reused and invalidated in the last 10 minutes \
- (when this occurs the data is reread from the backing device)
+ info: Number of bcache read races in the last minute. \
+ The bucket was reused and invalidated while reading from the cache. \
+ When this occurs the data is reread from the backing device.
to: sysadmin
template: bcache_cache_dirty
@@ -25,6 +25,7 @@ component: Disk
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: up 1m down 1h multiplier 1.5 max 2h
- info: percentage of cache space used for dirty data and metadata \
- (this usually means your SSD cache is too small)
+ info: Percentage of cache space used for dirty data and metadata. \
+ High block cache utilization by dirty data and metadata. \
+ This usually means your SSD cache is too small.
to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c1825..35c7587c06 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -11,9 +11,11 @@ component: Beanstalk
warn: $this > 0
crit: $this > 10
delay: up 0 down 5m multiplier 1.2 max 1h
- info: number of buried jobs across all tubes. \
- You need to manually kick them so they can be processed. \
- Presence of buried jobs in a tube does not affect new jobs.
+ info: Number of buried jobs across all tubes. \
+ There are buried jobs. \
+ This usually happens when something goes wrong while a consumer is processing a job. \
+ The presence of buried jobs in a tube does not affect new jobs. \
+ You need to manually kick the jobs, so they can be processed.
to: sysadmin
# get the number of buried jobs per queue
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index 8d197aa8d2..44b5167ce8 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -13,7 +13,10 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95))
crit: $this > (($status == $CRITICAL) ? (95) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: percentage of allocated BTRFS physical disk space
+ info: Percentage of allocated Btrfs physical disk space. \
+ Most of the Btrfs physical disk space is allocated. \
+ To fix it, first, try running Btrfs balance. \
+ If that does not help, consider deleting snapshots or adding more physical space to the pool.
to: sysadmin
template: btrfs_data
@@ -30,7 +33,11 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: utilization of BTRFS data space
+ info: Percentage of used Btrfs data space. \
+ High Btrfs data space utilization. \
+ If there is enough unallocated disk space, the data space will be automatically increased. \
+ Otherwise, to fix, first try to run a balance. \
+ If that does not help, you should add more physical space to the pool.
to: sysadmin
template: btrfs_metadata
@@ -47,7 +54,11 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: utilization of BTRFS metadata space
+ info: Percentage of used Btrfs metadata space. \
+ High Btrfs metadata space utilization. \
+ If there is enough unallocated disk space, the metadata space will be automatically increased. \
+ Otherwise, you may wish to run a balance on metadata only if you find you have very large amounts of \
+ metadata space allocated, but unused.
to: sysadmin
template: btrfs_system
@@ -64,5 +75,7 @@ component: File system
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: utilization of BTRFS system space
+ info: Percentage of used Btrfs system space. \
+ High Btrfs system space utilization. \
+ If there is enough unallocated disk space, the system space will be automatically increased.
to: sysadmin
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index 1f9da25c75..227e8ecf8d 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -11,5 +11,7 @@ component: Ceph
warn: $this > (($status >= $WARNING ) ? (85) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 5m multiplier 1.2 max 1h
- info: cluster disk space utilization
+ info: Percentage of used cluster disk space. \
+ High disk space utilization. \
+ To fix this, consider adding a node or removing unneeded data from the cluster.
to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 45b34806ca..717c6084eb 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -14,7 +14,10 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average cgroup CPU utilization over the last 10 minutes
+ info: Average CPU utilization over the last 10 minutes. \
+ High cgroup CPU utilization. \
+ The system will throttle the cgroup CPU usage when the usage is over the limit. \
+ To fix, increase the cgroup CPU limit.
to: sysadmin
template: cgroup_ram_in_use
@@ -30,5 +33,8 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: cgroup memory utilization
+ info: Percentage of used memory. \
+ High cgroup memory utilization. \
+ OOM will kill some processes when the utilization reaches 100%. \
+ To fix, increase the cgroup memory limit (if set).
to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index 1f227841e6..de35b66aa0 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -12,7 +12,9 @@ component: CockroachDB
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: storage capacity utilization
+ info: Percentage of used storage space. \
+ High storage capacity utilization. \
+ To fix, increase the space available for CockroachDB data.
to: dba
template: cockroachdb_used_usable_storage_capacity
@@ -26,7 +28,9 @@ component: CockroachDB
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: storage usable space utilization
+ info: Percentage of used usable storage space. \
+ High usable storage capacity utilization. \
+ To fix, increase the space available for CockroachDB data.
to: dba
# Replication
@@ -41,7 +45,10 @@ component: CockroachDB
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of ranges with fewer live replicas than needed for quorum
+ info: Number of unavailable ranges. \
+ There are ranges with fewer live replicas than needed for quorum. \
+ If a majority of a range replicas are on nodes that are unavailable, \
+ then the entire range is unavailable and will be unable to process queries.
to: dba
template: cockroachdb_underreplicated_ranges
@@ -54,7 +61,9 @@ component: CockroachDB
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of ranges with fewer live replicas than the replication target
+ info: Number of under-replicated ranges. \
+ There are ranges with fewer live replicas than the replication target. \
+ As soon as other nodes become available, the ranges will be replicated to them until the desired replication factor is reached.
to: dba
# FD
@@ -69,5 +78,7 @@ component: CockroachDB
every: 10s
warn: $this > 80
delay: down 15m multiplier 1.5 max 1h
- info: open file descriptors utilization (against softlimit)
+ info: Percentage of used file descriptors. \
+ High file descriptors utilization (against softlimit). \
+ To fix, adjust the file descriptors limit for the process or system-wide.
to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad69528253..a24436e68c 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -14,7 +14,11 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
+ info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal). \
+ High system CPU utilization. \
+ A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
+ You can check the CPU PSI charts to see if there is CPU contention, and \
+ per-process CPU usage to find the top consumers.
to: sysadmin
template: 10min_cpu_iowait
@@ -30,7 +34,9 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (20) : (40))
crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
- info: average CPU iowait time over the last 10 minutes
+ info: Average CPU iowait time over the last 10 minutes. \
+ High system CPU iowait time. \
+ A constantly high value indicates that IO is a bottleneck, which can make the system run slower.
to: sysadmin
template: 20min_steal_cpu
@@ -46,7 +52,10 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
- info: average CPU steal time over the last 20 minutes
+ info: Average CPU steal time over the last 20 minutes. \
+ High system CPU steal time. \
+ A large amount of steal time indicates CPU contention on the host system, which can reduce guest performance. \
+ To fix, increase the guest CPU priority or CPU quota, or run fewer guests on the host.
to: sysadmin
## FreeBSD
@@ -63,5 +72,8 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average CPU utilization over the last 10 minutes (excluding nice)
+ info: Average CPU utilization over the last 10 minutes (excluding nice). \
+ High system CPU utilization. \
+ A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
+ You can check per-process CPU usage to find the top consumers.
to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 65c41b8462..d40cd82d99 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -13,7 +13,8 @@ component: DB engine
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
+ info: Number of filesystem errors in the last 10 minutes. \
+ Dbengine is experiencing filesystem errors (too many open files, wrong permissions, etc.).
to: sysadmin
alarm: 10min_dbengine_global_io_errors
@@ -28,7 +29,8 @@ component: DB engine
every: 10s
crit: $this > 0
delay: down 1h multiplier 1.5 max 3h
- info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
+ info: Number of IO errors in the last 10 minutes. \
+ Dbengine is experiencing I/O errors (CRC errors, out of space, bad disk, etc.).
to: sysadmin
alarm: 10min_dbengine_global_flushing_warnings
@@ -43,8 +45,9 @@ component: DB engine
every: 10s
warn: $this > 0
delay: down 1h multiplier 1.5 max 3h
- info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
- Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
+ info: Number of times when dbengine dirty pages were over 50% of the instance page cache in the last 10 minutes. \
+ Metric data is at risk of not being stored in the database. \
+ To remedy, reduce disk load or use faster disks.
to: sysadmin
alarm: 10min_dbengine_global_flushing_errors
@@ -59,6 +62,7 @@ component: DB engine
every: 10s
crit: $this != 0
delay: down 1h multiplier 1.5 max 3h
- info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
- Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
+ info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+ Some metric data was dropped to unblock data collection. \
+ To fix, reduce disk load or use faster disks.
to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 5daff61a14..c185232581 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -23,7 +23,10 @@ component: Disk
warn: $this > (($status >= $WARNING ) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: disk $family space utilization
+ info: Percentage of used space by disk $family. \
+ High disk space utilization. \
+ You may experience slowdowns and crashes if the disk is full. \
+ To fix, clean up your disk or upgrade it.
to: sysadmin
template: disk_inode_usage
@@ -40,7 +43,10 @@ component: Disk
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: disk $family inode utilization
+ info: Percentage of used inodes by disk $family. \
+ High disk inode utilization. \
+ The number of inodes indicates the number of files and folders you have. \
+ To fix, clear cache files or delete unnecessary files and folders.
to: sysadmin
@@ -147,7 +153,10 @@ component: Disk
every: 1m
warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
- info: average percentage of time $family disk was busy over the last 10 minutes
+ info: Average percentage of time $family disk was busy over the last 10 minutes. \
+ High disk load. \
+ The disk spent most of the time servicing read or write requests. \
+ If the disk controller processes the operations in parallel, the alarm does not necessarily indicate a high load.
to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index ec4937c0a8..d25844c5e8 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -11,5 +11,6 @@ component: DNS
every: 10s
warn: $this == nan
delay: up 20s down 5m multiplier 1.5 max 1h
- info: average DNS query round trip time over the last 10 seconds
+ info: Average DNS query round trip time over the last 10 seconds. \
+ Failed to query the DNS server.
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b945992..37f6e307b5 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -11,5 +11,8 @@ component: Dnsmasq
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
- info: DHCP range utilization
+ info: Percentage of leased IP addresses. \
+ High DHCP range utilization. \
+ The number of DHCP addresses in use is close to the total number of provisioned DHCP addresses. \
+ To fix, increase the number of IP addresses on a subnet.
to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
index 220ddd664b..717c8ed3d3 100644
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@@ -7,5 +7,8 @@ component: Docker
every: 10s
lookup: average -10s
crit: $this > 0
- info: average number of unhealthy docker containers over the last 10 seconds
+ info: Average number of unhealthy docker containers over the last 10 seconds. \
+ There are unhealthy docker containers. \
+ Some containers are not running due to failed health checks. \