health: improve alarms infos (#10853)

* health: update anomalies alarms info * health: update apcupsd alarms info * health: update backend alarms info * health: update bcache alarms info * health: update beanstalk alarms info * health: update bind_rndc alarms info * health: update boinc alarms info * health: update btrfs alarms info * health: update ceph alarms info * health: update cgroups alarms info * health: update cockroachdb alarms info * health: update cpu alarms info * health: update dbengine alarms info * health: update disks alarms info * health: update dns_query alarms info * health: update dnsmasq_dhcp alarms info * health: update dockerd alarms info * health: update entropy alarms info * health: update fping alarms info * health: update gearman alarms info * health: update haproxy alarms info * health: update hdfs alarms info * health: update httpcheck alarms info * health: update ioping alarms info * health: update ipc alarms info * health: update ipfs alarms info * health: update ipmi alarms info * health: update isc_dhcpd alarms info * health: update kubelet alarms info * health: update linux_power_supply alarms info * health: update load alarms info * health: update mdstat alarms info * health: update megacli alarms info * health: update memcached alarms info * health: update memory alarms info * health: update mysql alarms info * health: update net alarms info * health: update netfilter alarms info * health: update pihole alarms info * health: update portcheck alarms info * health: update processes alarms info * health: update ram alarms info * health: update redis alarms info * health: update retroshare alarms info * health: update riakkv alarms info * health: update scaleio alarms info * health: update softnet alarms info * health: update swap alarms info * health: update sync alarms info * health: update tcp_conn alarms info * health: update tcp_listen alarms info * health: update tcp_mem alarms info * 
health: update tcp_orphans alarms info * health: update tcp_resets alarms info * health: update udp_errors alarms info * health: update unbound alarms info * health: update vcsa alarms info * health: update vernemq alarms info * health: update vsphere alarms info * health: update whoisquery alarms info * health: update wmi alarms info * health: update x509check alarms info * health: update zfs alarms info * health: update web_log alarms info * health: update mdstat alarms info * health: update processes alarms info * health: minor * Update health/health.d/load.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/mysql.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/mysql.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/memory.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/memory.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/httpcheck.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/pihole.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/web_log.conf Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update health/health.d/bcache.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/dbengine.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/dbengine.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/dbengine.conf 
Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/mdstat.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/memcached.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/net.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/ram.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/softnet.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/softnet.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/tcp_resets.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> * Update health/health.d/tcp_resets.conf Co-authored-by: Joel Hans <joel.g.hans@gmail.com> Co-authored-by: Vladimir Kobal <vlad@prokk.net> Co-authored-by: Joel Hans <joel.g.hans@gmail.com>
author: Ilya Mashchenko <ilya@netdata.cloud> 2021-03-26 15:39:51 +0300
committer: GitHub <noreply@github.com> 2021-03-26 08:39:51 -0400
commit: ba0992f3affcc57cd62343891455d8defa04530b (patch)
tree: 7c6f9f4eee3c2a5901cc47562666ef91b43f1a4a
parent: 5510b429a642eb2abd8f9831ac98116c4f473325 (diff)
64 files changed, 316 insertions, 311 deletions
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index 8c7945c3cd..c4c96eaf96 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -5,7 +5,7 @@ template: anomalies_anomaly_probabilities
   lookup: average -2m foreach *
    every: 1m
     warn: $this > 50
-    info: average anomaly probability > 50% for last 2 minutes
+    info: average anomaly probability over the last 2 minutes
 
 # raise a warning alarm if an anomaly flag is consistently firing
 
@@ -14,4 +14,4 @@ template: anomalies_anomaly_flags
   lookup: sum -2m foreach *
    every: 1m
     warn: $this > 10
-    info: count of anomalies > 10 for last 2 minutes
+    info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 4d0f050708..12384fac69 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -10,7 +10,7 @@ template: apcupsd_10min_ups_load
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 10m multiplier 1.5 max 1h
-    info: average UPS load for the last 10 minutes
+    info: average UPS load over the last 10 minutes
       to: sitemgr
 
 # Discussion in https://github.com/netdata/netdata/pull/3928:
@@ -25,7 +25,7 @@ template: apcupsd_ups_charge
     warn: $this < 100
     crit: $this < (($status == $CRITICAL) ? (60) : (50))
    delay: down 10m multiplier 1.5 max 1h
-    info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+    info: average UPS charge over the last minute
       to: sitemgr
 
 template: apcupsd_last_collected_secs
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index e51b8aa5f7..8089dc94e2 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -6,7 +6,7 @@
    every: 1m
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 1h
-    info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+    info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
       to: sysadmin
 
 # make sure we are sending data to backend
@@ -31,26 +31,3 @@
    delay: down 5m multiplier 1.5 max 1h
     info: percentage of metrics sent to the backend server
       to: dba
-
-   alarm: backend_metrics_lost
-      on: netdata.backend_metrics
-   units: metrics
-    calc: abs($lost)
-   every: 10s
-    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of metrics lost due to repeating failures to contact the backend server
-      to: dba
-
-
-# this chart has been removed from netdata
-#   alarm: backend_slow
-#      on: netdata.backend_latency
-#   units: %
-#    calc: $latency * 100 / ($update_every * 1000)
-#   every: 10s
-#    warn: $this > 50
-#    crit: $this > 100
-#   delay: down 5m multiplier 1.5 max 1h
-#    info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
-#      to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index f737129bf6..d5fccf4f7d 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -6,7 +6,9 @@ template: bcache_cache_errors
    every: 1m
     warn: $this > 0
    delay: up 2m down 1h multiplier 1.5 max 2h
-    info: the number of times the data was being read from the cache, the bucket was reused and invalidated, during the last 10 mins (when this occurs the data is reread from the backing device)
+    info: number of times data was read from the cache, \
+          the bucket was reused and invalidated in the last 10 minutes \
+          (when this occurs the data is reread from the backing device)
       to: sysadmin
 
 template: bcache_cache_dirty
@@ -17,5 +19,6 @@ template: bcache_cache_dirty
     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
    delay: up 1m down 1h multiplier 1.5 max 2h
-    info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+    info: percentage of cache space used for dirty data and metadata \
+          (this usually means your SSD cache is too small)
       to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index af17358847..0c428ecbc6 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,7 +8,9 @@ template: beanstalk_server_buried_jobs
     warn: $this > 0
     crit: $this > 10
    delay: up 0 down 5m multiplier 1.2 max 1h
-    info: the number of buried jobs aggregated across all tubes
+    info: number of buried jobs across all tubes. \
+          You need to manually kick them so they can be processed. \
+          Presence of buried jobs in a tube does not affect new jobs.
       to: sysadmin
       
 # get the number of buried jobs per queue
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77cd4..5cc7a72f7a 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -1,9 +1,9 @@
- template: bind_rndc_stats_file_size
+template: bind_rndc_stats_file_size
       on: bind_rndc.stats_size
    units: megabytes
    every: 60
     calc: $stats_size
     warn: $this > 512
     crit: $this > 1024
-    info: Bind stats file is very large! Consider to create logrotate conf file for it!
+    info: BIND statistics-file size
       to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 43c588db64..25b7f1994e 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,7 @@ families: *
     warn: $this > 0
     crit: $this > 1
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the total number of compute errors over the past 10 minutes
+    info: average number of compute errors over the last 10 minutes
       to: sysadmin
 
 # Warn on lots of upload errors
@@ -27,7 +27,7 @@ families: *
     warn: $this > 0
     crit: $this > 1
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the average number of failed uploads over the past 10 minutes
+    info: average number of failed uploads over the last 10 minutes
       to: sysadmin
 
 # Warn on the task queue being empty
@@ -42,7 +42,7 @@ families: *
     warn: $this < 1
     crit: $this < 0.1
    delay: up 5m down 10m multiplier 1.5 max 1h
-    info: the total number of locally available tasks
+    info: average number of total tasks over the last 10 minutes
       to: sysadmin
 
 # Warn on no active tasks with a non-empty queue
@@ -58,5 +58,5 @@ families: *
     warn: $this < 1
     crit: $this < 0.1
    delay: up 5m down 10m multiplier 1.5 max 1h
-    info: the total number of active tasks
+    info: average number of active tasks over the last 10 minutes
       to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544fc..93ab8748a6 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -10,7 +10,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95))
     crit: $this > (($status == $CRITICAL) ? (95) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of allocated BTRFS physical disk space
+    info: percentage of allocated BTRFS physical disk space
       to: sysadmin
 
 template: btrfs_data
@@ -24,7 +24,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS data space
+    info: utilization of BTRFS data space
       to: sysadmin
 
 template: btrfs_metadata
@@ -38,7 +38,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS metadata space
+    info: utilization of BTRFS metadata space
       to: sysadmin
 
 template: btrfs_system
@@ -52,6 +52,5 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS system space
+    info: utilization of BTRFS system space
       to: sysadmin
-
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index 5cae0877c6..cdbab0f67e 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -8,5 +8,5 @@ template: ceph_cluster_space_usage
     warn: $this > (($status >= $WARNING ) ? (85) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 5m multiplier 1.2 max 1h
-    info: current ceph disk usage
+    info: cluster disk space utilization
       to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 0157f3efed..c0a16f154c 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,7 +11,7 @@ template: cgroup_10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: cpu utilization for the last 10 minutes
+    info: average cgroup CPU utilization over the last 10 minutes
       to: sysadmin
 
 template: cgroup_ram_in_use
@@ -24,5 +24,5 @@ template: cgroup_ram_in_use
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: RAM used by cgroup
+    info: cgroup memory utilization
       to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index 8ab2c9d0f7..47773d04cf 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -22,7 +22,7 @@ template: cockroachdb_used_storage_capacity
     warn: $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: entire disk usage percentage
+    info: storage capacity utilization
       to: dba
 
 template: cockroachdb_used_usable_storage_capacity
@@ -33,7 +33,7 @@ template: cockroachdb_used_usable_storage_capacity
     warn: $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: usable space usage percentage
+    info: storage usable space utilization
       to: dba
 
 # Replication
@@ -67,7 +67,7 @@ template: cockroachdb_open_file_descriptors_limit
    every: 10s
     warn: $this > 80
    delay: down 15m multiplier 1.5 max 1h
-    info: open file descriptors usage percentage
+    info: open file descriptors utilization (against softlimit)
       to: dba
 
 # SQL
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index fa8189856b..32c69f8f5b 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -11,7 +11,7 @@ template: 10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
+    info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
       to: sysadmin
 
 template: 10min_cpu_iowait
@@ -24,7 +24,7 @@ template: 10min_cpu_iowait
     warn: $this > (($status >= $WARNING)  ? (20) : (40))
     crit: $this > (($status == $CRITICAL) ? (40) : (50))
    delay: down 15m multiplier 1.5 max 1h
-    info: average CPU wait I/O for the last 10 minutes
+    info: average CPU iowait time over the last 10 minutes
       to: sysadmin
 
 template: 20min_steal_cpu
@@ -37,7 +37,7 @@ template: 20min_steal_cpu
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (20) : (30))
    delay: down 1h multiplier 1.5 max 2h
-    info: average CPU steal time for the last 20 minutes
+    info: average CPU steal time over the last 20 minutes
       to: sysadmin
 
 ## FreeBSD
@@ -51,5 +51,5 @@ template: 10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding nice)
+    info: average CPU utilization over the last 10 minutes (excluding nice)
       to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 274673e3e3..3e51d37eca 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -10,7 +10,7 @@ lookup: sum -10m unaligned of fs_errors
  every: 10s
   crit: $this > 0
  delay: down 15m multiplier 1.5 max 1h
-  info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+  info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
     to: sysadmin
 
  alarm: 10min_dbengine_global_io_errors
@@ -22,7 +22,7 @@ lookup: sum -10m unaligned of io_errors
  every: 10s
   crit: $this > 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+  info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
     to: sysadmin
 
  alarm: 10min_dbengine_global_flushing_warnings
@@ -34,7 +34,8 @@ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
  every: 10s
   warn: $this > 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+  info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+        Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
     to: sysadmin
 
  alarm: 10min_dbengine_global_flushing_err
author	Ilya Mashchenko <ilya@netdata.cloud>	2021-03-26 15:39:51 +0300
committer	GitHub <noreply@github.com>	2021-03-26 08:39:51 -0400
commit	ba0992f3affcc57cd62343891455d8defa04530b (patch)
tree	7c6f9f4eee3c2a5901cc47562666ef91b43f1a4a
parent	5510b429a642eb2abd8f9831ac98116c4f473325 (diff)