Revert "Update alarms info" (#11633)

author: Ilya Mashchenko <ilya@netdata.cloud> 2021-10-18 11:36:23 +0300
committer: GitHub <noreply@github.com> 2021-10-18 11:36:23 +0300
commit: 59d95ec88cd3663c5aaa7efd408fbbdca981a0b1 (patch)
tree: 356cf3e26e21b32cd1472aef98f8c4d3a889cded /health
parent: e57e486903ca3aeba7d286c3dfa183af82c8ffd7 (diff)
58 files changed, 211 insertions, 599 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index bab4e1ae02..65f1a69ab9 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -13,11 +13,7 @@ component: UPS
      warn: $this > (($status >= $WARNING)  ? (70) : (80))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 10m multiplier 1.5 max 1h
-     info: Average UPS load over the last 10 minutes. \
-           High UPS load. \
-           It may result in either your UPS transferring to bypass or shutting down \
-           as a self-protection and safety measure due to overload. \
-           You should remove some attached equipment from the UPS.
+     info: average UPS load over the last 10 minutes
        to: sitemgr
 
 # Discussion in https://github.com/netdata/netdata/pull/3928:
@@ -35,9 +31,7 @@ component: UPS
      warn: $this < 100
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 10m multiplier 1.5 max 1h
-     info: Average UPS charge over the last minute. \
-           The UPS is running on battery power. It will shut down if external power is not restored. \
-           You should prepare any attached equipment for the shutdown.
+     info: average UPS charge over the last minute
        to: sitemgr
 
  template: apcupsd_last_collected_secs
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 14fe255242..91d469395e 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -9,8 +9,7 @@ component: Exporting engine
     every: 1m
      warn: $this > 0
     delay: down 5m multiplier 1.5 max 1h
-     info: The backends subsystem is deprecated and will be removed soon. \
-           Migrate your configuration to exporting.conf.
+     info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
        to: sysadmin
 
 # make sure we are sending data to backend
@@ -26,10 +25,7 @@ component: Exporting engine
      warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
      crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
     delay: down 5m multiplier 1.5 max 1h
-     info: Number of seconds since the last successful buffering of backend data. \
-           The backend subsystem failed to buffer metrics for a while. Some metrics are lost while exporting. \
-           It indicates that the backend destination is down or unreachable. \
-           Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
+     info: number of seconds since the last successful buffering of backend data
        to: dba
 
     alarm: backend_metrics_sent
@@ -42,8 +38,5 @@ component: Exporting engine
     every: 10s
      warn: $this != 100
     delay: down 5m multiplier 1.5 max 1h
-     info: Percentage of metrics sent to the backend server. \
-           The backends subsystem failed to send all metrics. Some metrics are lost while exporting. \
-           It indicates that the backend destination is down or unreachable. \
-           Short-term network availability issues might be fixed by increasing [buffer on failures] in netdata.conf.
+     info: percentage of metrics sent to the backend server
        to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index df030c497e..49cb5ad0f6 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -9,9 +9,9 @@ component: Disk
     every: 1m
      warn: $this > 0
     delay: up 2m down 1h multiplier 1.5 max 2h
-     info: Number of bcache read races in the last minute. \
-           The bucket was reused and invalidated while reading from the cache. \
-           When this occurs the data is reread from the backing device.
+     info: number of times in the last 10 minutes that data was read from the cache \
+           while its bucket was reused and invalidated \
+           (when this occurs the data is reread from the backing device)
        to: sysadmin
 
  template: bcache_cache_dirty
@@ -25,7 +25,6 @@ component: Disk
      warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
      crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
     delay: up 1m down 1h multiplier 1.5 max 2h
-     info: Percentage of cache space used for dirty data and metadata. \
-           High block cache utilization by dirty data and metadata. \
-           This usually means your SSD cache is too small.
+     info: percentage of cache space used for dirty data and metadata \
+           (this usually means your SSD cache is too small)
        to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 35c7587c06..13ac8c1825 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -11,11 +11,9 @@ component: Beanstalk
      warn: $this > 0
      crit: $this > 10
     delay: up 0 down 5m multiplier 1.2 max 1h
-     info: Number of buried jobs across all tubes. \
-           There are buried jobs. \
-           It usually happens if something goes wrong while the consumer processes it. \
-           The presence of buried jobs in a tube does not affect new jobs. \
-           You need to manually kick the jobs, so they can be processed.
+     info: number of buried jobs across all tubes. \
+           You need to manually kick them so they can be processed. \
+           Presence of buried jobs in a tube does not affect new jobs.
        to: sysadmin
       
 # get the number of buried jobs per queue
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index 44b5167ce8..8d197aa8d2 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -13,10 +13,7 @@ component: File system
      warn: $this > (($status >= $WARNING)  ? (90) : (95))
      crit: $this > (($status == $CRITICAL) ? (95) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of allocated Btrfs physical disk space. \
-           Most of the Btrfs physical disk space is allocated. \
-           To fix it, first, try running Btrfs balance. \
-           If that does not help, consider deleting snapshots or adding more physical space to the pool.
+     info: percentage of allocated BTRFS physical disk space
        to: sysadmin
 
  template: btrfs_data
@@ -33,11 +30,7 @@ component: File system
      warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
      crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of used Btrfs data space. \
-           High Btrfs data space utilization. \
-           If there is enough unallocated memory, the data space will be automatically increased. \
-           Otherwise, to fix, first try to run a balance. \
-           If that does not help, you should add more physical space to the pool.
+     info: utilization of BTRFS data space
        to: sysadmin
 
  template: btrfs_metadata
@@ -54,11 +47,7 @@ component: File system
      warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
      crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of used Btrfs metadata space. \
-           High Btrfs metadata space utilization. \
-           If there is enough unallocated memory, the metadata space will be automatically increased. \
-           Otherwise, you may wish to run a balance on metadata only if you find you have very large amounts of \
-           metadata space allocated, but unused.
+     info: utilization of BTRFS metadata space
        to: sysadmin
 
  template: btrfs_system
@@ -75,7 +64,5 @@ component: File system
      warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
      crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of used Btrfs system space. \
-           High Btrfs system space utilization. \
-           If there is enough unallocated memory, the system space will be automatically increased.
+     info: utilization of BTRFS system space
        to: sysadmin
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index 227e8ecf8d..1f9da25c75 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -11,7 +11,5 @@ component: Ceph
      warn: $this > (($status >= $WARNING ) ? (85) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: down 5m multiplier 1.2 max 1h
-     info: Percentage of used cluster disk space. \
-           High disk space utilization. \
-           To fix this, consider adding a node or removing unneeded data from the cluster.
+     info: cluster disk space utilization
        to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 717c6084eb..45b34806ca 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -14,10 +14,7 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: Average CPU utilization over the last 10 minutes. \
-           High cgroup CPU utilization. \
-           The system will throttle the cgroup CPU usage when the usage is over the limit. \
-           To fix, increase the cgroup CPU limit.
+     info: average cgroup CPU utilization over the last 10 minutes
        to: sysadmin
 
  template: cgroup_ram_in_use
@@ -33,8 +30,5 @@ component: Memory
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: down 15m multiplier 1.5 max 1h
-     info: Percentage of used memory. \
-           High cgroup memory utilization. \
-           OOM will kill some processes when the utilization reaches 100%. \
-           To fix, increase the cgroup memory limit (if set).
+     info: cgroup memory utilization
        to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index de35b66aa0..1f227841e6 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -12,9 +12,7 @@ component: CockroachDB
      warn: $this > (($status >= $WARNING)  ? (80) : (85))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: Percentage of used storage space. \
-           High storage capacity utilization. \
-           To fix, increase the space available for CockroachDB data.
+     info: storage capacity utilization
        to: dba
 
  template: cockroachdb_used_usable_storage_capacity
@@ -28,9 +26,7 @@ component: CockroachDB
      warn: $this > (($status >= $WARNING)  ? (80) : (85))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: Percentage of storage usable space. \
-           High usable storage capacity utilization. \
-           To fix, increase the space available for CockroachDB data.
+     info: storage usable space utilization
        to: dba
 
 # Replication
@@ -45,10 +41,7 @@ component: CockroachDB
     every: 10s
      warn: $this > 0
     delay: down 15m multiplier 1.5 max 1h
-     info: Number of unavailable ranges. \
-           There are ranges with fewer live replicas than needed for quorum. \
-           If a majority of a range replicas are on nodes that are unavailable, \
-           then the entire range is unavailable and will be unable to process queries.
+     info: number of ranges with fewer live replicas than needed for quorum
        to: dba
 
  template: cockroachdb_underreplicated_ranges
@@ -61,9 +54,7 @@ component: CockroachDB
     every: 10s
      warn: $this > 0
     delay: down 15m multiplier 1.5 max 1h
-     info: Number of under-replicated ranges. \
-           There are ranges with fewer live replicas than the replication target. \
-           As soon as other nodes are available, they will replicate to them until they have reached their desired replication factor.
+     info: number of ranges with fewer live replicas than the replication target
        to: dba
 
 # FD
@@ -78,7 +69,5 @@ component: CockroachDB
     every: 10s
      warn: $this > 80
     delay: down 15m multiplier 1.5 max 1h
-     info: Percentage of used file descriptors. \
-           High file descriptors utilization (against softlimit). \
-           To fix, adjust the file descriptors limit for the process or system-wide.
+     info: open file descriptors utilization (against softlimit)
        to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index a24436e68c..ad69528253 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -14,11 +14,7 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal). \
-           High system CPU utilization. \
-           A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
-           You can check the CPU PSI charts if there is CPU contention and \
-           per-process CPU usage to find the top consumers.
+     info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
        to: sysadmin
 
  template: 10min_cpu_iowait
@@ -34,9 +30,7 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (20) : (40))
      crit: $this > (($status == $CRITICAL) ? (40) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: Average CPU iowait time over the last 10 minutes. \
-           High system CPU iowait time. \
-           A constantly high value indicates that IO is a bottleneck, which can make the system run slower.
+     info: average CPU iowait time over the last 10 minutes
        to: sysadmin
 
  template: 20min_steal_cpu
@@ -52,10 +46,7 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (5)  : (10))
      crit: $this > (($status == $CRITICAL) ? (20) : (30))
     delay: down 1h multiplier 1.5 max 2h
-     info: Average CPU steal time over the last 20 minutes. \
-           High system CPU steal time. \
-           A large amount of steal time indicates CPU contention on the host system, which can reduce guest performance. \
-           To fix, increase the guest CPU priority or CPU quota, or run fewer guests on the host.
+     info: average CPU steal time over the last 20 minutes
        to: sysadmin
 
 ## FreeBSD
@@ -72,8 +63,5 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: Average CPU utilization over the last 10 minutes (excluding nice). \
-           High system CPU utilization. \
-           A constantly high value might indicate CPU bottleneck, which can make the system run slower. \
-           You can check per-process CPU usage to find the top consumers.
+     info: average CPU utilization over the last 10 minutes (excluding nice)
        to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index d40cd82d99..65c41b8462 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -13,8 +13,7 @@ component: DB engine
     every: 10s
      crit: $this > 0
     delay: down 15m multiplier 1.5 max 1h
-     info: Number of filesystem errors in the last 10 minutes. \
-           Dbengine is experiencing filesystem errors (too many open files, wrong permissions, etc.).
+     info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc.)
        to: sysadmin
 
     alarm: 10min_dbengine_global_io_errors
@@ -29,8 +28,7 @@ component: DB engine
     every: 10s
      crit: $this > 0
     delay: down 1h multiplier 1.5 max 3h
-     info: Number of IO errors in the last 10 minutes. \
-           Dbengine is experiencing I/O errors (CRC errors, out of space, bad disk, etc.).
+     info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc.)
        to: sysadmin
 
     alarm: 10min_dbengine_global_flushing_warnings
@@ -45,9 +43,8 @@ component: DB engine
     every: 10s
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 3h
-     info: Number of times when dbengine dirty pages were over 50% of the instance page cache in the last 10 minutes. \
-           Metric data is at risk of not being stored in the database. \
-           To remedy, reduce disk load or use faster disks.
+     info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+           Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
        to: sysadmin
 
     alarm: 10min_dbengine_global_flushing_errors
@@ -62,7 +59,6 @@ component: DB engine
     every: 10s
      crit: $this != 0
     delay: down 1h multiplier 1.5 max 3h
-     info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
-           Some metric data was dropped to unblock data collection. \
-           To fix, reduce disk load or use faster disks.
+     info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+           Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
        to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index c185232581..5daff61a14 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -23,10 +23,7 @@ component: Disk
      warn: $this > (($status >= $WARNING ) ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of used space by disk $family. \
-           High disk space utilization. \
-           You may experience slowdowns and crashes if the disk is full. \
-           To fix, cleanup your disk or upgrade it.
+     info: disk $family space utilization
        to: sysadmin
 
  template: disk_inode_usage
@@ -43,10 +40,7 @@ component: Disk
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: Percentage of used inodes by disk $family. \
-           High disk inode utilization. \
-           The number of inodes indicates the number of files and folders you have. \
-           To fix, clear cache files or delete unnecessary files and folders.
+     info: disk $family inode utilization
        to: sysadmin
 
 
@@ -153,10 +147,7 @@ component: Disk
     every: 1m
      warn: $this > 98 * (($status >= $WARNING)  ? (0.7) : (1))
     delay: down 15m multiplier 1.2 max 1h
-     info: Average percentage of time $family disk was busy over the last 10 minutes. \
-           High disk load. \
-           The disk spent most of the time servicing read or write requests. \
-           If the disk controller processes the operations in parallel, the alarm does not necessarily indicate a high load.
+     info: average percentage of time $family disk was busy over the last 10 minutes
        to: silent
 
 
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index d25844c5e8..ec4937c0a8 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -11,6 +11,5 @@ component: DNS
     every: 10s
      warn: $this == nan
     delay: up 20s down 5m multiplier 1.5 max 1h
-     info: Average DNS query round trip time over the last 10 seconds. \
-           Failed to query the DNS server.
+     info: average DNS query round trip time over the last 10 seconds
        to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 37f6e307b5..010b945992 100644
--- a/health/health.d/dnsmasq_dhcp.conf
author	Ilya Mashchenko <ilya@netdata.cloud>	2021-10-18 11:36:23 +0300
committer	GitHub <noreply@github.com>	2021-10-18 11:36:23 +0300
commit	59d95ec88cd3663c5aaa7efd408fbbdca981a0b1 (patch)
tree	356cf3e26e21b32cd1472aef98f8c4d3a889cded /health
parent	e57e486903ca3aeba7d286c3dfa183af82c8ffd7 (diff)