summary refs log tree commit diff stats
path: root/health
diff options
context:
space:
mode:
author Emmanuel Vasilakis <mrzammler@mm.st> 2021-04-20 16:24:41 +0300
committer GitHub <noreply@github.com> 2021-04-20 16:24:41 +0300
commit f5bd20e60ae8a65b9c709fcd28d08c1bea268f2d (patch)
tree 8897d68a69f82d8fd0732aca8c344778a5e62340 /health
parent 0a6a14e323ee3b7a2dc17b2ca5b0cce4a8b2eb5e (diff)
Provide new attributes in health conf files (#10961)
* read and store new attributes (class, component, type) from health conf files. Replace family variable in info strings * provide the attributes to the JSON output * remove extra semicolon * populate conf files with new attributes * added newline * remove extra defines from health.h * remove empty line * remove realloc * use helper variables for find_and_replace. Adjust position for next strstr * remove comments * Add type to mysql.conf and vcsa.conf * fix formatting * add parentheses * remove extra assignment * changes to mysql_galera_cluster_state from master * add type Errors to unbound_request_list_overwritten * fix indentation for info strings spanning more than one line * check for null, replace with empty string if true * add class, component, type to systemdunits.conf
Diffstat (limited to 'health')
-rw-r--r--health/health.c4
-rw-r--r--health/health.d/adaptec_raid.conf42
-rw-r--r--health/health.d/am2320.conf23
-rw-r--r--health/health.d/anomalies.conf30
-rw-r--r--health/health.d/apache.conf23
-rw-r--r--health/health.d/apcupsd.conf77
-rw-r--r--health/health.d/backend.conf65
-rw-r--r--health/health.d/bcache.conf50
-rw-r--r--health/health.d/beanstalkd.conf27
-rw-r--r--health/health.d/bind_rndc.conf21
-rw-r--r--health/health.d/boinc.conf118
-rw-r--r--health/health.d/btrfs.conf116
-rw-r--r--health/health.d/ceph.conf23
-rw-r--r--health/health.d/cgroups.conf54
-rw-r--r--health/health.d/cockroachdb.conf170
-rw-r--r--health/health.d/couchdb.conf23
-rw-r--r--health/health.d/cpu.conf108
-rw-r--r--health/health.d/dbengine.conf104
-rw-r--r--health/health.d/disks.conf126
-rw-r--r--health/health.d/dns_query.conf21
-rw-r--r--health/health.d/dnsmasq_dhcp.conf23
-rw-r--r--health/health.d/dockerd.conf19
-rw-r--r--health/health.d/elasticsearch.conf21
-rw-r--r--health/health.d/entropy.conf25
-rw-r--r--health/health.d/exporting.conf23
-rw-r--r--health/health.d/fping.conf108
-rw-r--r--health/health.d/fronius.conf25
-rw-r--r--health/health.d/gearman.conf46
-rw-r--r--health/health.d/haproxy.conf59
-rw-r--r--health/health.d/hdfs.conf130
-rw-r--r--health/health.d/httpcheck.conf205
-rw-r--r--health/health.d/ioping.conf29
-rw-r--r--health/health.d/ipc.conf54
-rw-r--r--health/health.d/ipfs.conf23
-rw-r--r--health/health.d/ipmi.conf44
-rw-r--r--health/health.d/kubelet.conf195
-rw-r--r--health/health.d/lighttpd.conf23
-rw-r--r--health/health.d/linux_power_supply.conf23
-rw-r--r--health/health.d/load.conf94
-rw-r--r--health/health.d/mdstat.conf84
-rw-r--r--health/health.d/megacli.conf109
-rw-r--r--health/health.d/memcached.conf88
-rw-r--r--health/health.d/memory.conf75
-rw-r--r--health/health.d/mongodb.conf23
-rw-r--r--health/health.d/mysql.conf266
-rw-r--r--health/health.d/named.conf23
-rw-r--r--health/health.d/net.conf322
-rw-r--r--health/health.d/netfilter.conf29
-rw-r--r--health/health.d/nginx.conf23
-rw-r--r--health/health.d/nginx_plus.conf23
-rw-r--r--health/health.d/phpfpm.conf23
-rw-r--r--health/health.d/pihole.conf109
-rw-r--r--health/health.d/portcheck.conf96
-rw-r--r--health/health.d/postgres.conf23
-rw-r--r--health/health.d/processes.conf25
-rw-r--r--health/health.d/pulsar.conf23
-rw-r--r--health/health.d/ram.conf129
-rw-r--r--health/health.d/redis.conf67
-rw-r--r--health/health.d/retroshare.conf46
-rw-r--r--health/health.d/riakkv.conf159
-rw-r--r--health/health.d/scaleio.conf65
-rw-r--r--health/health.d/softnet.conf85
-rw-r--r--health/health.d/squid.conf23
-rw-r--r--health/health.d/stiebeleltron.conf25
-rw-r--r--health/health.d/swap.conf56
-rw-r--r--health/health.d/systemdunits.conf210
-rw-r--r--health/health.d/tcp_conn.conf27
-rw-r--r--health/health.d/tcp_listen.conf110
-rw-r--r--health/health.d/tcp_mem.conf27
-rw-r--r--health/health.d/tcp_orphans.conf27
-rw-r--r--health/health.d/tcp_resets.conf102
-rw-r--r--health/health.d/udp_errors.conf50
-rw-r--r--health/health.d/unbound.conf65
-rw-r--r--health/health.d/varnish.conf21
-rw-r--r--health/health.d/vcsa.conf223
-rw-r--r--health/health.d/vernemq.conf597
-rw-r--r--health/health.d/vsphere.conf263
-rw-r--r--health/health.d/web_log.conf650
-rw-r--r--health/health.d/whoisquery.conf44
-rw-r--r--health/health.d/wmi.conf247
-rw-r--r--health/health.d/x509check.conf61
-rw-r--r--health/health.d/zfs.conf21
-rw-r--r--health/health.d/zookeeper.conf23
-rw-r--r--health/health.h17
-rw-r--r--health/health_config.c76
-rw-r--r--health/health_json.c50
-rw-r--r--health/health_log.c54
87 files changed, 4224 insertions, 3154 deletions
diff --git a/health/health.c b/health/health.c
index a6815f3c57..a3961b5584 100644
--- a/health/health.c
+++ b/health/health.c
@@ -930,7 +930,7 @@ void *health_main(void *ptr) {
if(likely(!rrdcalc_isrepeating(rc))) {
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
(
@@ -980,7 +980,7 @@ void *health_main(void *ptr) {
rc->last_repeat = now;
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
rc->delay_last,
(
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index 0753c6e5db..b067e18401 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -1,24 +1,30 @@
# logical device status check
-template: adaptec_raid_ld_status
- on: adaptec_raid.ld_status
- lookup: max -10s foreach *
- units: bool
- every: 10s
- crit: $this > 0
- delay: down 5m multiplier 1.5 max 1h
- info: logical device status is failed or degraded
- to: sysadmin
+ template: adaptec_raid_ld_status
+ on: adaptec_raid.ld_status
+ class: System
+component: RAID
+ type: Errors
+ lookup: max -10s foreach *
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: logical device status is failed or degraded
+ to: sysadmin
# physical device state check
-template: adaptec_raid_pd_state
- on: adaptec_raid.pd_state
- lookup: max -10s foreach *
- units: bool
- every: 10s
- crit: $this > 0
- delay: down 5m multiplier 1.5 max 1h
- info: physical device state is not online
- to: sysadmin
+ template: adaptec_raid_pd_state
+ on: adaptec_raid.pd_state
+ class: System
+component: RAID
+ type: Errors
+ lookup: max -10s foreach *
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: physical device state is not online
+ to: sysadmin
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
index ddf8b704d8..4bac98fbbb 100644
--- a/health/health.d/am2320.conf
+++ b/health/health.d/am2320.conf
@@ -1,12 +1,15 @@
# make sure am2320 is sending stats
-template: am2320_last_collected_secs
- on: am2320.temperature
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster \ No newline at end of file
+ template: am2320_last_collected_secs
+ on: am2320.temperature
+ class: Other
+component: Sensors
+ type: Latency
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index c4c96eaf96..f27e39fc10 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -1,17 +1,23 @@
# raise a warning alarm if an anomaly probability is consistently above 50%
-template: anomalies_anomaly_probabilities
- on: anomalies.probability
- lookup: average -2m foreach *
- every: 1m
- warn: $this > 50
- info: average anomaly probability over the last 2 minutes
+ template: anomalies_anomaly_probabilities
+ on: anomalies.probability
+ class: Netdata
+component: ML
+ type: Errors
+ lookup: average -2m foreach *
+ every: 1m
+ warn: $this > 50
+ info: average anomaly probability over the last 2 minutes
# raise a warning alarm if an anomaly flag is consistently firing
-template: anomalies_anomaly_flags
- on: anomalies.anomaly
- lookup: sum -2m foreach *
- every: 1m
- warn: $this > 10
- info: number of anomalies in the last 2 minutes
+ template: anomalies_anomaly_flags
+ on: anomalies.anomaly
+ class: Netdata
+component: ML
+ type: Errors
+ lookup: sum -2m foreach *
+ every: 1m
+ warn: $this > 10
+ info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf
index 0c98b87783..c623fb8801 100644
--- a/health/health.d/apache.conf
+++ b/health/health.d/apache.conf
@@ -1,14 +1,17 @@
# make sure apache is running
-template: apache_last_collected_secs
- on: apache.requests
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
+ template: apache_last_collected_secs
+ on: apache.requests
+ class: Web Server
+component: Apache
+ type: Latency
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 12384fac69..07b5c28c97 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -1,40 +1,49 @@
# you can disable an alarm notification by setting the 'to' line to: silent
-template: apcupsd_10min_ups_load
- on: apcupsd.load
- os: *
- hosts: *
- lookup: average -10m unaligned of percentage
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 10m multiplier 1.5 max 1h
- info: average UPS load over the last 10 minutes
- to: sitemgr
+ template: apcupsd_10min_ups_load
+ on: apcupsd.load
+ class: Power Supply
+component: UPS
+ type: Utilization
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of percentage
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 10m multiplier 1.5 max 1h
+ info: average UPS load over the last 10 minutes
+ to: sitemgr
# Discussion in https://github.com/netdata/netdata/pull/3928:
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: apcupsd_ups_charge
- on: apcupsd.charge
- os: *
- hosts: *
- lookup: average -60s unaligned of charge
- units: %
- every: 60s
- warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
- delay: down 10m multiplier 1.5 max 1h
- info: average UPS charge over the last minute
- to: sitemgr
+ template: apcupsd_ups_charge
+ on: apcupsd.charge
+ class: Power Supply
+component: UPS
+ type: Errors
+ os: *
+ hosts: *
+ lookup: average -60s unaligned of charge
+ units: %
+ every: 60s
+ warn: $this < 100
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 10m multiplier 1.5 max 1h
+ info: average UPS charge over the last minute
+ to: sitemgr
-template: apcupsd_last_collected_secs
- on: apcupsd.load
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sitemgr
+ template: apcupsd_last_collected_secs
+ on: apcupsd.load
+ class: Power Supply
+component: UPS device
+ type: Latency
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index 8089dc94e2..948ea551a0 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -1,33 +1,42 @@
# Alert that backen