diff options
author | Emmanuel Vasilakis <mrzammler@mm.st> | 2021-04-20 16:24:41 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-20 16:24:41 +0300 |
commit | f5bd20e60ae8a65b9c709fcd28d08c1bea268f2d (patch) | |
tree | 8897d68a69f82d8fd0732aca8c344778a5e62340 /health | |
parent | 0a6a14e323ee3b7a2dc17b2ca5b0cce4a8b2eb5e (diff) |
Provide new attributes in health conf files (#10961)
* read and store new attributes (class, component, type) from health conf files. Replace family variable in info strings
* provide the attributes to jsons
* remove extra semicolon
* populate conf files with new attributes
* added newline
* remove extra defines from health.h
* remove empty line
* remove realloc
* use helper variables for find_and_replace. Adjust position for next strstr
* remove comments
* Add type to mysql.conf and vcsa.conf
* fix formatting
* add parenthesis
* remove extra assignment
* changes to mysql_galera_cluster_state from master
* add type Errors to unbound_request_list_overwritten
* fix indentation for info strings spanning more than one line
* check for null, replace with empty string if true
* add class, component, type to systemdunits.conf
Diffstat (limited to 'health')
87 files changed, 4224 insertions, 3154 deletions
diff --git a/health/health.c b/health/health.c index a6815f3c57..a3961b5584 100644 --- a/health/health.c +++ b/health/health.c @@ -930,7 +930,7 @@ void *health_main(void *ptr) { if(likely(!rrdcalc_isrepeating(rc))) { ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last, ( @@ -980,7 +980,7 @@ void *health_main(void *ptr) { rc->last_repeat = now; ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, rc->delay_last, ( diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index 0753c6e5db..b067e18401 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -1,24 +1,30 @@ # logical device status check -template: adaptec_raid_ld_status - on: adaptec_raid.ld_status - lookup: max -10s foreach * - units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - info: logical device status is failed or degraded - to: sysadmin + template: adaptec_raid_ld_status + on: adaptec_raid.ld_status + class: System +component: RAID + type: Errors + lookup: max -10s foreach * + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: logical device status is failed or degraded + to: sysadmin # physical device state check -template: adaptec_raid_pd_state - on: adaptec_raid.pd_state - lookup: max -10s foreach * - 
units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - info: physical device state is not online - to: sysadmin + template: adaptec_raid_pd_state + on: adaptec_raid.pd_state + class: System +component: RAID + type: Errors + lookup: max -10s foreach * + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: physical device state is not online + to: sysadmin diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf index ddf8b704d8..4bac98fbbb 100644 --- a/health/health.d/am2320.conf +++ b/health/health.d/am2320.conf @@ -1,12 +1,15 @@ # make sure am2320 is sending stats -template: am2320_last_collected_secs - on: am2320.temperature - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster
\ No newline at end of file + template: am2320_last_collected_secs + on: am2320.temperature + class: Other +component: Sensors + type: Latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index c4c96eaf96..f27e39fc10 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -1,17 +1,23 @@ # raise a warning alarm if an anomaly probability is consistently above 50% -template: anomalies_anomaly_probabilities - on: anomalies.probability - lookup: average -2m foreach * - every: 1m - warn: $this > 50 - info: average anomaly probability over the last 2 minutes + template: anomalies_anomaly_probabilities + on: anomalies.probability + class: Netdata +component: ML + type: Errors + lookup: average -2m foreach * + every: 1m + warn: $this > 50 + info: average anomaly probability over the last 2 minutes # raise a warning alarm if an anomaly flag is consistently firing -template: anomalies_anomaly_flags - on: anomalies.anomaly - lookup: sum -2m foreach * - every: 1m - warn: $this > 10 - info: number of anomalies in the last 2 minutes + template: anomalies_anomaly_flags + on: anomalies.anomaly + class: Netdata +component: ML + type: Errors + lookup: sum -2m foreach * + every: 1m + warn: $this > 10 + info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf index 0c98b87783..c623fb8801 100644 --- a/health/health.d/apache.conf +++ b/health/health.d/apache.conf @@ -1,14 +1,17 @@ # make sure apache is running -template: apache_last_collected_secs - on: apache.requests - calc: $now - $last_collected_t - units: seconds ago 
- every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster + template: apache_last_collected_secs + on: apache.requests + class: Web Server +component: Apache + type: Latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 12384fac69..07b5c28c97 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -1,40 +1,49 @@ # you can disable an alarm notification by setting the 'to' line to: silent -template: apcupsd_10min_ups_load - on: apcupsd.load - os: * - hosts: * - lookup: average -10m unaligned of percentage - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 10m multiplier 1.5 max 1h - info: average UPS load over the last 10 minutes - to: sitemgr + template: apcupsd_10min_ups_load + on: apcupsd.load + class: Power Supply +component: UPS + type: Utilization + os: * + hosts: * + lookup: average -10m unaligned of percentage + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS load over the last 10 minutes + to: sitemgr # Discussion in https://github.com/netdata/netdata/pull/3928: # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. 
-template: apcupsd_ups_charge - on: apcupsd.charge - os: * - hosts: * - lookup: average -60s unaligned of charge - units: % - every: 60s - warn: $this < 100 - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 10m multiplier 1.5 max 1h - info: average UPS charge over the last minute - to: sitemgr + template: apcupsd_ups_charge + on: apcupsd.charge + class: Power Supply +component: UPS + type: Errors + os: * + hosts: * + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 100 + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS charge over the last minute + to: sitemgr -template: apcupsd_last_collected_secs - on: apcupsd.load - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sitemgr + template: apcupsd_last_collected_secs + on: apcupsd.load + class: Power Supply +component: UPS device + type: Latency + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 8089dc94e2..948ea551a0 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,33 +1,42 @@ # Alert that backen |