diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2021-04-14 13:23:33 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-14 13:23:33 +0300 |
commit | 78b540bb75adc0298da08327a9aa95650540d2c5 (patch) | |
tree | 0f4697d1d8e0e70b74cc654ed946a9db96dec36a /health | |
parent | 563c6554efb48a7cfae28610aeecc60651ee3077 (diff) |
health: add systemdunits alarms (#10906)
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 1 | ||||
-rw-r--r-- | health/health.d/systemdunits.conf | 112 |
2 files changed, 113 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index 4b26d6ae64..b963ea0cd1 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -90,6 +90,7 @@ dist_healthconfig_DATA = \ health.d/stiebeleltron.conf \ health.d/synchronization.conf \ health.d/swap.conf \ + health.d/systemdunits.conf \ health.d/tcp_conn.conf \ health.d/tcp_listen.conf \ health.d/tcp_mem.conf \ diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf new file mode 100644 index 0000000000..b566d8e0e8 --- /dev/null +++ b/health/health.d/systemdunits.conf @@ -0,0 +1,112 @@ +## Check if there are any systemd units in the failed state (crashed). +## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed. + +## Service units +template: systemd_service_units_state + on: systemd.service_units_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd service units are in the failed state + to: sysadmin + +## Socket units +template: systemd_socket_units_state + on: systemd.socket_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd socket units are in the failed state + to: sysadmin + +## Target units +template: systemd_target_units_state + on: systemd.target_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd target units are in the failed state + to: sysadmin + +## Path units +template: systemd_path_units_state + on: systemd.path_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd path units are in the failed state + to: sysadmin + +## Device units +template: systemd_device_units_state + 
on: systemd.device_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd device units are in the failed state + to: sysadmin + +## Mount units +template: systemd_mount_units_state + on: systemd.mount_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd mount units are in the failed state + to: sysadmin + +## Automount units +template: systemd_automount_units_state + on: systemd.automount_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd automount units are in the failed state + to: sysadmin + +## Swap units +template: systemd_swap_units_state + on: systemd.swap_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd swap units are in the failed state + to: sysadmin + +## Scope units +template: systemd_scope_units_state + on: systemd.scope_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd scope units are in the failed state + to: sysadmin + +## Slice units +template: systemd_slice_units_state + on: systemd.slice_unit_state + lookup: max -1s min2max + units: ok/failed + every: 10s + warn: $this != nan AND $this == 5 + delay: down 5m multiplier 1.5 max 1h + info: one or more systemd slice units are in the failed state + to: sysadmin |