summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorIlya Mashchenko <ilya@netdata.cloud>2019-12-04 16:30:00 +0300
committerGitHub <noreply@github.com>2019-12-04 16:30:00 +0300
commit2d1b52948eea96127f8eee853d8db28b66f4a493 (patch)
tree1733294024d52d50967b869efcdc77ef15a64adb /health
parent14b12074fd61e4e5c00a8aac4ed01b8a6cb74bd0 (diff)
installer: include go.d.plugin version v0.12.0 (#7418)
* add unbound basic alarms * add scaleio basic alarms * update health Makefile.am * add scaleio to dashboard_info.js * packaging: set go.d.plugin version to 0.12.0 * packaging: update go.d.plugin checksums
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am2
-rw-r--r--health/health.d/scaleio.conf38
-rw-r--r--health/health.d/unbound.conf35
3 files changed, 75 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index f63faa8af0..dfaaf8a423 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -80,6 +80,7 @@ dist_healthconfig_DATA = \
health.d/redis.conf \
health.d/retroshare.conf \
health.d/riakkv.conf \
+ health.d/scaleio.conf \
health.d/softnet.conf \
health.d/squid.conf \
health.d/stiebeleltron.conf \
@@ -90,6 +91,7 @@ dist_healthconfig_DATA = \
health.d/tcp_orphans.conf \
health.d/tcp_resets.conf \
health.d/udp_errors.conf \
+ health.d/unbound.conf \
health.d/varnish.conf \
health.d/vcsa.conf \
health.d/vsphere.conf \
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
new file mode 100644
index 0000000000..1a3088a2a5
--- /dev/null
+++ b/health/health.d/scaleio.conf
@@ -0,0 +1,38 @@
+
+# make sure scaleio is running
+
+template: scaleio_last_collected_secs
+ on: scaleio.system_capacity_total
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure Storage Pool capacity utilization stays below the limit
+
+template: scaleio_storage_pool_capacity_utilization
+ on: scaleio.storage_pool_capacity_utilization
+ calc: $used
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: Storage Pool capacity utilization
+ to: sysadmin
+
+
+# make sure the SDC (ScaleIO Data Client) is connected to the MDM (Meta Data Manager)
+
+template: scaleio_sdc_mdm_connection_state
+ on: scaleio.sdc_mdm_connection_state
+ calc: $connected
+ every: 10s
+ warn: $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: Sdc connection to MDM state
+ to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
new file mode 100644
index 0000000000..bdedc11a08
--- /dev/null
+++ b/health/health.d/unbound.conf
@@ -0,0 +1,35 @@
+
+# make sure unbound is running
+
+template: unbound_last_collected_secs
+ on: unbound.queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure there are no overwritten/dropped queries in the request-list
+
+template: unbound_request_list_overwritten
+ on: unbound.request_list_jostle_list
+ lookup: average -60s unaligned absolute match-names of overwritten
+ units: queries
+ every: 10s
+ warn: $this > 5
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ info: the number of overwritten queries in the request-list
+ to: sysadmin
+
+template: unbound_request_list_dropped
+ on: unbound.request_list_jostle_list
+ lookup: average -60s unaligned absolute match-names of dropped
+ units: queries
+ every: 10s
+ warn: $this > 0
+ delay: up 10 down 5m multiplier 1.5 max 1h
+ info: the number of dropped queries in the request-list
+ to: sysadmin