diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2019-09-25 12:50:25 +0300 |
---|---|---|
committer | Chris Akritidis <43294513+cakrit@users.noreply.github.com> | 2019-09-25 11:50:25 +0200 |
commit | 0aae17767ff03da1d89767420b40501c7ced1b01 (patch) | |
tree | c8258eacdbb496a202556befa024214053e7655d /health | |
parent | 65727b6a30b742ad870a4d61c9fce04116eb9e20 (diff) |
zookeeper and hdfs: alarms and dashboard_info (#6927)
* add zookeeper alarms
* add zookeeper to dashboard_info
* zookeeper alarm fix
* add hdfs alarms
* add hdfs to dashboard_info
* minor
* fix hdfs zk links: use latest version
* hdfs dashboard_info: change semicolon to comma
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 2 | ||||
-rw-r--r-- | health/health.d/hdfs.conf | 75 | ||||
-rw-r--r-- | health/health.d/zookeeper.conf | 14 |
3 files changed, 91 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index a682b594c1..a314b35165 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -44,6 +44,7 @@ dist_healthconfig_DATA = \
     health.d/fronius.conf \
     health.d/gearman.conf \
     health.d/haproxy.conf \
+    health.d/hdfs.conf \
     health.d/httpcheck.conf \
     health.d/ipc.conf \
     health.d/ipfs.conf \
@@ -91,5 +92,6 @@ dist_healthconfig_DATA = \
     health.d/wmi.conf \
     health.d/x509check.conf \
     health.d/zfs.conf \
+    health.d/zookeeper.conf \
     health.d/dbengine.conf \
     $(NULL)
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
new file mode 100644
index 0000000000..678faab4c0
--- /dev/null
+++ b/health/health.d/hdfs.conf
@@ -0,0 +1,75 @@
+
+# make sure hdfs is running
+
+template: hdfs_last_collected_secs
+      on: hdfs.heap_memory
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+
+# Common
+
+template: hdfs_capacity_usage
+      on: hdfs.capacity
+    calc: ($used) * 100 / ($used + $remaining)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: used capacity
+      to: sysadmin
+
+
+# NameNode
+
+template: hdfs_missing_blocks
+      on: hdfs.blocks
+    calc: $missing
+   units: missing blocks
+   every: 10s
+    warn: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: missing blocks
+      to: sysadmin
+
+
+template: hdfs_stale_nodes
+      on: hdfs.data_nodes
+    calc: $stale
+   units: stale nodes
+   every: 10s
+    warn: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: stale data nodes
+      to: sysadmin
+
+
+template: hdfs_dead_nodes
+      on: hdfs.data_nodes
+    calc: $dead
+   units: dead nodes
+   every: 10s
+    crit: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: dead data nodes
+      to: sysadmin
+
+
+# DataNode
+
+template: hdfs_num_failed_volumes
+      on: hdfs.num_failed_volumes
+    calc: $fsds_num_failed_volumes
+   units: failed volumes
+   every: 10s
+    warn: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: failed volumes
+      to: sysadmin
diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf
new file mode 100644
index 0000000000..ffbe31baf2
--- /dev/null
+++ b/health/health.d/zookeeper.conf
@@ -0,0 +1,14 @@
+
+# make sure zookeeper is running
+
+template: zookeeper_last_collected_secs
+      on: zookeeper.requests
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+