From 0aae17767ff03da1d89767420b40501c7ced1b01 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Wed, 25 Sep 2019 12:50:25 +0300 Subject: zookeeper and hdfs: alarms and dashboard_info (#6927) * add zookeeper alarms * add zookeeper to dashboard_info * zookeeper alarm fix * add hdfs alarms * add hdfs to dashboard_info * minor * fix hdfs zk links: use latest version * hdfs dashboard_info: change semicolon to comma --- health/Makefile.am | 2 ++ health/health.d/hdfs.conf | 75 ++++++++++++++++++++++++++++++++++++++++++ health/health.d/zookeeper.conf | 14 ++++++++ web/gui/dashboard_info.js | 24 ++++++++++++++ 4 files changed, 115 insertions(+) create mode 100644 health/health.d/hdfs.conf create mode 100644 health/health.d/zookeeper.conf diff --git a/health/Makefile.am b/health/Makefile.am index a682b594c1..a314b35165 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -44,6 +44,7 @@ dist_healthconfig_DATA = \ health.d/fronius.conf \ health.d/gearman.conf \ health.d/haproxy.conf \ + health.d/hdfs.conf \ health.d/httpcheck.conf \ health.d/ipc.conf \ health.d/ipfs.conf \ @@ -91,5 +92,6 @@ dist_healthconfig_DATA = \ health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ + health.d/zookeeper.conf \ health.d/dbengine.conf \ $(NULL) diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf new file mode 100644 index 0000000000..678faab4c0 --- /dev/null +++ b/health/health.d/hdfs.conf @@ -0,0 +1,75 @@ + +# make sure hdfs is running + +template: hdfs_last_collected_secs + on: hdfs.heap_memory + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +# Common + +template: hdfs_capacity_usage + on: hdfs.capacity + calc: ($used) * 100 / ($used + $remaining) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used capacity + to: sysadmin + + +# NameNode + +template: hdfs_missing_blocks + on: hdfs.blocks + calc: $missing + units: missing blocks + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: missing blocks + to: sysadmin + + +template: hdfs_stale_nodes + on: hdfs.data_nodes + calc: $stale + units: stale nodes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: stale data nodes + to: sysadmin + + +template: hdfs_dead_nodes + on: hdfs.data_nodes + calc: $dead + units: dead nodes + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: dead data nodes + to: sysadmin + + +# DataNode + +template: hdfs_num_failed_volumes + on: hdfs.num_failed_volumes + calc: $fsds_num_failed_volumes + units: failed volumes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: failed volumes + to: sysadmin diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf new file mode 100644 index 0000000000..ffbe31baf2 --- /dev/null +++ b/health/health.d/zookeeper.conf @@ -0,0 +1,14 @@ + +# make sure zookeeper is running + +template: zookeeper_last_collected_secs + on: zookeeper.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js index 3f65718164..c277bbdd3a 100644 --- a/web/gui/dashboard_info.js +++ b/web/gui/dashboard_info.js @@ -504,6 +504,18 @@ netdataDashboard.menu = { title: 'VCSA', icon: '', info: 'vCenter Server Appliance health statistics. Data collected from Health API.' + }, + + 'zookeeper': { + title: 'Zookeeper', + icon: '', + info: 'Provides health statistics for Zookeeper server. Data collected through the command port using mntr command.' + }, + + 'hdfs': { + title: 'HDFS', + icon: '', + info: 'Provides Hadoop Distributed File System performance statistics. Module collects metrics over Java Management Extensions through the web interface of an HDFS daemon.' } }; @@ -2625,5 +2637,17 @@ netdataDashboard.context = { '2: non-security updates are available; ' + '3: security updates are available; ' + '4: an error retrieving information on software updates.' + }, + + // ------------------------------------------------------------------------ + // Zookeeper + + 'zookeeper.server_state': { + info: + '0: unknown, ' + + '1: leader, ' + + '2: follower, ' + + '3: observer, ' + + '4: standalone.' } }; -- cgit v1.2.3