summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- health/Makefile.am 2
-rw-r--r-- health/health.d/hdfs.conf 75
-rw-r--r-- health/health.d/zookeeper.conf 14
-rw-r--r-- web/gui/dashboard_info.js 24
4 files changed, 115 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index a682b594c1..a314b35165 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -44,6 +44,7 @@ dist_healthconfig_DATA = \
health.d/fronius.conf \
health.d/gearman.conf \
health.d/haproxy.conf \
+ health.d/hdfs.conf \
health.d/httpcheck.conf \
health.d/ipc.conf \
health.d/ipfs.conf \
@@ -91,5 +92,6 @@ dist_healthconfig_DATA = \
health.d/wmi.conf \
health.d/x509check.conf \
health.d/zfs.conf \
+ health.d/zookeeper.conf \
health.d/dbengine.conf \
$(NULL)
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
new file mode 100644
index 0000000000..678faab4c0
--- /dev/null
+++ b/health/health.d/hdfs.conf
@@ -0,0 +1,75 @@
+
+# make sure hdfs is running
+
+template: hdfs_last_collected_secs
+ on: hdfs.heap_memory
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
+# Common
+
+template: hdfs_capacity_usage
+ on: hdfs.capacity
+ calc: ($used) * 100 / ($used + $remaining)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used capacity
+ to: sysadmin
+
+
+# NameNode
+
+template: hdfs_missing_blocks
+ on: hdfs.blocks
+ calc: $missing
+ units: missing blocks
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: missing blocks
+ to: sysadmin
+
+
+template: hdfs_stale_nodes
+ on: hdfs.data_nodes
+ calc: $stale
+ units: stale nodes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: stale data nodes
+ to: sysadmin
+
+
+template: hdfs_dead_nodes
+ on: hdfs.data_nodes
+ calc: $dead
+ units: dead nodes
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: dead data nodes
+ to: sysadmin
+
+
+# DataNode
+
+template: hdfs_num_failed_volumes
+ on: hdfs.num_failed_volumes
+ calc: $fsds_num_failed_volumes
+ units: failed volumes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: failed volumes
+ to: sysadmin
diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf
new file mode 100644
index 0000000000..ffbe31baf2
--- /dev/null
+++ b/health/health.d/zookeeper.conf
@@ -0,0 +1,14 @@
+
+# make sure zookeeper is running
+
+template: zookeeper_last_collected_secs
+ on: zookeeper.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js
index 3f65718164..c277bbdd3a 100644
--- a/web/gui/dashboard_info.js
+++ b/web/gui/dashboard_info.js
@@ -504,6 +504,18 @@ netdataDashboard.menu = {
title: 'VCSA',
icon: '<i class="fas fa-server"></i>',
info: 'vCenter Server Appliance health statistics. Data collected from <a href="https://vmware.github.io/vsphere-automation-sdk-rest/vsphere/index.html#SVC_com.vmware.appliance.health">Health API</a>.'
+ },
+
+ 'zookeeper': {
+ title: 'Zookeeper',
+ icon: '<i class="fas fa-database"></i>',
+ info: 'Provides health statistics for <b><a href="https://zookeeper.apache.org/">Zookeeper</a></b> server. Data collected through the command port using <code><a href="https://zookeeper.apache.org/doc/r3.5.5/zookeeperAdmin.html#sc_zkCommands">mntr</a></code> command.'
+ },
+
+ 'hdfs': {
+ title: 'HDFS',
+ icon: '<i class="fas fa-folder-open"></i>',
+ info: 'Provides <b><a href="https://hadoop.apache.org/docs/r3.2.0/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html">Hadoop Distributed File System</a></b> performance statistics. Module collects metrics over <code>Java Management Extensions</code> through the web interface of an <code>HDFS</code> daemon.'
}
};
@@ -2625,5 +2637,17 @@ netdataDashboard.context = {
'<code>2</code>: non-security updates are available; ' +
'<code>3</code>: security updates are available; ' +
'<code>4</code>: an error retrieving information on software updates.'
+ },
+
+ // ------------------------------------------------------------------------
+ // Zookeeper
+
+ 'zookeeper.server_state': {
+ info:
+ '<code>0</code>: unknown, ' +
+ '<code>1</code>: leader, ' +
+ '<code>2</code>: follower, ' +
+ '<code>3</code>: observer, ' +
+ '<code>4</code>: standalone.'
}
};