add alarms and dashboard info for Consul (#14163)

author: Ilya Mashchenko <ilya@netdata.cloud> 2022-12-23 15:10:19 +0200
committer: GitHub <noreply@github.com> 2022-12-23 15:10:19 +0200
commit: 2bca08132a436409b59803e0026034dd4ef823f2 (patch)
tree: 386c8009c337e3263c4b8327dedbde89e8140d3b /health
parent: 4409202f33cd6d20c9f6f2cdffa9d2112d6a437b (diff)
2 files changed, 147 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index f97ca40d37..044ea90766 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -36,6 +36,7 @@ dist_healthconfig_DATA = \
     health.d/cgroups.conf \
     health.d/cpu.conf \
     health.d/cockroachdb.conf \
+    health.d/consul.conf \
     health.d/disks.conf \
     health.d/dnsmasq_dhcp.conf \
     health.d/dns_query.conf \
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
new file mode 100644
index 0000000000..b591558f4c
--- /dev/null
+++ b/health/health.d/consul.conf
@@ -0,0 +1,146 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: consul_autopilot_health_status
+       on: consul.autopilot_health_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $unhealthy
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: datacenter $label:datacenter cluster is unhealthy as reported by server $label:node_name
+       to: sysadmin
+
+ template: consul_autopilot_server_health_status
+       on: consul.autopilot_server_health_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $unhealthy
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: server $label:node_name from datacenter $label:datacenter is unhealthy
+       to: sysadmin
+
+ template: consul_raft_leader_last_contact_time
+       on: consul.raft_leader_last_contact_time
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.5
+    every: 10s
+    units: milliseconds
+     warn: $this > (($status >= $WARNING)  ? (150) : (200))
+     crit: $this > (($status == $CRITICAL) ? (200) : (500))
+    delay: down 5m multiplier 1.5 max 1h
+     info: median time elapsed since leader server $label:node_name datacenter $label:datacenter was last able to contact the follower nodes
+       to: sysadmin
+
+ template: consul_raft_leadership_transitions
+       on: consul.raft_leadership_transitions_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: transitions
+     warn: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: there has been a leadership change and server $label:node_name datacenter $label:datacenter has become the leader
+       to: sysadmin
+
+ template: consul_raft_thread_main_saturation
+       on: consul.raft_thread_main_saturation_perc
+    class: Utilization
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.9
+    every: 10s
+    units: percentage
+     warn: $this > (($status >= $WARNING)  ? (40) : (50))
+    delay: down 5m multiplier 1.5 max 1h
+     info: average saturation of the main Raft goroutine on server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_raft_thread_fsm_saturation
+       on: consul.raft_thread_fsm_saturation_perc
+    class: Utilization
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.9
+    every: 10s
+    units: percentage
+     warn: $this > (($status >= $WARNING)  ? (40) : (50))
+    delay: down 5m multiplier 1.5 max 1h
+     info: average saturation of the FSM Raft goroutine on server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_client_rpc_requests_exceeded
+       on: consul.client_rpc_requests_exceeded_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: requests
+     warn: $this > (($status >= $WARNING)  ? (0) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of rate-limited RPC requests made by server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_client_rpc_requests_failed
+       on: consul.client_rpc_requests_failed_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: requests
+     warn: $this > (($status >= $WARNING)  ? (0) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of failed RPC requests made by server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_node_health_check_status
+       on: consul.node_health_check_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $warning + $critical
+    every: 10s
+    units: status
+     warn: $this != nan AND $this != 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: node health check $label:check_name has failed on server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_service_health_check_status
+       on: consul.service_health_check_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $warning + $critical
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: service health check $label:check_name for service $label:service_name has failed on server $label:node_name datacenter $label:datacenter
+       to: sysadmin
+
+ template: consul_gc_pause_time
+       on: consul.gc_pause_time
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: seconds
+     warn: $this > (($status >= $WARNING)  ? (1) : (2))
+     crit: $this > (($status == $CRITICAL) ? (2) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: time spent in stop-the-world garbage collection pauses on server $label:node_name datacenter $label:datacenter
+       to: sysadmin
author	Ilya Mashchenko <ilya@netdata.cloud>	2022-12-23 15:10:19 +0200
committer	GitHub <noreply@github.com>	2022-12-23 15:10:19 +0200
commit	2bca08132a436409b59803e0026034dd4ef823f2 (patch)
tree	386c8009c337e3263c4b8327dedbde89e8140d3b /health
parent	4409202f33cd6d20c9f6f2cdffa9d2112d6a437b (diff)