diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2022-12-23 15:10:19 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-23 15:10:19 +0200 |
commit | 2bca08132a436409b59803e0026034dd4ef823f2 (patch) | |
tree | 386c8009c337e3263c4b8327dedbde89e8140d3b /health | |
parent | 4409202f33cd6d20c9f6f2cdffa9d2112d6a437b (diff) |
add alarms and dashboard info for Consul (#14163)
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 1 | ||||
-rw-r--r-- | health/health.d/consul.conf | 146 |
2 files changed, 147 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index f97ca40d37..044ea90766 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -36,6 +36,7 @@ dist_healthconfig_DATA = \ health.d/cgroups.conf \ health.d/cpu.conf \ health.d/cockroachdb.conf \ + health.d/consul.conf \ health.d/disks.conf \ health.d/dnsmasq_dhcp.conf \ health.d/dns_query.conf \ diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf new file mode 100644 index 0000000000..b591558f4c --- /dev/null +++ b/health/health.d/consul.conf @@ -0,0 +1,146 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: consul_autopilot_health_status + on: consul.autopilot_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: datacenter $label:datacenter cluster is unhealthy as reported by server $label:node_name + to: sysadmin + + template: consul_autopilot_server_health_status + on: consul.autopilot_server_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: server $label:node_name from datacenter $label:datacenter is unhealthy + to: sysadmin + + template: consul_raft_leader_last_contact_time + on: consul.raft_leader_last_contact_time + class: Errors + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.5 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (150) : (200)) + crit: $this > (($status == $CRITICAL) ? (200) : (500)) + delay: down 5m multiplier 1.5 max 1h + info: median time elapsed since leader server $label:node_name datacenter $label:datacenter was last able to contact the follower nodes + to: sysadmin + + template: consul_raft_leadership_transitions + on: consul.raft_leadership_transitions_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: percentage + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: there has been a leadership change and server $label:node_name datacenter $label:datacenter has become the leader + to: sysadmin + + template: consul_raft_thread_main_saturation + on: consul.raft_thread_main_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: percentage + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + info: average saturation of the main Raft goroutine on server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_raft_thread_fsm_saturation + on: consul.raft_thread_fsm_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + info: average saturation of the FSM Raft goroutine on server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_client_rpc_requests_exceeded + on: consul.client_rpc_requests_exceeded_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: number of rate-limited RPC requests made by server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_client_rpc_requests_failed + on: consul.client_rpc_requests_failed_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: number of failed RPC requests made by server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_node_health_check_status + on: consul.node_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 1h + info: node health check $label:check_name has failed on server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_service_health_check_status + on: consul.service_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: service health check $label:check_name for service $label:service_name has failed on server $label:node_name datacenter $label:datacenter + to: sysadmin + + template: consul_gc_pause_time + on: consul.gc_pause_time + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: seconds + warn: $this > (($status >= $WARNING) ? (1) : (2)) + crit: $this > (($status >= $WARNING) ? (2) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: time spent in stop-the-world garbage collection pauses on server $label:node_name datacenter $label:datacenter + to: sysadmin |