summary | refs | log | tree | commit | diff | stats
path: root/health
diff options
context:
space:
mode:
author Ilya Mashchenko <ilya@netdata.cloud> 2021-03-22 17:30:02 +0300
committer GitHub <noreply@github.com> 2021-03-22 17:30:02 +0300
commit 48e0dc53a25354f459294fd60b63307174b19aa2 (patch)
tree 9259ea0f79d717e3e3290736be65bf43f5e19f64 /health
parent ffb7f94544d7abe00fe1df0f03d8d319348eb567 (diff)
health: add collector prefix to the external collectors alarms/templates (#10830)
Diffstat (limited to 'health')
-rw-r--r-- health/health.d/adaptec_raid.conf 8
-rw-r--r-- health/health.d/anomalies.conf 4
-rw-r--r-- health/health.d/apcupsd.conf 4
-rw-r--r-- health/health.d/beanstalkd.conf 6
-rw-r--r-- health/health.d/ceph.conf 2
-rw-r--r-- health/health.d/fping.conf 7
-rw-r--r-- health/health.d/httpcheck.conf 26
-rw-r--r-- health/health.d/ioping.conf 2
-rw-r--r-- health/health.d/kubelet.conf 22
-rw-r--r-- health/health.d/megacli.conf 10
-rw-r--r-- health/health.d/memcached.conf 6
-rw-r--r-- health/health.d/portcheck.conf 6
-rw-r--r-- health/health.d/riakkv.conf 22
13 files changed, 62 insertions, 63 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index f93bde0c7e..0753c6e5db 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -1,8 +1,8 @@
# logical device status check
-template: adapter_raid_ld_status
- on: adapter_raid.ld_status
+template: adaptec_raid_ld_status
+ on: adaptec_raid.ld_status
lookup: max -10s foreach *
units: bool
every: 10s
@@ -13,8 +13,8 @@ template: adapter_raid_ld_status
# physical device state check
-template: adapter_raid_pd_state
- on: adapter_raid.pd_state
+template: adaptec_raid_pd_state
+ on: adaptec_raid.pd_state
lookup: max -10s foreach *
units: bool
every: 10s
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index a2d248efe7..8c7945c3cd 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -1,6 +1,6 @@
# raise a warning alarm if an anomaly probability is consistently above 50%
-template: anomaly_probabilities
+template: anomalies_anomaly_probabilities
on: anomalies.probability
lookup: average -2m foreach *
every: 1m
@@ -9,7 +9,7 @@ template: anomaly_probabilities
# raise a warning alarm if an anomaly flag is consistently firing
-template: anomaly_flags
+template: anomalies_anomaly_flags
on: anomalies.anomaly
lookup: sum -2m foreach *
every: 1m
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 4f86037ba5..4d0f050708 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -1,6 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
-template: 10min_ups_load
+template: apcupsd_10min_ups_load
on: apcupsd.load
os: *
hosts: *
@@ -15,7 +15,7 @@ template: 10min_ups_load
# Discussion in https://github.com/netdata/netdata/pull/3928:
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: ups_charge
+template: apcupsd_ups_charge
on: apcupsd.charge
os: *
hosts: *
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc27328e..af17358847 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -1,6 +1,6 @@
# get the number of buried jobs in all queues
-template: server_buried_jobs
+template: beanstalk_server_buried_jobs
on: beanstalk.current_jobs
calc: $buried
units: jobs
@@ -13,7 +13,7 @@ template: server_buried_jobs
# get the number of buried jobs per queue
-#template: tube_buried_jobs
+#template: beanstalk_tube_buried_jobs
# on: beanstalk.jobs
# calc: $buried
# units: jobs
@@ -26,7 +26,7 @@ template: server_buried_jobs
# get the current number of tubes
-#template: number_of_tubes
+#template: beanstalk_number_of_tubes
# on: beanstalk.current_tubes
# calc: $tubes
# every: 10s
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index e0a55a3e72..5cae0877c6 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -1,6 +1,6 @@
# low ceph disk available
-template: cluster_space_usage
+template: ceph_cluster_space_usage
on: ceph.general_usage
calc: $used * 100 / ($used + $avail)
units: %
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef6d..fa22514bac 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -11,7 +11,7 @@ families: *
info: number of seconds since the last successful data collection
to: sysadmin
-template: host_reachable
+template: fping_host_reachable
families: *
on: fping.latency
calc: $average != nan
@@ -22,7 +22,7 @@ families: *
delay: down 30m multiplier 1.5 max 2h
to: sysadmin
-template: host_latency
+template: fping_host_latency
families: *
on: fping.latency
lookup: average -10s unaligned of average
@@ -36,7 +36,7 @@ families: *
delay: down 30m multiplier 1.5 max 2h
to: sysadmin
-template: packet_loss
+template: fping_packet_loss
families: *
on: fping.quality
lookup: average -10m unaligned of returned
@@ -50,4 +50,3 @@ families: *
info: packet loss percentage
delay: down 30m multiplier 1.5 max 2h
to: sysadmin
-
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35eab9..e4eeb2ae80 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -11,7 +11,7 @@ families: *
to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
+template: httpcheck_web_service_up
families: *
on: httpcheck.status
lookup: average -1m unaligned percentage of success
@@ -21,7 +21,7 @@ families: *
info: at least 75% verified responses during last 60 seconds, ideal for badges
to: silent
-template: web_service_bad_content
+template: httpcheck_web_service_bad_content
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_content
@@ -34,7 +34,7 @@ families: *
options: no-clear-notification
to: webmaster
-template: web_service_bad_status
+template: httpcheck_web_service_bad_status
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_status
@@ -47,7 +47,7 @@ families: *
options: no-clear-notification
to: webmaster
-template: web_service_timeouts
+template: httpcheck_web_service_timeouts
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of timeout
@@ -55,7 +55,7 @@ families: *
units: %
info: average of timeouts during the last 5 minutes
-template: no_web_service_connections
+template: httpcheck_no_web_service_connections
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of no_connection
@@ -64,20 +64,20 @@ families: *
info: average of failed requests during the last 5 minutes
# combined timeout & no connection alarm
-template: web_service_unreachable
+template: httpcheck_web_service_unreachable
families: *
on: httpcheck.status
- calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+ calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
units: %
every: 10s
- warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
- crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+ warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+ crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of failed requests either due to timeouts or no connection during the last 5 minutes
options: no-clear-notification
to: webmaster
-template: 1h_web_service_response_time
+template: httpcheck_1h_web_service_response_time
families: *
on: httpcheck.responsetime
lookup: average -1h unaligned of time
@@ -85,14 +85,14 @@ families: *
units: ms
info: average response time over the last hour
-template: web_service_slow
+template: httpcheck_web_service_slow
families: *
on: httpcheck.responsetime
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_web_service_response_time * 2) )
- crit: ($this > ($1h_web_service_response_time * 3) )
+ warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+ crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
info: average response time over the last 3 minutes, compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
options: no-clear-notification
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 59a5c8edc3..f513748045 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,4 +1,4 @@
-template: disk_latency
+template: ioping_disk_latency
families: *
on: ioping.latency
lookup: average -10s unaligned of average
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index d2ef24b589..53e0aa4761 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -4,7 +4,7 @@
# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
- template: node_config_error
+ template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
calc: $kubelet_node_config_error
units: bool
@@ -16,7 +16,7 @@
# Failed Token() requests to the alternate token source
- template: token_requests
+ template: kubelet_token_requests
lookup: sum -10s of token_fail_count
on: k8s_kubelet.kubelet_token_requests
units: failed requests
@@ -53,17 +53,17 @@
# quantile 0.5
-template: 1m_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
info: the average value of pleg relisting latency during the last minute (quantile 0.5)
-template: 10s_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(100):(200))
@@ -74,17 +74,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_05
# quantile 0.9
-template: 1m_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
info: the average value of pleg relisting latency during the last minute (quantile 0.9)
-template: 10s_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(400))
@@ -95,17 +95,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_09
# quantile 0.99
-template: 1m_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
info: the average value of pleg relisting latency during the last minute (quantile 0.99)
-template: 10s_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(400):(800))
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 6e81a2a0e7..73f106530a 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,4 +1,4 @@
-template: adapter_state
+template: megacli_adapter_state
on: megacli.adapter_degraded
units: is degraded
lookup: sum -10s
@@ -7,7 +7,7 @@ template: adapter_state
info: adapter state
to: sysadmin
-template: bbu_relative_charge
+template: megacli_bbu_relative_charge
on: megacli.bbu_relative_charge
units: percent
lookup: average -10s
@@ -17,7 +17,7 @@ template: bbu_relative_charge
info: BBU relative state of charge
to: sysadmin
-template: bbu_cycle_count
+template: megacli_bbu_cycle_count
on: megacli.bbu_cycle_count
units: cycle count
lookup: average -10s
@@ -27,7 +27,7 @@ template: bbu_cycle_count
info: BBU cycle count
to: sysadmin
-template: pd_media_errors
+template: megacli_pd_media_errors
on: megacli.pd_media_error
units: media errors
lookup: sum -10s
@@ -37,7 +37,7 @@ template: pd_media_errors
info: physical drive media errors
to: sysadmin
-template: pd_predictive_failures
+template: megacli_pd_predictive_failures
on: megacli.pd_predictive_failure
units: predictive failures
lookup: sum -10s
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57a7..0eaa50f275 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -29,7 +29,7 @@ template: memcached_cache_memory_usage
# find the rate memcached cache is filling
-template: cache_fill_rate
+template: memcached_cache_fill_rate
on: memcached.cache
lookup: min -10m at -50m unaligned of available
calc: ($this - $available) / (($now - $after) / 3600)
@@ -40,9 +40,9 @@ template: cache_fill_rate
# find the hours remaining until memcached cache is full
-template: out_of_cache_space_time
+template: memcached_out_of_cache_space_time
on: memcached.cache
- calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+ calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
units: hours
every: 10s
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index 696333fd83..4aab711c64 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -11,7 +11,7 @@ families: *
to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
+template: portcheck_service_reachable
families: *
on: portcheck.status
lookup: average -1m unaligned percentage of success
@@ -21,7 +21,7 @@ families: *
info: at least 75% successful connections during last 60 seconds, ideal for badges
to: silent
-template: connection_timeouts
+template: portcheck_connection_timeouts
families: *
on: portcheck.status
lookup: average -5m unaligned percentage of timeout
@@ -33,7 +33,7 @@ families: *
info: average of timeouts during the last 5 minutes
to: sysadmin
-template: connection_fails
+template: portcheck_connection_fails
families: *
on: portcheck.status
lookup: average -5m unaligned percentage of no_connection,failed
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index 745302778d..c87123d889 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,5 +1,5 @@
# Ensure that Riak is running. template: riak_last_collected_secs
-template: riak_last_collected_secs
+template: riakkv_last_collected_secs
on: riak.kv.throughput
calc: $now - $last_collected_t
units: seconds ago
@@ -11,7 +11,7 @@ template: riak_last_collected_secs
to: dba
# Warn if a list keys operation is running.
-template: riak_list_keys_active
+template: riakkv_list_keys_active
on: riak.core.fsm_active
calc: $list_fsm_active
units: state machines
@@ -23,7 +23,7 @@ template: riak_list_keys_active
## Timing healthchecks
# KV GET
-template: 1h_kv_get_mean_latency
+template: riakkv_1h_kv_get_mean_latency
on: riak.kv.latency.get
calc: $node_get_fsm_time_mean
lookup: average -1h unaligned of time
@@ -31,20 +31,20 @@ template: 1h_kv_get_mean_latency
units: ms
info: mean average KV GET latency over the last hour
-template: riak_kv_get_slow
+template: riakkv_kv_get_slow
on: riak.kv.latency.get
calc: $mean
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_kv_get_mean_latency * 2) )
- crit: ($this > ($1h_kv_get_mean_latency * 3) )
+ warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
info: average KV GET time over the last 3 minutes, compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: dba
# KV PUT
-template: 1h_kv_put_mean_latency
+template: riakkv_1h_kv_put_mean_latency
on: riak.kv.latency.put
calc: $node_put_fsm_time_mean
lookup: average -1h unaligned of time
@@ -52,14 +52,14 @@ template: 1h_kv_put_mean_latency
units: ms
info: mean average KV PUT latency over the last hour
-template: riak_kv_put_slow
+template: riakkv_kv_put_slow
on: riak.kv.latency.put
calc: $mean
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_kv_put_mean_latency * 2) )
- crit: ($this > ($1h_kv_put_mean_latency * 3) )
+ warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: dba
@@ -69,7 +69,7 @@ template: riak_kv_put_slow
# Default Erlang VM process limit: 262144
# On systems observed, this is < 2000, but may grow depending on load.
-template: riak_vm_high_process_count
+template: riakkv_vm_high_process_count
on: riak.vm
calc: $sys_process_count
units: processes