diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2021-03-22 17:30:02 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-22 17:30:02 +0300 |
commit | 48e0dc53a25354f459294fd60b63307174b19aa2 (patch) | |
tree | 9259ea0f79d717e3e3290736be65bf43f5e19f64 /health | |
parent | ffb7f94544d7abe00fe1df0f03d8d319348eb567 (diff) |
health: add collector prefix to the external collectors alarms/templates (#10830)
Diffstat (limited to 'health')
-rw-r--r-- | health/health.d/adaptec_raid.conf | 8 | ||||
-rw-r--r-- | health/health.d/anomalies.conf | 4 | ||||
-rw-r--r-- | health/health.d/apcupsd.conf | 4 | ||||
-rw-r--r-- | health/health.d/beanstalkd.conf | 6 | ||||
-rw-r--r-- | health/health.d/ceph.conf | 2 | ||||
-rw-r--r-- | health/health.d/fping.conf | 7 | ||||
-rw-r--r-- | health/health.d/httpcheck.conf | 26 | ||||
-rw-r--r-- | health/health.d/ioping.conf | 2 | ||||
-rw-r--r-- | health/health.d/kubelet.conf | 22 | ||||
-rw-r--r-- | health/health.d/megacli.conf | 10 | ||||
-rw-r--r-- | health/health.d/memcached.conf | 6 | ||||
-rw-r--r-- | health/health.d/portcheck.conf | 6 | ||||
-rw-r--r-- | health/health.d/riakkv.conf | 22 |
13 files changed, 62 insertions, 63 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index f93bde0c7e..0753c6e5db 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -1,8 +1,8 @@ # logical device status check -template: adapter_raid_ld_status - on: adapter_raid.ld_status +template: adaptec_raid_ld_status + on: adaptec_raid.ld_status lookup: max -10s foreach * units: bool every: 10s @@ -13,8 +13,8 @@ template: adapter_raid_ld_status # physical device state check -template: adapter_raid_pd_state - on: adapter_raid.pd_state +template: adaptec_raid_pd_state + on: adaptec_raid.pd_state lookup: max -10s foreach * units: bool every: 10s diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index a2d248efe7..8c7945c3cd 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -1,6 +1,6 @@ # raise a warning alarm if an anomaly probability is consistently above 50% -template: anomaly_probabilities +template: anomalies_anomaly_probabilities on: anomalies.probability lookup: average -2m foreach * every: 1m @@ -9,7 +9,7 @@ template: anomaly_probabilities # raise a warning alarm if an anomaly flag is consistently firing -template: anomaly_flags +template: anomalies_anomaly_flags on: anomalies.anomaly lookup: sum -2m foreach * every: 1m diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 4f86037ba5..4d0f050708 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -1,6 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -template: 10min_ups_load +template: apcupsd_10min_ups_load on: apcupsd.load os: * hosts: * @@ -15,7 +15,7 @@ template: 10min_ups_load # Discussion in https://github.com/netdata/netdata/pull/3928: # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. -template: ups_charge +template: apcupsd_ups_charge on: apcupsd.charge os: * hosts: * diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 30dc27328e..af17358847 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -1,6 +1,6 @@ # get the number of buried jobs in all queues -template: server_buried_jobs +template: beanstalk_server_buried_jobs on: beanstalk.current_jobs calc: $buried units: jobs @@ -13,7 +13,7 @@ template: server_buried_jobs # get the number of buried jobs per queue -#template: tube_buried_jobs +#template: beanstalk_tube_buried_jobs # on: beanstalk.jobs # calc: $buried # units: jobs @@ -26,7 +26,7 @@ template: server_buried_jobs # get the current number of tubes -#template: number_of_tubes +#template: beanstalk_number_of_tubes # on: beanstalk.current_tubes # calc: $tubes # every: 10s diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index e0a55a3e72..5cae0877c6 100644 --- a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ -1,6 +1,6 @@ # low ceph disk available -template: cluster_space_usage +template: ceph_cluster_space_usage on: ceph.general_usage calc: $used * 100 / ($used + $avail) units: % diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 43658fef6d..fa22514bac 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -11,7 +11,7 @@ families: * info: number of seconds since the last successful data collection to: sysadmin -template: host_reachable +template: fping_host_reachable families: * on: fping.latency calc: $average != nan @@ -22,7 +22,7 @@ families: * delay: down 30m multiplier 1.5 max 2h to: sysadmin -template: host_latency +template: fping_host_latency families: * on: fping.latency lookup: average -10s unaligned of average @@ -36,7 +36,7 @@ families: * delay: down 30m multiplier 1.5 max 2h to: sysadmin -template: packet_loss +template: fping_packet_loss families: * on: fping.quality lookup: average -10m unaligned of returned @@ -50,4 +50,3 @@ families: * info: packet loss percentage delay: down 30m multiplier 1.5 max 2h to: sysadmin - diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 0ddf35eab9..e4eeb2ae80 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -11,7 +11,7 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: web_service_up +template: httpcheck_web_service_up families: * on: httpcheck.status lookup: average -1m unaligned percentage of success @@ -21,7 +21,7 @@ families: * info: at least 75% verified responses during last 60 seconds, ideal for badges to: silent -template: web_service_bad_content +template: httpcheck_web_service_bad_content families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_content @@ -34,7 +34,7 @@ families: * options: no-clear-notification to: webmaster -template: web_service_bad_status +template: httpcheck_web_service_bad_status families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_status @@ -47,7 +47,7 @@ families: * options: no-clear-notification to: webmaster -template: web_service_timeouts +template: httpcheck_web_service_timeouts families: * on: httpcheck.status lookup: average -5m unaligned percentage of timeout @@ -55,7 +55,7 @@ families: * units: % info: average of timeouts during the last 5 minutes -template: no_web_service_connections +template: httpcheck_no_web_service_connections families: * on: httpcheck.status lookup: average -5m unaligned percentage of no_connection @@ -64,20 +64,20 @@ families: * info: average of failed requests during the last 5 minutes # combined timeout & no connection alarm -template: web_service_unreachable +template: httpcheck_web_service_unreachable families: * on: httpcheck.status - calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts) + calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s - warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40) - crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40 + warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) + crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 delay: down 5m multiplier 1.5 max 1h info: average of failed requests either due to timeouts or no connection during the last 5 minutes options: no-clear-notification to: webmaster -template: 1h_web_service_response_time +template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime lookup: average -1h unaligned of time @@ -85,14 +85,14 @@ families: * units: ms info: average response time over the last hour -template: web_service_slow +template: httpcheck_web_service_slow families: * on: httpcheck.responsetime lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_web_service_response_time * 2) ) - crit: ($this > ($1h_web_service_response_time * 3) ) + warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) + crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) info: average response time over the last 3 minutes, compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h options: no-clear-notification diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 59a5c8edc3..f513748045 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,4 +1,4 @@ -template: disk_latency +template: ioping_disk_latency families: * on: ioping.latency lookup: average -10s unaligned of average diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index d2ef24b589..53e0aa4761 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -4,7 +4,7 @@ # True (1) if the node is experiencing a configuration-related error, false (0) otherwise. - template: node_config_error + template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error calc: $kubelet_node_config_error units: bool @@ -16,7 +16,7 @@ # Failed Token() requests to the alternate token source - template: token_requests + template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests units: failed requests @@ -53,17 +53,17 @@ # quantile 0.5 -template: 1m_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s info: the average value of pleg relisting latency during the last minute (quantile 0.5) -template: 10s_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s units: % warn: $this > (($status >= $WARNING)?(100):(200)) @@ -74,17 +74,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_05 # quantile 0.9 -template: 1m_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s info: the average value of pleg relisting latency during the last minute (quantile 0.9) -template: 10s_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(400)) @@ -95,17 +95,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_09 # quantile 0.99 -template: 1m_kubelet_pleg_relist_latency_quantile_099 +template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s info: the average value of pleg relisting latency during the last minute (quantile 0.99) -template: 10s_kubelet_pleg_relist_latency_quantile_099 +template: kubelet_10s_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s units: % warn: $this > (($status >= $WARNING)?(400):(800)) diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 6e81a2a0e7..73f106530a 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,4 +1,4 @@ -template: adapter_state +template: megacli_adapter_state on: megacli.adapter_degraded units: is degraded lookup: sum -10s @@ -7,7 +7,7 @@ template: adapter_state info: adapter state to: sysadmin -template: bbu_relative_charge +template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge units: percent lookup: average -10s @@ -17,7 +17,7 @@ template: bbu_relative_charge info: BBU relative state of charge to: sysadmin -template: bbu_cycle_count +template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count units: cycle count lookup: average -10s @@ -27,7 +27,7 @@ template: bbu_cycle_count info: BBU cycle count to: sysadmin -template: pd_media_errors +template: megacli_pd_media_errors on: megacli.pd_media_error units: media errors lookup: sum -10s @@ -37,7 +37,7 @@ template: pd_media_errors info: physical drive media errors to: sysadmin -template: pd_predictive_failures +template: megacli_pd_predictive_failures on: megacli.pd_predictive_failure units: predictive failures lookup: sum -10s diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index d248ef57a7..0eaa50f275 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -29,7 +29,7 @@ template: memcached_cache_memory_usage # find the rate memcached cache is filling -template: cache_fill_rate +template: memcached_cache_fill_rate on: memcached.cache lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) @@ -40,9 +40,9 @@ template: cache_fill_rate # find the hours remaining until memcached cache is full -template: out_of_cache_space_time +template: memcached_out_of_cache_space_time on: memcached.cache - calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) + calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index 696333fd83..4aab711c64 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -11,7 +11,7 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: service_reachable +template: portcheck_service_reachable families: * on: portcheck.status lookup: average -1m unaligned percentage of success @@ -21,7 +21,7 @@ families: * info: at least 75% successful connections during last 60 seconds, ideal for badges to: silent -template: connection_timeouts +template: portcheck_connection_timeouts families: * on: portcheck.status lookup: average -5m unaligned percentage of timeout @@ -33,7 +33,7 @@ families: * info: average of timeouts during the last 5 minutes to: sysadmin -template: connection_fails +template: portcheck_connection_fails families: * on: portcheck.status lookup: average -5m unaligned percentage of no_connection,failed diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index 745302778d..c87123d889 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,5 +1,5 @@ # Ensure that Riak is running. template: riak_last_collected_secs -template: riak_last_collected_secs +template: riakkv_last_collected_secs on: riak.kv.throughput calc: $now - $last_collected_t units: seconds ago @@ -11,7 +11,7 @@ template: riak_last_collected_secs to: dba # Warn if a list keys operation is running. -template: riak_list_keys_active +template: riakkv_list_keys_active on: riak.core.fsm_active calc: $list_fsm_active units: state machines @@ -23,7 +23,7 @@ template: riak_list_keys_active ## Timing healthchecks # KV GET -template: 1h_kv_get_mean_latency +template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time @@ -31,20 +31,20 @@ template: 1h_kv_get_mean_latency units: ms info: mean average KV GET latency over the last hour -template: riak_kv_get_slow +template: riakkv_kv_get_slow on: riak.kv.latency.get calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_get_mean_latency * 2) ) - crit: ($this > ($1h_kv_get_mean_latency * 3) ) + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) info: average KV GET time over the last 3 minutes, compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba # KV PUT -template: 1h_kv_put_mean_latency +template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time @@ -52,14 +52,14 @@ template: 1h_kv_put_mean_latency units: ms info: mean average KV PUT latency over the last hour -template: riak_kv_put_slow +template: riakkv_kv_put_slow on: riak.kv.latency.put calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_put_mean_latency * 2) ) - crit: ($this > ($1h_kv_put_mean_latency * 3) ) + warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) info: average KV PUT time over the last 3 minutes, compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba @@ -69,7 +69,7 @@ template: riak_kv_put_slow # Default Erlang VM process limit: 262144 # On systems observed, this is < 2000, but may grow depending on load. -template: riak_vm_high_process_count +template: riakkv_vm_high_process_count on: riak.vm calc: $sys_process_count units: processes |