health: add collector prefix to external collectors' alarms/templates (#10830)

author: Ilya Mashchenko <ilya@netdata.cloud> 2021-03-22 17:30:02 +0300
committer: GitHub <noreply@github.com> 2021-03-22 17:30:02 +0300
commit: 48e0dc53a25354f459294fd60b63307174b19aa2 (patch)
tree: 9259ea0f79d717e3e3290736be65bf43f5e19f64 /health
parent: ffb7f94544d7abe00fe1df0f03d8d319348eb567 (diff)
13 files changed, 62 insertions, 63 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index f93bde0c7e..0753c6e5db 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -1,8 +1,8 @@
 
 # logical device status check
 
-template: adapter_raid_ld_status
-      on: adapter_raid.ld_status
+template: adaptec_raid_ld_status
+      on: adaptec_raid.ld_status
   lookup: max -10s foreach *
    units: bool
    every: 10s
@@ -13,8 +13,8 @@ template: adapter_raid_ld_status
 
 # physical device state check
 
-template: adapter_raid_pd_state
-      on: adapter_raid.pd_state
+template: adaptec_raid_pd_state
+      on: adaptec_raid.pd_state
   lookup: max -10s foreach *
    units: bool
    every: 10s
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index a2d248efe7..8c7945c3cd 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -1,6 +1,6 @@
 # raise a warning alarm if an anomaly probability is consistently above 50%
 
-template: anomaly_probabilities
+template: anomalies_anomaly_probabilities
       on: anomalies.probability
   lookup: average -2m foreach *
    every: 1m
@@ -9,7 +9,7 @@ template: anomaly_probabilities
 
 # raise a warning alarm if an anomaly flag is consistently firing
 
-template: anomaly_flags
+template: anomalies_anomaly_flags
       on: anomalies.anomaly
   lookup: sum -2m foreach *
    every: 1m
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 4f86037ba5..4d0f050708 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -1,6 +1,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-template: 10min_ups_load
+template: apcupsd_10min_ups_load
       on: apcupsd.load
       os: *
    hosts: *
@@ -15,7 +15,7 @@ template: 10min_ups_load
 
 # Discussion in https://github.com/netdata/netdata/pull/3928:
 # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: ups_charge
+template: apcupsd_ups_charge
       on: apcupsd.charge
       os: *
    hosts: *
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc27328e..af17358847 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -1,6 +1,6 @@
 # get the number of buried jobs in all queues
 
-template: server_buried_jobs
+template: beanstalk_server_buried_jobs
       on: beanstalk.current_jobs
     calc: $buried
    units: jobs
@@ -13,7 +13,7 @@ template: server_buried_jobs
       
 # get the number of buried jobs per queue
 
-#template: tube_buried_jobs
+#template: beanstalk_tube_buried_jobs
 #      on: beanstalk.jobs
 #    calc: $buried
 #   units: jobs
@@ -26,7 +26,7 @@ template: server_buried_jobs
 
 # get the current number of tubes
 
-#template: number_of_tubes
+#template: beanstalk_number_of_tubes
 #      on: beanstalk.current_tubes
 #    calc: $tubes
 #   every: 10s
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index e0a55a3e72..5cae0877c6 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -1,6 +1,6 @@
 # low ceph disk available
 
-template: cluster_space_usage
+template: ceph_cluster_space_usage
       on: ceph.general_usage
     calc: $used * 100 / ($used + $avail)
    units: %
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef6d..fa22514bac 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -11,7 +11,7 @@ families: *
     info: number of seconds since the last successful data collection
       to: sysadmin
 
-template: host_reachable
+template: fping_host_reachable
 families: *
       on: fping.latency
     calc: $average != nan
@@ -22,7 +22,7 @@ families: *
    delay: down 30m multiplier 1.5 max 2h
       to: sysadmin
 
-template: host_latency
+template: fping_host_latency
 families: *
       on: fping.latency
   lookup: average -10s unaligned of average
@@ -36,7 +36,7 @@ families: *
    delay: down 30m multiplier 1.5 max 2h
       to: sysadmin
 
-template: packet_loss
+template: fping_packet_loss
 families: *
       on: fping.quality
   lookup: average -10m unaligned of returned
@@ -50,4 +50,3 @@ families: *
     info: packet loss percentage
    delay: down 30m multiplier 1.5 max 2h
       to: sysadmin
-
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35eab9..e4eeb2ae80 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -11,7 +11,7 @@ families: *
       to: sysadmin
 
 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
+template: httpcheck_web_service_up
 families: *
       on: httpcheck.status
   lookup: average -1m unaligned percentage of success
@@ -21,7 +21,7 @@ families: *
     info: at least 75% verified responses during last 60 seconds, ideal for badges
       to: silent
 
-template: web_service_bad_content
+template: httpcheck_web_service_bad_content
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of bad_content
@@ -34,7 +34,7 @@ families: *
  options: no-clear-notification
       to: webmaster
 
-template: web_service_bad_status
+template: httpcheck_web_service_bad_status
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of bad_status
@@ -47,7 +47,7 @@ families: *
  options: no-clear-notification
       to: webmaster
 
-template: web_service_timeouts
+template: httpcheck_web_service_timeouts
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of timeout
@@ -55,7 +55,7 @@ families: *
    units: %
     info: average of timeouts during the last 5 minutes
 
-template: no_web_service_connections
+template: httpcheck_no_web_service_connections
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of no_connection
@@ -64,20 +64,20 @@ families: *
     info: average of failed requests during the last 5 minutes
 
 # combined timeout & no connection alarm
-template: web_service_unreachable
+template: httpcheck_web_service_unreachable
 families: *
       on: httpcheck.status
-    calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+    calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
    units: %
    every: 10s
-    warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
-    crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+    warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+    crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
    delay: down 5m multiplier 1.5 max 1h
     info: average of failed requests either due to timeouts or no connection during the last 5 minutes
  options: no-clear-notification
       to: webmaster
 
-template: 1h_web_service_response_time
+template: httpcheck_1h_web_service_response_time
 families: *
       on: httpcheck.responsetime
   lookup: average -1h unaligned of time
@@ -85,14 +85,14 @@ families: *
    units: ms
     info: average response time over the last hour
 
-template: web_service_slow
+template: httpcheck_web_service_slow
 families: *
       on: httpcheck.responsetime
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_web_service_response_time * 2) )
-    crit: ($this > ($1h_web_service_response_time * 3) )
+    warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+    crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
     info: average response time over the last 3 minutes, compared to the average over the last hour
    delay: down 5m multiplier 1.5 max 1h
  options: no-clear-notification
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 59a5c8edc3..f513748045 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,4 +1,4 @@
-template: disk_latency
+template: ioping_disk_latency
 families: *
       on: ioping.latency
   lookup: average -10s unaligned of average
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index d2ef24b589..53e0aa4761 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -4,7 +4,7 @@
 
 # True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
 
-   template: node_config_error
+   template: kubelet_node_config_error
          on: k8s_kubelet.kubelet_node_config_error
        calc: $kubelet_node_config_error
       units: bool
@@ -16,7 +16,7 @@
 
 # Failed Token() requests to the alternate token source
 
-   template: token_requests
+   template: kubelet_token_requests
      lookup: sum -10s of token_fail_count
          on: k8s_kubelet.kubelet_token_requests
       units: failed requests
@@ -53,17 +53,17 @@
 
 # quantile 0.5
 
-template: 1m_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_1m_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
    units: microseconds
    every: 10s
     info: the average value of pleg relisting latency during the last minute (quantile 0.5)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_10s_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(100):(200))
@@ -74,17 +74,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_05
 
 # quantile 0.9
 
-template: 1m_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_1m_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
    units: microseconds
    every: 10s
     info: the average value of pleg relisting latency during the last minute (quantile 0.9)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_10s_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(200):(400))
@@ -95,17 +95,17 @@ template: 10s_kubelet_pleg_relist_latency_quantile_09
 
 # quantile 0.99
 
-template: 1m_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_1m_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
    units: microseconds
    every: 10s
     info: the average value of pleg relisting latency during the last minute (quantile 0.99)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_10s_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(400):(800))
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 6e81a2a0e7..73f106530a 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,4 +1,4 @@
-template: adapter_state
+template: megacli_adapter_state
       on: megacli.adapter_degraded
    units: is degraded
   lookup: sum -10s
@@ -7,7 +7,7 @@ template: adapter_state
     info: adapter state
       to: sysadmin
 
-template: bbu_relative_charge
+template: megacli_bbu_relative_charge
       on: megacli.bbu_relative_charge
    units: percent
   lookup: average -10s
@@ -17,7 +17,7 @@ template: bbu_relative_charge
     info: BBU relative state of charge
       to: sysadmin
 
-template: bbu_cycle_count
+template: megacli_bbu_cycle_count
       on: megacli.bbu_cycle_count
    units: cycle count
   lookup: average -10s
@@ -27,7 +27,7 @@ template: bbu_cycle_count
     info: BBU cycle count
       to: sysadmin
 
-template: pd_media_errors
+template: megacli_pd_media_errors
       on: megacli.pd_media_error
    units: media errors
   lookup: sum -10s
@@ -37,7 +37,7 @@ template: pd_media_errors
     info: physical drive media errors
       to: sysadmin
 
-template: pd_predictive_failures
+template: megacli_pd_predictive_failures
       on: megacli.pd_predictive_failure
    units: predictive failures
   lookup: sum -10s
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57a7..0eaa50f275 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -29,7 +29,7 @@ template: memcached_cache_memory_usage
 
 # find the rate memcached cache is filling
 
-template: cache_fill_rate
+template: memcached_cache_fill_rate
       on: memcached.cache
   lookup: min -10m at -50m unaligned of available
     calc: ($this - $available) / (($now - $after) / 3600)
@@ -40,9 +40,9 @@ template: cache_fill_rate
 
 # find the hours remaining until memcached cache is full
 
-template: out_of_cache_space_time
+template: memcached_out_of_cache_space_time
       on: memcached.cache
-    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+    calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index 696333fd83..4aab711c64 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -11,7 +11,7 @@ families: *
       to: sysadmin
 
 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
+template: portcheck_service_reachable
 families: *
       on: portcheck.status
   lookup: average -1m unaligned percentage of success
@@ -21,7 +21,7 @@ families: *
     info: at least 75% successful connections during last 60 seconds, ideal for badges
       to: silent
 
-template: connection_timeouts
+template: portcheck_connection_timeouts
 families: *
       on: portcheck.status
   lookup: average -5m unaligned percentage of timeout
@@ -33,7 +33,7 @@ families: *
     info: average of timeouts during the last 5 minutes
       to: sysadmin
 
-template: connection_fails
+template: portcheck_connection_fails
 families: *
       on: portcheck.status
   lookup: average -5m unaligned percentage of no_connection,failed
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index 745302778d..c87123d889 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,5 +1,5 @@
 # Ensure that Riak is running.  template: riak_last_collected_secs
-template: riak_last_collected_secs
+template: riakkv_last_collected_secs
       on: riak.kv.throughput
     calc: $now - $last_collected_t
    units: seconds ago
@@ -11,7 +11,7 @@ template: riak_last_collected_secs
       to: dba
 
 # Warn if a list keys operation is running.
-template: riak_list_keys_active
+template: riakkv_list_keys_active
       on: riak.core.fsm_active
     calc: $list_fsm_active
    units: state machines
@@ -23,7 +23,7 @@ template: riak_list_keys_active
 
 ## Timing healthchecks
 # KV GET
-template: 1h_kv_get_mean_latency
+template: riakkv_1h_kv_get_mean_latency
       on: riak.kv.latency.get
     calc: $node_get_fsm_time_mean
   lookup: average -1h unaligned of time
@@ -31,20 +31,20 @@ template: 1h_kv_get_mean_latency
    units: ms
     info: mean average KV GET latency over the last hour
 
-template: riak_kv_get_slow
+template: riakkv_kv_get_slow
       on: riak.kv.latency.get
     calc: $mean
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_kv_get_mean_latency * 2) )
-    crit: ($this > ($1h_kv_get_mean_latency * 3) )
+    warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+    crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
     info: average KV GET time over the last 3 minutes, compared to the average over the last hour
    delay: down 5m multiplier 1.5 max 1h
       to: dba
 
 # KV PUT
-template: 1h_kv_put_mean_latency
+template: riakkv_1h_kv_put_mean_latency
       on: riak.kv.latency.put
     calc: $node_put_fsm_time_mean
   lookup: average -1h unaligned of time
@@ -52,14 +52,14 @@ template: 1h_kv_put_mean_latency
    units: ms
     info: mean average KV PUT latency over the last hour
 
-template: riak_kv_put_slow
+template: riakkv_kv_put_slow
       on: riak.kv.latency.put
     calc: $mean
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_kv_put_mean_latency * 2) )
-    crit: ($this > ($1h_kv_put_mean_latency * 3) )
+    warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+    crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
     info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
    delay: down 5m multiplier 1.5 max 1h
       to: dba
@@ -69,7 +69,7 @@ template: riak_kv_put_slow
 
 # Default Erlang VM process limit: 262144
 # On systems observed, this is < 2000, but may grow depending on load.
-template: riak_vm_high_process_count
+template: riakkv_vm_high_process_count
       on: riak.vm
     calc: $sys_process_count
    units: processes
author	Ilya Mashchenko <ilya@netdata.cloud>	2021-03-22 17:30:02 +0300
committer	GitHub <noreply@github.com>	2021-03-22 17:30:02 +0300
commit	48e0dc53a25354f459294fd60b63307174b19aa2 (patch)
tree	9259ea0f79d717e3e3290736be65bf43f5e19f64 /health
parent	ffb7f94544d7abe00fe1df0f03d8d319348eb567 (diff)