From 2555dccae77ebe4ac132c13feca610be1dd26857 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Mon, 24 May 2021 14:42:49 +0300 Subject: health: add python.d/go.d jobs last_collected_secs alarms (#11168) --- health/Makefile.am | 15 ++------------- health/health.d/am2320.conf | 15 --------------- health/health.d/apache.conf | 17 ----------------- health/health.d/cockroachdb.conf | 16 ---------------- health/health.d/couchdb.conf | 16 ---------------- health/health.d/gearman.conf | 14 -------------- health/health.d/go.d.plugin.conf | 17 +++++++++++++++++ health/health.d/hdfs.conf | 17 ----------------- health/health.d/httpcheck.conf | 14 -------------- health/health.d/lighttpd.conf | 17 ----------------- health/health.d/memcached.conf | 17 ----------------- health/health.d/mongodb.conf | 16 ---------------- health/health.d/mysql.conf | 18 ------------------ health/health.d/named.conf | 17 ----------------- health/health.d/nginx.conf | 17 ----------------- health/health.d/nginx_plus.conf | 17 ----------------- health/health.d/phpfpm.conf | 17 ----------------- health/health.d/pihole.conf | 16 ---------------- health/health.d/portcheck.conf | 14 -------------- health/health.d/postgres.conf | 16 ---------------- health/health.d/pulsar.conf | 16 ---------------- health/health.d/python.d.plugin.conf | 17 +++++++++++++++++ health/health.d/redis.conf | 16 ---------------- health/health.d/retroshare.conf | 15 --------------- health/health.d/riakkv.conf | 14 -------------- health/health.d/scaleio.conf | 16 ---------------- health/health.d/squid.conf | 17 ----------------- health/health.d/unbound.conf | 16 ---------------- health/health.d/vcsa.conf | 16 ---------------- health/health.d/vernemq.conf | 16 ---------------- health/health.d/web_log.conf | 35 ----------------------------------- health/health.d/whoisquery.conf | 17 ----------------- health/health.d/wmi.conf | 18 ------------------ health/health.d/x509check.conf | 17 ----------------- health/health.d/zookeeper.conf | 17 ----------------- 35 files changed, 36 insertions(+), 550 deletions(-) delete mode 100644 health/health.d/am2320.conf delete mode 100644 health/health.d/apache.conf delete mode 100644 health/health.d/couchdb.conf create mode 100644 health/health.d/go.d.plugin.conf delete mode 100644 health/health.d/lighttpd.conf delete mode 100644 health/health.d/mongodb.conf delete mode 100644 health/health.d/named.conf delete mode 100644 health/health.d/nginx.conf delete mode 100644 health/health.d/nginx_plus.conf delete mode 100644 health/health.d/phpfpm.conf delete mode 100644 health/health.d/postgres.conf delete mode 100644 health/health.d/pulsar.conf create mode 100644 health/health.d/python.d.plugin.conf delete mode 100644 health/health.d/squid.conf delete mode 100644 health/health.d/zookeeper.conf diff --git a/health/Makefile.am b/health/Makefile.am index b963ea0cd1..7b25c07cdd 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -25,9 +25,7 @@ install-exec-local: healthconfigdir=$(libconfigdir)/health.d dist_healthconfig_DATA = \ health.d/adaptec_raid.conf \ - health.d/am2320.conf \ health.d/anomalies.conf \ - health.d/apache.conf \ health.d/apcupsd.conf \ health.d/backend.conf \ health.d/bcache.conf \ @@ -39,7 +37,6 @@ dist_healthconfig_DATA = \ health.d/cgroups.conf \ health.d/cpu.conf \ health.d/cockroachdb.conf \ - health.d/couchdb.conf \ health.d/disks.conf \ health.d/dnsmasq_dhcp.conf \ health.d/dns_query.conf \ @@ -51,6 +48,7 @@ dist_healthconfig_DATA = \ health.d/ioping.conf \ health.d/fronius.conf \ health.d/gearman.conf \ + health.d/go.d.plugin.conf \ health.d/haproxy.conf \ health.d/hdfs.conf \ health.d/httpcheck.conf \ @@ -59,26 +57,19 @@ dist_healthconfig_DATA = \ health.d/ipmi.conf \ health.d/isc_dhcpd.conf \ health.d/kubelet.conf \ - health.d/lighttpd.conf \ health.d/linux_power_supply.conf \ health.d/load.conf \ health.d/mdstat.conf \ health.d/megacli.conf \ health.d/memcached.conf \ health.d/memory.conf \ - health.d/mongodb.conf \ health.d/mysql.conf \ - health.d/named.conf \ health.d/net.conf \ health.d/netfilter.conf \ - health.d/nginx.conf \ - health.d/nginx_plus.conf \ health.d/pihole.conf \ - health.d/phpfpm.conf \ health.d/portcheck.conf \ - health.d/postgres.conf \ health.d/processes.conf \ - health.d/pulsar.conf \ + health.d/python.d.plugin.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ @@ -86,7 +77,6 @@ dist_healthconfig_DATA = \ health.d/riakkv.conf \ health.d/scaleio.conf \ health.d/softnet.conf \ - health.d/squid.conf \ health.d/stiebeleltron.conf \ health.d/synchronization.conf \ health.d/swap.conf \ @@ -107,6 +97,5 @@ dist_healthconfig_DATA = \ health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ - health.d/zookeeper.conf \ health.d/dbengine.conf \ $(NULL) diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf deleted file mode 100644 index 4bac98fbbb..0000000000 --- a/health/health.d/am2320.conf +++ /dev/null @@ -1,15 +0,0 @@ -# make sure am2320 is sending stats - - template: am2320_last_collected_secs - on: am2320.temperature - class: Other -component: Sensors - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf deleted file mode 100644 index c623fb8801..0000000000 --- a/health/health.d/apache.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure apache is running - - template: apache_last_collected_secs - on: apache.requests - class: Web Server -component: Apache - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index dccd2b0644..2c913a2cf8 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -1,20 +1,4 @@ -# Availability - - template: cockroachdb_last_collected_secs - on: cockroachdb.live_nodes - class: Database -component: CockroachDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - # Capacity template: cockroachdb_used_storage_capacity diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf deleted file mode 100644 index c86c6b9887..0000000000 --- a/health/health.d/couchdb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure couchdb is running - - template: couchdb_last_collected_secs - on: couchdb.request_methods - class: Database -component: CouchDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index e2031bf2b9..289e5fbb58 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -1,17 +1,3 @@ -# make sure Gearman is running - template: gearman_last_collected_secs - on: gearman.total_jobs - class: Computing -component: Gearman - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin template: gearman_workers_queued on: gearman.single_job diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf new file mode 100644 index 0000000000..ecd79c208f --- /dev/null +++ b/health/health.d/go.d.plugin.conf @@ -0,0 +1,17 @@ + +# make sure go.d.plugin data collection job is running + + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Netdata +component: go.d.plugin + type: Error + module: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index bd8308bedc..c67bf11dae 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -1,21 +1,4 @@ -# make sure hdfs is running - - template: hdfs_last_collected_secs - on: hdfs.heap_memory - class: Storage -component: HDFS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # Common template: hdfs_capacity_usage diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index d4d6376a3f..39fc95a2c8 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -1,17 +1,3 @@ - template: httpcheck_last_collected_secs - families: * - on: httpcheck.status - class: Other -component: HTTP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: httpcheck_web_service_up diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf deleted file mode 100644 index 0f067549e7..0000000000 --- a/health/health.d/lighttpd.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure lighttpd is running - - template: lighttpd_last_collected_secs - on: lighttpd.requests - class: Web Server -component: Lighttpd - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index f4b734c381..1efad98a05 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -1,21 +1,4 @@ -# make sure memcached is running - - template: memcached_last_collected_secs - on: memcached.cache - class: KV Storage -component: Memcached - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - # detect if memcached cache is full template: memcached_cache_memory_usage diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf deleted file mode 100644 index 8c9bdeb6fe..0000000000 --- a/health/health.d/mongodb.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure mongodb is running - - template: mongodb_last_collected_secs - on: mongodb.read_operations - class: Database -component: MongoDB - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 91860c4a77..a30e1b3bc3 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -1,22 +1,4 @@ -# make sure mysql is running - - template: mysql_last_collected_secs - on: mysql.queries - class: Database -component: MySQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - - -# ----------------------------------------------------------------------------- # slow queries template: mysql_10s_slow_queries diff --git a/health/health.d/named.conf b/health/health.d/named.conf deleted file mode 100644 index 90266df167..0000000000 --- a/health/health.d/named.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure named is running - - template: named_last_collected_secs - on: named.global_queries - class: DNS -component: BIND - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: domainadmin - diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf deleted file mode 100644 index 30c738f47d..0000000000 --- a/health/health.d/nginx.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure nginx is running - - template: nginx_last_collected_secs - on: nginx.requests - class: Web Server -component: NGINX - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf deleted file mode 100644 index 5849a9e7ed..0000000000 --- a/health/health.d/nginx_plus.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure nginx_plus is running - - template: nginx_plus_last_collected_secs - on: nginx_plus.requests_total - class: Web Server -component: NGINX Plus - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf deleted file mode 100644 index fc073a9442..0000000000 --- a/health/health.d/phpfpm.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure phpfpm is running - - template: phpfpm_last_collected_secs - on: phpfpm.requests - class: Web Server -component: PHP-FPM - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 72622caed2..429ff17849 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -1,20 +1,4 @@ -# Make sure Pi-hole is responding. - - template: pihole_last_collected_secs - on: pihole.dns_queries_total - class: Ad Filtering -component: Pi-hole - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - # Blocked DNS queries. template: pihole_blocked_queries diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index b977dbb310..d864b7b82b 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -1,17 +1,3 @@ - template: portcheck_last_collected_secs - families: * - on: portcheck.status - class: Other -component: TCP endpoint - type: Latency - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges template: portcheck_service_reachable diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf deleted file mode 100644 index f908a802a1..0000000000 --- a/health/health.d/postgres.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# make sure postgres is running - - template: postgres_last_collected_secs - on: postgres.db_stat_transactions - class: Database -component: PostgreSQL - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba diff --git a/health/health.d/pulsar.conf b/health/health.d/pulsar.conf deleted file mode 100644 index 9903d4e381..0000000000 --- a/health/health.d/pulsar.conf +++ /dev/null @@ -1,16 +0,0 @@ - -# Availability - - template: pulsar_last_collected_secs - on: pulsar.broker_components - class: Messaging -component: Pulsar - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf new file mode 100644 index 0000000000..7a3ebe1d26 --- /dev/null +++ b/health/health.d/python.d.plugin.conf @@ -0,0 +1,17 @@ + +# make sure python.d.plugin data collection job is running + + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Netdata +component: python.d.plugin + type: Error + module: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index e8b2899427..d597f0be91 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,20 +1,4 @@ -# make sure redis is running - - template: redis_last_collected_secs - on: redis.operations - class: KV Storage -component: Redis - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba - template: redis_bgsave_broken families: * on: redis.bgsave_health diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index ca22e60dea..6b3ab9dc3a 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -1,18 +1,3 @@ -# make sure RetroShare is running - - template: retroshare_last_collected_secs - on: retroshare.peers - class: Data Sharing -component: Retroshare - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin # make sure the DHT is fine when active diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index b2c0e8d9c3..b390840843 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,17 +1,3 @@ -# Ensure that Riak is running. template: riak_last_collected_secs - template: riakkv_last_collected_secs - on: riak.kv.throughput - class: Database -component: Riak KV - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: dba # Warn if a list keys operation is running. template: riakkv_list_keys_active diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index 3c0dc1168f..894ee0d754 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -1,20 +1,4 @@ -# make sure scaleio is running - - template: scaleio_last_collected_secs - on: scaleio.system_capacity_total - class: Storage -component: ScaleIO - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure Storage Pool capacity utilization is under limit template: scaleio_storage_pool_capacity_utilization diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf deleted file mode 100644 index 5c3d176294..0000000000 --- a/health/health.d/squid.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure squid is running - - template: squid_last_collected_secs - on: squid.clients_requests - class: Web Proxy -component: Squid - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: proxyadmin - diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf index 1df15474f0..cd9ffa4313 100644 --- a/health/health.d/unbound.conf +++ b/health/health.d/unbound.conf @@ -1,20 +1,4 @@ -# make sure unbound is running - - template: unbound_last_collected_secs - on: unbound.queries - class: DNS -component: Unbound - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # make sure there is no overwritten/dropped queries in the request-list template: unbound_request_list_overwritten diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index 8538e488cb..42eb394c77 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -1,20 +1,4 @@ -# make sure vcsa is running and responding - - template: vcsa_last_collected_secs - on: vcsa.system_health - class: Virtual Machine -component: VMware vCenter - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Overall system health: # - 0: all components are healthy. # - 1: one or more components might become overloaded soon. diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 737147f38f..ce608e266d 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -1,20 +1,4 @@ -# Availability - - template: vernemq_last_collected_secs - on: vernemq.node_uptime - class: Messaging -component: VerneMQ - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # Socket errors template: vernemq_socket_errors diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index c1237fa06d..2cad70d040 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -1,22 +1,4 @@ -# make sure we can collect web log data - - template: last_collected_secs - on: web_log.response_codes - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - # ----------------------------------------------------------------------------- # high level response code alarms @@ -231,23 +213,6 @@ component: Web log # ---------------------------------------------------GO-VERSION--------------------------------------------------------- -# make sure we can collect web log data - - template: web_log_last_collected_secs - on: web_log.requests - class: Web Server -component: Web log - type: Latency - families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - # unmatched lines # the following alarms trigger only when there are enough data. diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf index c6d3a9de04..6666229463 100644 --- a/health/health.d/whoisquery.conf +++ b/health/health.d/whoisquery.conf @@ -1,21 +1,4 @@ -# make sure whoisquery is running - - template: whoisquery_last_collected_secs - on: whoisquery.time_until_expiration - class: Other -component: WHOIS - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: whoisquery_days_until_expiration on: whoisquery.time_until_expiration class: Other diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf index 6bd4e077f7..35a93c3938 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/wmi.conf @@ -1,22 +1,4 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -## Availability - - template: wmi_last_collected_secs - on: cpu.collector_duration - class: Windows -component: Availability - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - ## CPU template: wmi_10min_cpu_usage diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index 93c406b7a4..d97b694c86 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,21 +1,4 @@ -# make sure x509check is running - - template: x509check_last_collected_secs - on: x509check.time_until_expiration - class: Certificates -component: x509 certificates - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 60s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - - template: x509check_days_until_expiration on: x509check.time_until_expiration class: Certificates diff --git a/health/health.d/zookeeper.conf b/health/health.d/zookeeper.conf deleted file mode 100644 index 8c7d5a73d8..0000000000 --- a/health/health.d/zookeeper.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure zookeeper is running - - template: zookeeper_last_collected_secs - on: zookeeper.requests - class: KV Storage -component: ZooKeeper - type: Latency - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: webmaster - -- cgit v1.2.3