summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud> 2021-05-24 14:42:49 +0300
committer: GitHub <noreply@github.com> 2021-05-24 14:42:49 +0300
commit: 2555dccae77ebe4ac132c13feca610be1dd26857 (patch)
tree: 1b8cb6ac8c4a4b19a1472258296b6421db1a6317
parent: dfc4915ffb73b5b12ce50bcf6c1981afa9928f0a (diff)
health: add python.d/go.d jobs last_collected_secs alarms (#11168)
-rw-r--r-- health/Makefile.am | 15
-rw-r--r-- health/health.d/am2320.conf | 15
-rw-r--r-- health/health.d/cockroachdb.conf | 16
-rw-r--r-- health/health.d/couchdb.conf | 16
-rw-r--r-- health/health.d/gearman.conf | 14
-rw-r--r-- health/health.d/go.d.plugin.conf (renamed from health/health.d/nginx_plus.conf) | 14
-rw-r--r-- health/health.d/hdfs.conf | 17
-rw-r--r-- health/health.d/httpcheck.conf | 14
-rw-r--r-- health/health.d/lighttpd.conf | 17
-rw-r--r-- health/health.d/memcached.conf | 17
-rw-r--r-- health/health.d/mongodb.conf | 16
-rw-r--r-- health/health.d/mysql.conf | 18
-rw-r--r-- health/health.d/named.conf | 17
-rw-r--r-- health/health.d/nginx.conf | 17
-rw-r--r-- health/health.d/phpfpm.conf | 17
-rw-r--r-- health/health.d/pihole.conf | 16
-rw-r--r-- health/health.d/portcheck.conf | 14
-rw-r--r-- health/health.d/postgres.conf | 16
-rw-r--r-- health/health.d/pulsar.conf | 16
-rw-r--r-- health/health.d/python.d.plugin.conf (renamed from health/health.d/apache.conf) | 14
-rw-r--r-- health/health.d/redis.conf | 16
-rw-r--r-- health/health.d/retroshare.conf | 15
-rw-r--r-- health/health.d/riakkv.conf | 14
-rw-r--r-- health/health.d/scaleio.conf | 16
-rw-r--r-- health/health.d/squid.conf | 17
-rw-r--r-- health/health.d/unbound.conf | 16
-rw-r--r-- health/health.d/vcsa.conf | 16
-rw-r--r-- health/health.d/vernemq.conf | 16
-rw-r--r-- health/health.d/web_log.conf | 35
-rw-r--r-- health/health.d/whoisquery.conf | 17
-rw-r--r-- health/health.d/wmi.conf | 18
-rw-r--r-- health/health.d/x509check.conf | 17
-rw-r--r-- health/health.d/zookeeper.conf | 17
33 files changed, 16 insertions, 530 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index b963ea0cd1..7b25c07cdd 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -25,9 +25,7 @@ install-exec-local:
healthconfigdir=$(libconfigdir)/health.d
dist_healthconfig_DATA = \
health.d/adaptec_raid.conf \
- health.d/am2320.conf \
health.d/anomalies.conf \
- health.d/apache.conf \
health.d/apcupsd.conf \
health.d/backend.conf \
health.d/bcache.conf \
@@ -39,7 +37,6 @@ dist_healthconfig_DATA = \
health.d/cgroups.conf \
health.d/cpu.conf \
health.d/cockroachdb.conf \
- health.d/couchdb.conf \
health.d/disks.conf \
health.d/dnsmasq_dhcp.conf \
health.d/dns_query.conf \
@@ -51,6 +48,7 @@ dist_healthconfig_DATA = \
health.d/ioping.conf \
health.d/fronius.conf \
health.d/gearman.conf \
+ health.d/go.d.plugin.conf \
health.d/haproxy.conf \
health.d/hdfs.conf \
health.d/httpcheck.conf \
@@ -59,26 +57,19 @@ dist_healthconfig_DATA = \
health.d/ipmi.conf \
health.d/isc_dhcpd.conf \
health.d/kubelet.conf \
- health.d/lighttpd.conf \
health.d/linux_power_supply.conf \
health.d/load.conf \
health.d/mdstat.conf \
health.d/megacli.conf \
health.d/memcached.conf \
health.d/memory.conf \
- health.d/mongodb.conf \
health.d/mysql.conf \
- health.d/named.conf \
health.d/net.conf \
health.d/netfilter.conf \
- health.d/nginx.conf \
- health.d/nginx_plus.conf \
health.d/pihole.conf \
- health.d/phpfpm.conf \
health.d/portcheck.conf \
- health.d/postgres.conf \
health.d/processes.conf \
- health.d/pulsar.conf \
+ health.d/python.d.plugin.conf \
health.d/qos.conf \
health.d/ram.conf \
health.d/redis.conf \
@@ -86,7 +77,6 @@ dist_healthconfig_DATA = \
health.d/riakkv.conf \
health.d/scaleio.conf \
health.d/softnet.conf \
- health.d/squid.conf \
health.d/stiebeleltron.conf \
health.d/synchronization.conf \
health.d/swap.conf \
@@ -107,6 +97,5 @@ dist_healthconfig_DATA = \
health.d/wmi.conf \
health.d/x509check.conf \
health.d/zfs.conf \
- health.d/zookeeper.conf \
health.d/dbengine.conf \
$(NULL)
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
deleted file mode 100644
index 4bac98fbbb..0000000000
--- a/health/health.d/am2320.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# make sure am2320 is sending stats
-
- template: am2320_last_collected_secs
- on: am2320.temperature
- class: Other
-component: Sensors
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index dccd2b0644..2c913a2cf8 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -1,20 +1,4 @@
-# Availability
-
- template: cockroachdb_last_collected_secs
- on: cockroachdb.live_nodes
- class: Database
-component: CockroachDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
# Capacity
template: cockroachdb_used_storage_capacity
diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf
deleted file mode 100644
index c86c6b9887..0000000000
--- a/health/health.d/couchdb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure couchdb is running
-
- template: couchdb_last_collected_secs
- on: couchdb.request_methods
- class: Database
-component: CouchDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index e2031bf2b9..289e5fbb58 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -1,17 +1,3 @@
-# make sure Gearman is running
- template: gearman_last_collected_secs
- on: gearman.total_jobs
- class: Computing
-component: Gearman
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
template: gearman_workers_queued
on: gearman.single_job
diff --git a/health/health.d/nginx_plus.conf b/health/health.d/go.d.plugin.conf
index 5849a9e7ed..ecd79c208f 100644
--- a/health/health.d/nginx_plus.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -1,11 +1,12 @@
-# make sure nginx_plus is running
+# make sure go.d.plugin data collection job is running
- template: nginx_plus_last_collected_secs
- on: nginx_plus.requests_total
- class: Web Server
-component: NGINX Plus
- type: Latency
+ template: go.d_job_last_collected_secs
+ on: netdata.go_plugin_execution_time
+ class: Netdata
+component: go.d.plugin
+ type: Error
+ module: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -14,4 +15,3 @@ component: NGINX Plus
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
-
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index bd8308bedc..c67bf11dae 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -1,21 +1,4 @@
-# make sure hdfs is running
-
- template: hdfs_last_collected_secs
- on: hdfs.heap_memory
- class: Storage
-component: HDFS
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
-
# Common
template: hdfs_capacity_usage
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index d4d6376a3f..39fc95a2c8 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -1,17 +1,3 @@
- template: httpcheck_last_collected_secs
- families: *
- on: httpcheck.status
- class: Other
-component: HTTP endpoint
- type: Latency
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: httpcheck_web_service_up
diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
deleted file mode 100644
index 0f067549e7..0000000000
--- a/health/health.d/lighttpd.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure lighttpd is running
-
- template: lighttpd_last_collected_secs
- on: lighttpd.requests
- class: Web Server
-component: Lighttpd
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index f4b734c381..1efad98a05 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -1,21 +1,4 @@
-# make sure memcached is running
-
- template: memcached_last_collected_secs
- on: memcached.cache
- class: KV Storage
-component: Memcached
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
# detect if memcached cache is full
template: memcached_cache_memory_usage
diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf
deleted file mode 100644
index 8c9bdeb6fe..0000000000
--- a/health/health.d/mongodb.conf
+++ /dev/null
@@ -1,16 +0,0 @@
-
-# make sure mongodb is running
-
- template: mongodb_last_collected_secs
- on: mongodb.read_operations
- class: Database
-component: MongoDB
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 91860c4a77..a30e1b3bc3 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -1,22 +1,4 @@
-# make sure mysql is running
-
- template: mysql_last_collected_secs
- on: mysql.queries
- class: Database
-component: MySQL
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: dba
-
-
-# -----------------------------------------------------------------------------
# slow queries
template: mysql_10s_slow_queries
diff --git a/health/health.d/named.conf b/health/health.d/named.conf
deleted file mode 100644
index 90266df167..0000000000
--- a/health/health.d/named.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure named is running
-
- template: named_last_collected_secs
- on: named.global_queries
- class: DNS
-component: BIND
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: domainadmin
-
diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf
deleted file mode 100644
index 30c738f47d..0000000000
--- a/health/health.d/nginx.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure nginx is running
-
- template: nginx_last_collected_secs
- on: nginx.requests
- class: Web Server
-component: NGINX
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/phpfpm.conf b/health/health.d/phpfpm.conf
deleted file mode 100644
index fc073a9442..0000000000
--- a/health/health.d/phpfpm.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-
-# make sure phpfpm is running
-
- template: phpfpm_last_collected_secs
- on: phpfpm.requests
- class: Web Server
-component: PHP-FPM
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 72622caed2..429ff17849 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,20 +1,4 @@
-# Make sure Pi-hole is responding.
-
- template: pihole_last_collected_secs
- on: pihole.dns_queries_total
- class: Ad Filtering
-component: Pi-hole
- type: Latency
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
-
# Blocked DNS queries.
template: pihole_blocked_queries
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index b