summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud> 2021-03-16 18:29:22 +0300
committer: GitHub <noreply@github.com> 2021-03-16 18:29:22 +0300
commit: d9354c86b51bdeb9b6341660ad05a10b70a2640d (patch)
tree: 5b8235bb2c5a74c534887e7fd0dd7b6716476517
parent: ae2bc09a34d939fdd7658a3a1ef3c4319daa3d24 (diff)
health: make vernemq alarms less sensitive (#10770)
-rw-r--r-- health/health.d/vernemq.conf 267
1 files changed, 84 insertions, 183 deletions
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
index 36bbaf82b2..328936b4f8 100644
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@@ -18,9 +18,9 @@ template: vernemq_socket_errors
on: vernemq.socket_errors
lookup: sum -1m unaligned absolute of socket_error
units: errors
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
info: socket errors in the last minute
to: sysadmin
@@ -30,9 +30,9 @@ template: vernemq_queue_message_drop
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_drop
units: dropped messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: dropped messaged due to full queues in the last minute
to: sysadmin
@@ -40,9 +40,9 @@ template: vernemq_queue_message_expired
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_expired
units: expired messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (15))
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (15))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: messages which expired before delivery in the last minute
to: sysadmin
@@ -50,9 +50,9 @@ template: vernemq_queue_message_unhandled
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_unhandled
units: unhandled messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unhandled messages (connections with clean session=true) in the last minute
to: sysadmin
@@ -73,11 +73,11 @@ template: vernemq_average_scheduler_utilization
template: vernemq_cluster_dropped
on: vernemq.cluster_dropped
- lookup: average -1m unaligned
- units: KiB/s
- every: 10s
+ lookup: sum -1m unaligned
+ units: KiB
+ every: 1m
warn: $this > 0
- delay: down 5m multiplier 1.5 max 1h
+ delay: up 5m down 5m multiplier 1.5 max 1h
info: the amount of traffic dropped during communication with the cluster nodes in the last minute
to: sysadmin
@@ -93,62 +93,35 @@ template: vernemq_netsplits
# Unsuccessful CONNACK
-template: vernemq_mqtt_connack_sent_reason_success
- on: vernemq.mqtt_connack_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v3/v5 CONNACK sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_connack_sent_reason_unsuccessful
on: vernemq.mqtt_connack_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_connack_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v3/v5 CONNACK sent in the last minute
to: sysadmin
# Not normal DISCONNECT
-template: vernemq_mqtt_disconnect_received_reason_normal_disconnect
- on: vernemq.mqtt_disconnect_received_reason
- lookup: sum -1m unaligned absolute match-names of normal_disconnect
- units: packets
- every: 10s
- info: normal v5 DISCONNECT received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect
- on: vernemq.mqtt_disconnect_sent_reason
- lookup: sum -1m unaligned absolute match-names of normal_disconnect
- units: packets
- every: 10s
- info: normal v5 DISCONNECT sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_disconnect_received_reason_not_normal
on: vernemq.mqtt_disconnect_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect
+ lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: not normal v5 DISCONNECT received in the last minute
to: sysadmin
template: vernemq_mqtt_disconnect_sent_reason_not_normal
on: vernemq.mqtt_disconnect_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+ lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: not normal v5 DISCONNECT sent in the last minute
to: sysadmin
@@ -158,9 +131,9 @@ template: vernemq_mqtt_subscribe_error
on: vernemq.mqtt_subscribe_error
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: failed v3/v5 SUBSCRIBE operations in the last minute
to: sysadmin
@@ -168,9 +141,9 @@ template: vernemq_mqtt_subscribe_auth_error
on: vernemq.mqtt_subscribe_auth_error
lookup: sum -1m unaligned absolute
units: attempts
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute
to: sysadmin
@@ -180,9 +153,9 @@ template: vernemq_mqtt_unsubscribe_error
on: vernemq.mqtt_unsubscribe_error
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: failed v3/v5 UNSUBSCRIBE operations in the last minute
to: sysadmin
@@ -192,9 +165,9 @@ template: vernemq_mqtt_publish_errors
on: vernemq.mqtt_publish_errors
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: failed v3/v5 PUBLISH operations in the last minute
to: sysadmin
@@ -202,49 +175,31 @@ template: vernemq_mqtt_publish_auth_errors
on: vernemq.mqtt_publish_auth_errors
lookup: sum -1m unaligned absolute
units: attempts
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unauthorized v3/v5 PUBLISH attempts in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBACK
-template: vernemq_mqtt_puback_received_reason_success
- on: vernemq.mqtt_puback_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBACK received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_puback_sent_reason_success
- on: vernemq.mqtt_puback_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBACK sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_puback_received_reason_unsuccessful
on: vernemq.mqtt_puback_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_puback_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBACK received in the last minute
to: sysadmin
template: vernemq_mqtt_puback_sent_reason_unsuccessful
on: vernemq.mqtt_puback_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_puback_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBACK sent in the last minute
to: sysadmin
@@ -252,49 +207,31 @@ template: vernemq_mqtt_puback_unexpected
on: vernemq.mqtt_puback_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unexpected v3/v5 PUBACK received in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBREC
-template: vernemq_mqtt_pubrec_received_reason_success
- on: vernemq.mqtt_pubrec_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREC received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubrec_sent_reason_success
- on: vernemq.mqtt_pubrec_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREC sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
on: vernemq.mqtt_pubrec_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrec_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBREC received in the last minute
to: sysadmin
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
on: vernemq.mqtt_pubrec_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrec_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBREC sent in the last minute
to: sysadmin
@@ -302,89 +239,53 @@ template: vernemq_mqtt_pubrec_invalid_error
on: vernemq.mqtt_pubrec_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unexpected v3 PUBREC received in the last minute
to: sysadmin
# Unsuccessful PUBREL
-template: vernemq_mqtt_pubrel_received_reason_success
- on: vernemq.mqtt_pubrel_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREL received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubrel_sent_reason_success
- on: vernemq.mqtt_pubrel_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREL sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
on: vernemq.mqtt_pubrel_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrel_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBREL received in the last minute
to: sysadmin
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
on: vernemq.mqtt_pubrel_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrel_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBREL sent in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBCOMP
-template: vernemq_mqtt_pubcomp_received_reason_success
- on: vernemq.mqtt_pubcomp_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBCOMP received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubcomp_sent_reason_success
- on: vernemq.mqtt_pubcomp_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBCOMP sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
on: vernemq.mqtt_pubcomp_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubcomp_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBCOMP received in the last minute
to: sysadmin
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
on: vernemq.mqtt_pubcomp_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unsuccessful v5 PUBCOMP sent in the last minute
to: sysadmin
@@ -392,8 +293,8 @@ template: vernemq_mqtt_pubcomp_unexpected
on: vernemq.mqtt_pubcomp_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
info: unexpected v3/v5 PUBCOMP received in the last minute
to: sysadmin