diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2021-03-16 18:29:22 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-16 18:29:22 +0300 |
commit | d9354c86b51bdeb9b6341660ad05a10b70a2640d (patch) | |
tree | 5b8235bb2c5a74c534887e7fd0dd7b6716476517 /health | |
parent | ae2bc09a34d939fdd7658a3a1ef3c4319daa3d24 (diff) |
health: make vernemq alarms less sensitive (#10770)
Diffstat (limited to 'health')
-rw-r--r-- | health/health.d/vernemq.conf | 267 |
1 file changed, 84 insertions, 183 deletions
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 36bbaf82b2..328936b4f8 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -18,9 +18,9 @@ template: vernemq_socket_errors on: vernemq.socket_errors lookup: sum -1m unaligned absolute of socket_error units: errors - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h info: socket errors in the last minute to: sysadmin @@ -30,9 +30,9 @@ template: vernemq_queue_message_drop on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_drop units: dropped messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: dropped messaged due to full queues in the last minute to: sysadmin @@ -40,9 +40,9 @@ template: vernemq_queue_message_expired on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_expired units: expired messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (15)) - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (15)) + delay: up 5m down 5m multiplier 1.5 max 2h info: messages which expired before delivery in the last minute to: sysadmin @@ -50,9 +50,9 @@ template: vernemq_queue_message_unhandled on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_unhandled units: unhandled messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unhandled messages (connections with clean session=true) in the last minute to: sysadmin @@ -73,11 +73,11 @@ template: vernemq_average_scheduler_utilization template: vernemq_cluster_dropped on: vernemq.cluster_dropped - lookup: average -1m unaligned - units: KiB/s - every: 10s + lookup: sum -1m unaligned + units: KiB + every: 1m warn: $this > 0 - delay: down 5m multiplier 1.5 max 1h + delay: up 5m down 5m multiplier 1.5 max 1h info: the amount of traffic dropped during communication with the cluster nodes in the last minute to: sysadmin @@ -93,62 +93,35 @@ template: vernemq_netsplits # Unsuccessful CONNACK -template: vernemq_mqtt_connack_sent_reason_success - on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v3/v5 CONNACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_connack_sent_reason_unsuccessful on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_connack_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v3/v5 CONNACK sent in the last minute to: sysadmin # Not normal DISCONNECT -template: vernemq_mqtt_disconnect_received_reason_normal_disconnect - on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT received in the last minute - to: sysadmin - -template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect - on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT sent in the last minute - to: sysadmin - template: vernemq_mqtt_disconnect_received_reason_not_normal on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: not normal v5 DISCONNECT received in the last minute to: sysadmin template: vernemq_mqtt_disconnect_sent_reason_not_normal on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: not normal v5 DISCONNECT sent in the last minute to: sysadmin @@ -158,9 +131,9 @@ template: vernemq_mqtt_subscribe_error on: vernemq.mqtt_subscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: failed v3/v5 SUBSCRIBE operations in the last minute to: sysadmin @@ -168,9 +141,9 @@ template: vernemq_mqtt_subscribe_auth_error on: vernemq.mqtt_subscribe_auth_error lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute to: sysadmin @@ -180,9 +153,9 @@ template: vernemq_mqtt_unsubscribe_error on: vernemq.mqtt_unsubscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: failed v3/v5 UNSUBSCRIBE operations in the last minute to: sysadmin @@ -192,9 +165,9 @@ template: vernemq_mqtt_publish_errors on: vernemq.mqtt_publish_errors lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: failed v3/v5 PUBLISH operations in the last minute to: sysadmin @@ -202,49 +175,31 @@ template: vernemq_mqtt_publish_auth_errors on: vernemq.mqtt_publish_auth_errors lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unauthorized v3/v5 PUBLISH attempts in the last minute to: sysadmin # Unsuccessful and unexpected PUBACK -template: vernemq_mqtt_puback_received_reason_success - on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK received in the last minute - to: sysadmin - -template: vernemq_mqtt_puback_sent_reason_success - on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_puback_received_reason_unsuccessful on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBACK received in the last minute to: sysadmin template: vernemq_mqtt_puback_sent_reason_unsuccessful on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBACK sent in the last minute to: sysadmin @@ -252,49 +207,31 @@ template: vernemq_mqtt_puback_unexpected on: vernemq.mqtt_puback_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unexpected v3/v5 PUBACK received in the last minute to: sysadmin # Unsuccessful and unexpected PUBREC -template: vernemq_mqtt_pubrec_received_reason_success - on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrec_sent_reason_success - on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrec_received_reason_unsuccessful on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBREC received in the last minute to: sysadmin template: vernemq_mqtt_pubrec_sent_reason_unsuccessful on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBREC sent in the last minute to: sysadmin @@ -302,89 +239,53 @@ template: vernemq_mqtt_pubrec_invalid_error on: vernemq.mqtt_pubrec_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unexpected v3 PUBREC received in the last minute to: sysadmin # Unsuccessful PUBREL -template: vernemq_mqtt_pubrel_received_reason_success - on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrel_sent_reason_success - on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrel_received_reason_unsuccessful on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBREL received in the last minute to: sysadmin template: vernemq_mqtt_pubrel_sent_reason_unsuccessful on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBREL sent in the last minute to: sysadmin # Unsuccessful and unexpected PUBCOMP -template: vernemq_mqtt_pubcomp_received_reason_success - on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubcomp_sent_reason_success - on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubcomp_received_reason_unsuccessful on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBCOMP received in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unsuccessful v5 PUBCOMP sent in the last minute to: sysadmin @@ -392,8 +293,8 @@ template: vernemq_mqtt_pubcomp_unexpected on: vernemq.mqtt_pubcomp_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h info: unexpected v3/v5 PUBCOMP received in the last minute to: sysadmin |