summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
Diffstat (limited to 'health')
-rw-r--r--health/health.c108
-rw-r--r--health/health.h3
-rw-r--r--health/health_config.c11
-rw-r--r--health/health_log.c73
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in489
-rwxr-xr-xhealth/notifications/health_alarm_notify.conf4
6 files changed, 469 insertions, 219 deletions
diff --git a/health/health.c b/health/health.c
index 390578799e..2ed15b3214 100644
--- a/health/health.c
+++ b/health/health.c
@@ -82,10 +82,13 @@ static bool prepare_command(BUFFER *wb,
const char *edit_command,
const char *machine_guid,
uuid_t *transition_id,
- const char *summary
+ const char *summary,
+ const char *context,
+ const char *component,
+ const char *type
) {
char buf[8192];
- size_t n = 8192 - 1;
+ size_t n = sizeof(buf) - 1;
buffer_strcat(wb, "exec");
@@ -195,6 +198,18 @@ static bool prepare_command(BUFFER *wb,
return false;
buffer_sprintf(wb, " '%s'", buf);
+ if (!sanitize_command_argument_string(buf, context, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, component, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, type, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
return true;
}
@@ -342,7 +357,9 @@ static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
- netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Reloading health.",
+ rrdhost_hostname(host));
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
@@ -436,8 +453,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
@@ -454,10 +473,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
// don't send the notification for the same status again
- netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
+ rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -476,11 +495,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s "
+ "(command API has disabled notifications)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Sending notification for alarm '%s.%s' status %s.",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
@@ -581,7 +605,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
edit_command,
host->machine_guid,
&ae->transition_id,
- host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae));
+ host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
+ string2str(ae->chart_context),
+ string2str(ae->component),
+ string2str(ae->type)
+ );
const char *command_to_run = buffer_tostring(wb);
if (ok) {
@@ -778,7 +806,8 @@ static void health_main_cleanup(void *ptr) {
netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- netdata_log_health("Health thread ended.");
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Health thread ended.");
}
static void initialize_health(RRDHOST *host)
@@ -790,7 +819,9 @@ static void initialize_health(RRDHOST *host)
rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
- netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Initializing health.",
+ rrdhost_hostname(host));
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
@@ -803,7 +834,11 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid max log entries %ld. "
+ "Using default %u",
+ rrdhost_hostname(host), n, host->health_log.max);
+
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
@@ -811,7 +846,11 @@ static void initialize_health(RRDHOST *host)
uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
if (m < HEALTH_LOG_MINIMUM_HISTORY) {
- netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid health log history %u. "
+ "Using minimum %d",
+ rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+
config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
m = HEALTH_LOG_MINIMUM_HISTORY;
}
@@ -823,7 +862,9 @@ static void initialize_health(RRDHOST *host)
} else
host->health_log.health_log_history = m;
- netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health log history is set to %u seconds (%u days)",
+ rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
@@ -1049,7 +1090,7 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_NOTICE,
"Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
(int64_t)hibernation_delay);
@@ -1058,8 +1099,9 @@ void *health_main(void *ptr) {
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
static int logged=0;
if (!logged) {
- netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Skipping health checks, because all alarms are disabled via a %s command.",
+ HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
@@ -1081,7 +1123,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
if (unlikely(apply_hibernation_delay)) {
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
"[%s]: Postponing health checks for %"PRId64" seconds.",
rrdhost_hostname(host),
(int64_t)hibernation_delay);
@@ -1094,20 +1136,30 @@ void *health_main(void *ptr) {
continue;
}
- netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Resuming health checks after delay.",
+ rrdhost_hostname(host));
+
host->health.health_delay_up_to = 0;
}
// wait until cleanup of obsolete charts on children is complete
if (host != localhost) {
if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Waiting for chart obsoletion check.",
+ rrdhost_hostname(host));
+
continue;
}
}
if (!health_running_logged) {
- netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health is running.",
+ rrdhost_hostname(host));
+
health_running_logged = true;
}
@@ -1161,6 +1213,7 @@ void *health_main(void *ptr) {
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
if (ae) {
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
@@ -1432,9 +1485,13 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
- netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Alert event for [%s.%s], value [%s], status [%s].",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae),
+ rrdcalc_status2string(ae->new_status));
rc->last_status_change_value = rc->value;
rc->last_status_change = now;
@@ -1519,6 +1576,7 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
ae->last_repeat = rc->last_repeat;
if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
ae->flags |= HEALTH_ENTRY_RUN_ONCE;
diff --git a/health/health.h b/health/health.h
index f7e50b85d6..ff8fb4261e 100644
--- a/health/health.h
+++ b/health/health.h
@@ -105,4 +105,7 @@ void sql_refresh_hashes(void);
void health_add_host_labels(void);
void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix);
+void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function); // logs the status transition recorded in ae; line/file/function identify the call site
+#define health_log_alert(host, ae) health_log_alert_transition_with_trace(host, ae, __LINE__, __FILE__, __FUNCTION__) // wrapper that passes the caller's own source location
+
#endif //NETDATA_HEALTH_H
diff --git a/health/health_config.c b/health/health_config.c
index f33207b5c8..27b2a71aa1 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -1368,7 +1368,10 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path
CONFIG_BOOLEAN_YES);
if (!stock_enabled) {
- netdata_log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Netdata will not load stock alarms.",
+ rrdhost_hostname(host));
+
stock_path = user_path;
}
@@ -1376,6 +1379,10 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path
health_rrdvars = health_rrdvariables_create();
recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0);
- netdata_log_health("[%s]: Read health configuration.", rrdhost_hostname(host));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Read health configuration.",
+ rrdhost_hostname(host));
+
sql_store_hashes = 0;
}
diff --git a/health/health_log.c b/health/health_log.c
index 35f297007d..250937c58b 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -8,6 +8,79 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
sql_health_alarm_log_save(host, ae);
}
+
+// Emit one structured health log entry describing the status transition recorded in `ae` for `host`.
+// `line`, `file` and `function` identify the call site and are forwarded to netdata_logger();
+// the health_log_alert() macro in health.h fills them in from the caller's location.
+void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) {
+ ND_LOG_STACK lgs[] = {
+ ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid),
+ ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname),
+ ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name),
+ ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context),
+ ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id),
+ ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id),
+ ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id),
+ ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id),
+ ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id),
+ ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name),
+ ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification),
+ ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component),
+ ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type),
+ ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec),
+ ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient),
+ ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->source), // the alert's source, not its exec script (that is NDF_ALERT_EXEC)
+ ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units),
+ ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary),
+ ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info),
+ ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value),
+ ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value),
+ ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)),
+ ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)),
+ ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration),
+ ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code),
+ ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC),
+ ND_LOG_FIELD_END(),
+ };
+ ND_LOG_STACK_PUSH(lgs);
+
+ errno = 0; // NOTE(review): presumably keeps a stale errno out of this entry - confirm against netdata_logger()
+
+ ND_LOG_FIELD_PRIORITY priority = NDLP_INFO; // kept when WARNING/CRITICAL is not an escalation from old_status
+
+ switch(ae->new_status) {
+ case RRDCALC_STATUS_UNDEFINED:
+ priority = (ae->old_status >= RRDCALC_STATUS_CLEAR) ? NDLP_NOTICE : NDLP_DEBUG;
+ break;
+
+ default:
+ case RRDCALC_STATUS_UNINITIALIZED:
+ case RRDCALC_STATUS_REMOVED:
+ priority = NDLP_DEBUG;
+ break;
+
+ case RRDCALC_STATUS_CLEAR:
+ priority = NDLP_INFO;
+ break;
+
+ case RRDCALC_STATUS_WARNING:
+ if(ae->old_status < RRDCALC_STATUS_WARNING)
+ priority = NDLP_WARNING;
+ break;
+
+ case RRDCALC_STATUS_CRITICAL:
+ if(ae->old_status < RRDCALC_STATUS_CRITICAL)
+ priority = NDLP_CRIT;
+ break;
+ }
+
+ netdata_logger(NDLS_HEALTH, priority, file, function, line,
+ "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s",
+ string2str(ae->name), string2str(ae->chart), string2str(host->hostname),
+ rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status)
+ );
+}
+
// ----------------------------------------------------------------------------
// health alarm log management
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 217bc64864..b1977b800b 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -42,6 +42,8 @@
# -----------------------------------------------------------------------------
# testing notifications
+cmd_line="'${0}' $(printf "'%s' " "${@}")"
+
if { [ "${1}" = "test" ] || [ "${2}" = "test" ]; } && [ "${#}" -le 2 ]; then
if [ "${2}" = "test" ]; then
recipient="${1}"
@@ -78,61 +80,139 @@ export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin"
export LC_ALL=C
# -----------------------------------------------------------------------------
+# logging
PROGRAM_NAME="$(basename "${0}")"
-LOG_LEVEL_ERR=1
-LOG_LEVEL_WARN=2
-LOG_LEVEL_INFO=3
-LOG_LEVEL="$LOG_LEVEL_INFO"
+# these should be the same as the syslog() priorities
+NDLP_EMERG=0 # system is unusable
+NDLP_ALERT=1 # action must be taken immediately
+NDLP_CRIT=2 # critical conditions
+NDLP_ERR=3 # error conditions
+NDLP_WARN=4 # warning conditions
+NDLP_NOTICE=5 # normal but significant condition
+NDLP_INFO=6 # informational
+NDLP_DEBUG=7 # debug-level messages
+
+# the numerically highest (i.e. least severe) priority we will log
+LOG_LEVEL=$NDLP_INFO
+
+set_log_min_priority() {
+ case "${NETDATA_LOG_PRIORITY_LEVEL,,}" in
+ "emerg" | "emergency")
+ LOG_LEVEL=$NDLP_EMERG
+ ;;
-set_log_severity_level() {
- case ${NETDATA_LOG_SEVERITY_LEVEL,,} in
- "info") LOG_LEVEL="$LOG_LEVEL_INFO";;
- "warn" | "warning") LOG_LEVEL="$LOG_LEVEL_WARN";;
- "err" | "error") LOG_LEVEL="$LOG_LEVEL_ERR";;
- esac
-}
+ "alert")
+ LOG_LEVEL=$NDLP_ALERT
+ ;;
-set_log_severity_level
+ "crit" | "critical")
+ LOG_LEVEL=$NDLP_CRIT
+ ;;
-logdate() {
- date "+%Y-%m-%d %H:%M:%S"
-}
+ "err" | "error")
+ LOG_LEVEL=$NDLP_ERR
+ ;;
-log() {
- local status="${1}"
- shift
+ "warn" | "warning")
+ LOG_LEVEL=$NDLP_WARN
+ ;;
- echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}"
+ "notice")
+ LOG_LEVEL=$NDLP_NOTICE
+ ;;
+ "info")
+ LOG_LEVEL=$NDLP_INFO
+ ;;
+
+ "debug")
+ LOG_LEVEL=$NDLP_DEBUG
+ ;;
+ esac
+}
+
+set_log_min_priority
+
+log() { # usage: log <numeric priority> <message...> - pipes one structured entry to systemd-cat-native
+ local level="${1}" # numeric priority of this message (the callers below pass NDLP_* values)
+ shift 1
+ # drop the message when its priority is numerically greater (less severe) than LOG_LEVEL
+ [[ -n "$level" && -n "$LOG_LEVEL" && "$level" -gt "$LOG_LEVEL" ]] && return
+ # the heredoc is unquoted, so each field expands from the shell variables set at call time; CR/LF in the message become {NEWLINE}
+ systemd-cat-native --log-as-netdata --newline="{NEWLINE}" <<EOFLOG
+INVOCATION_ID=${NETDATA_INVOCATION_ID}
+SYSLOG_IDENTIFIER=${PROGRAM_NAME}
+PRIORITY=${level}
+THREAD_TAG="alarm-notify"
+ND_LOG_SOURCE=health
+ND_NIDL_NODE=${host}
+ND_NIDL_INSTANCE=${chart}
+ND_NIDL_CONTEXT=${context}
+ND_ALERT_NAME=${name}
+ND_ALERT_ID=${alarm_id}
+ND_ALERT_UNIQUE_ID=${unique_id}
+ND_ALERT_EVENT_ID=${alarm_event_id}
+ND_ALERT_TRANSITION_ID=${transition_id//-/}
+ND_ALERT_CLASS=${classification}
+ND_ALERT_COMPONENT=${component}
+ND_ALERT_TYPE=${type}
+ND_ALERT_RECIPIENT=${roles}
+ND_ALERT_VALUE=${value}
+ND_ALERT_VALUE_OLD=${old_value}
+ND_ALERT_STATUS=${status}
+ND_ALERT_STATUS_OLD=${old_status}
+ND_ALERT_UNITS=${units}
+ND_ALERT_SUMMARY=${summary}
+ND_ALERT_INFO=${info}
+ND_ALERT_DURATION=${duration}
+ND_REQUEST=${cmd_line}
+MESSAGE_ID=6db0018e83e34320ae2a659d78019fb7
+MESSAGE=[ALERT NOTIFICATION]: ${*//[$'\r\n']/{NEWLINE}}
+
+EOFLOG
+ # AN EMPTY LINE IS NEEDED ABOVE
+}
info() {
- [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_INFO" -gt "$LOG_LEVEL" ]] && return
- log INFO "${@}"
+ log "$NDLP_INFO" "${@}"
}
warning() {
- [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_WARN" -gt "$LOG_LEVEL" ]] && return
- log WARNING "${@}"
+ log "$NDLP_WARN" "${@}"
}
error() {
- [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_ERR" -gt "$LOG_LEVEL" ]] && return
- log ERROR "${@}"
+ log "$NDLP_ERR" "${@}"
}
fatal() {
- log FATAL "${@}"
+ log "$NDLP_ALERT" "${@}"
exit 1
}
-debug=${NETDATA_ALARM_NOTIFY_DEBUG-0}
debug() {
- [ "${debug}" = "1" ] && log DEBUG "${@}"
+ log "$NDLP_DEBUG" "${@}"
}
+debug=0
+if [ "${NETDATA_ALARM_NOTIFY_DEBUG-0}" = "1" ]; then
+ debug=1
+ LOG_LEVEL=$NDLP_DEBUG
+fi
+
+# -----------------------------------------------------------------------------
+# check for BASH v4+ (required for associative arrays)
+
+if [ ${BASH_VERSINFO[0]} -lt 4 ]; then
+ echo >&2 "BASH version 4 or later is required (this is ${BASH_VERSION})."
+ exit 1
+fi
+
+
+# -----------------------------------------------------------------------------
+
docurl() {
if [ -z "${curl}" ]; then
error "${curl} is unset."
@@ -199,17 +279,10 @@ ntfy
# this is to be overwritten by the config file
custom_sender() {
- info "not sending custom notification for ${status} of '${host}.${chart}.${name}'"
+ info "custom notification mechanism is not configured; not sending ${notification_description}"
}
# -----------------------------------------------------------------------------
-
-# check for BASH v4+ (required for associative arrays)
-if [ ${BASH_VERSINFO[0]} -lt 4 ]; then
- fatal "BASH version 4 or later is required (this is ${BASH_VERSION})."
-fi
-
-# -----------------------------------------------------------------------------
# defaults to allow running this script by hand
[ -z "${NETDATA_USER_CONFIG_DIR}" ] && NETDATA_USER_CONFIG_DIR="@configdir_POST@"
@@ -228,8 +301,8 @@ if [[ ${1} = "unittest" ]]; then
status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
elif [[ ${1} = "dump_methods" ]]; then
- dump_methods=1
- status="WARNING"
+ dump_methods=1
+ status="WARNING"
else
roles="${1}" # the roles that should be notified for this event
args_host="${2}" # the host generated this event
@@ -263,6 +336,9 @@ else
child_machine_guid="${28}" # the machine_guid of the child
transition_id="${29}" # the transition_id of the alert
summary="${30}" # the summary text field of the alert
+ context="${31}" # the context of the chart
+ component="${32}"
+ type="${33}"
fi
# -----------------------------------------------------------------------------
@@ -276,18 +352,20 @@ else
host="${args_host}"
fi
+notification_description="notification to '${roles}' for transition from ${old_status} to ${status}, of alert '${name}' = '${value_string}', of instance '${chart}', context '${context}' on host '${host}'"
+
# -----------------------------------------------------------------------------
# screen statuses we don't need to send a notification
# don't do anything if this is not WARNING, CRITICAL or CLEAR
if [ "${status}" != "WARNING" ] && [ "${status}" != "CRITICAL" ] && [ "${status}" != "CLEAR" ]; then
- info "not sending notification for ${status} of '${host}.${chart}.${name}'"
+ debug "not sending ${notification_description}"
exit 1
fi
# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
if [ "${clear_alarm_always}" != "YES" ] && [ "${old_status}" != "WARNING" ] && [ "${old_status}" != "CRITICAL" ] && [ "${status}" = "CLEAR" ]; then
- info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})"
+ debug "not sending ${notification_description}"
exit 1
fi
@@ -434,7 +512,7 @@ else
debug "Loading config file '${CONFIG}'..."
source "${CONFIG}" || error "Failed to load config file '${CONFIG}'."
else
- warning "Cannot find file '${CONFIG}'."
+ debug "Cannot find file '${CONFIG}'."
fi
done
fi
@@ -598,7 +676,16 @@ filter_recipient_by_criticality() {
}
# -----------------------------------------------------------------------------
-# verify the delivery methods supported
+# check the configured targets
+
+# check email
+if [ "${SEND_EMAIL}" = "AUTO" ]; then
+ if command -v curl >/dev/null 2>&1; then
+ SEND_EMAIL="YES"
+ else
+ SEND_EMAIL="NO"
+ fi
+fi
# check slack
[ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO"
@@ -677,112 +764,121 @@ filter_recipient_by_criticality() {
# check custom
[ -z "${DEFAULT_RECIPIENT_CUSTOM}" ] && SEND_CUSTOM="NO"
-if [ "${SEND_PUSHOVER}" = "YES" ] ||
- [ "${SEND_SLACK}" = "YES" ] ||
- [ "${SEND_ROCKETCHAT}" = "YES" ] ||
- [ "${SEND_ALERTA}" = "YES" ] ||
- [ "${SEND_PD}" = "YES" ] ||
- [ "${SEND_FLOCK}" = "YES" ] ||
- [ "${SEND_DISCORD}" = "YES" ] ||
- [ "${SEND_HIPCHAT}" = "YES" ] ||
- [ "${SEND_TWILIO}" = "YES" ] ||
- [ "${SEND_MESSAGEBIRD}" = "YES" ] ||
- [ "${SEND_KAVENEGAR}" = "YES" ] ||
- [ "${SEND_TELEGRAM}" = "YES" ] ||
- [ "${SEND_PUSHBULLET}" = "YES" ] ||
- [ "${SEND_KAFKA}" = "YES" ] ||
- [ "${SEND_FLEEP}" = "YES" ] ||
- [ "${SEND_PROWL}" = "YES" ] ||
- [ "${SEND_MATRIX}" = "YES" ] ||
- [ "${SEND_CUSTOM}" = "YES" ] ||
- [ "${SEND_MSTEAMS}" = "YES" ] ||
- [ "${SEND_DYNATRACE}" = "YES" ] ||
- [ "${SEND_OPSGENIE}" = "YES" ] ||
- [ "${SEND_GOTIFY}" = "YES" ] ||
- [ "${SEND_NTFY}" = "YES" ]; then
- # if we need curl, check for the curl command
- if [ -z "${curl}" ]; then
- curl="$(command -v curl 2>/dev/null)"
- fi
- if [ -z "${curl}" ]; then
- error "Cannot find curl command in the system path. Disabling all curl based notifications."
- SEND_PUSHOVER="NO"
- SEND_PUSHBULLET="NO"
- SEND_TELEGRAM="NO"
- SEND_SLACK="NO"
- SEND_MSTEAMS="NO"
- SEND_ROCKETCHAT="NO"
- SEND_ALERTA="NO"
- SEND_PD="NO"
- SEND_FLOCK="NO"
- SEND_DISCORD="NO"
- SEND_TWILIO="NO"
- SEND_HIPCHAT="NO"
- SEND_MESSAGEBIRD="NO"
- SEND_KAVENEGAR="NO"
- SEND_KAFKA="NO"
- SEND_FLEEP="NO"
- SEND_PROWL="NO"
- SEND_MATRIX="NO"
- SEND_CUSTOM="NO"
- SEND_DYNATRACE="NO"
- SEND_OPSGENIE="NO"
- SEND_GOTIFY="NO"
- SEND_NTFY="NO"
- fi
-fi
+# -----------------------------------------------------------------------------
+# check the availability of targets
-if [ "${SEND_SMS}" = "YES" ]; then
- if [ -z "${sendsms}" ]; then
- sendsms="$(command -v sendsms 2>/dev/null)"
+check_supported_targets() {
+ local log=${1}
+ shift
+
+ if [ "${SEND_PUSHOVER}" = "YES" ] ||
+ [ "${SEND_SLACK}" = "YES" ] ||
+ [ "${SEND_ROCKETCHAT}" = "YES" ] ||
+ [ "${SEND_ALERTA}" = "YES" ] ||
+ [ "${SEND_PD}" = "YES" ] ||
+ [ "${SEND_FLOCK}" = "YES" ] ||
+ [ "${SEND_DISCORD}" = "YES" ] ||
+ [ "${SEND_HIPCHAT}" = "YES" ] ||
+ [ "${SEND_TWILIO}" = "YES" ] ||
+ [ "${SEND_MESSAGEBIRD}" = "YES" ] ||
+ [ "${SEND_KAVENEGAR}" = "YES" ] ||
+ [ "${SEND_TELEGRAM}" = "YES" ] ||
+ [ "${SEND_PUSHBULLET}" = "YES" ] ||
+ [ "${SEND_KAFKA}" = "YES" ] ||
+ [ "${SEND_FLEEP}" = "YES" ] ||
+ [ "${SEND_PROWL}" = "YES" ] ||
+ [ "${SEND_MATRIX}" = "YES" ] ||
+ [ "${SEND_CUSTOM}" = "YES" ] ||
+ [ "${SEND_MSTEAMS}" = "YES" ] ||
+ [ "${SEND_DYNATRACE}" = "YES" ] ||
+ [ "${SEND_OPSGENIE}" = "YES" ] ||
+ [ "${SEND_GOTIFY}" = "YES" ] ||
+ [ "${SEND_NTFY}" = "YES" ]; then
+ # if we need curl, check for the curl command
+ if [ -z "${curl}" ]; then
+ curl="$(command -v curl 2>/dev/null)"
+ fi
+ if [ -z "${curl}" ]; then
+ $log "Cannot find curl command in the system path. Disabling all curl based notifications."
+ SEND_PUSHOVER="NO"
+ SEND_PUSHBULLET="NO"
+ SEND_TELEGRAM="NO"
+ SEND_SLACK="NO"
+ SEND_MSTEAMS="NO"
+ SEND_ROCKETCHAT="NO"
+ SEND_ALERTA="NO"
+ SEND_PD="NO"
+ SEND_FLOCK="NO"
+ SEND_DISCORD="NO"
+ SEND_TWILIO="NO"
+ SEND_HIPCHAT="NO"
+ SEND_MESSAGEBIRD="NO"
+ SEND_KAVENEGAR="NO"
+ SEND_KAFKA="NO"
+ SEND_FLEEP="NO"
+ SEND_PROWL="NO"
+ SEND_MATRIX="NO"
+ SEND_CUSTOM="NO"
+ SEND_DYNATRACE="NO"
+ SEND_OPSGENIE="NO"
+ SEND_GOTIFY="NO"
+ SEND_NTFY="NO"
+ fi
fi
- if [ -z "${sendsms}" ]; then
- SEND_SMS="NO"
+
+ if [ "${SEND_SMS}" = "YES" ]; then
+ if [ -z "${sendsms}" ]; then
+ sendsms="$(command -v sendsms 2>/dev/null)"
+ fi
+ if [ -z "${sendsms}" ]; then
+ SEND_SMS="NO"
+ fi
fi
-fi
-# if we need sendmail, check for the sendmail command
-if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then
- sendmail="$(command -v sendmail 2>/dev/null)"
- if [ -z "${sendmail}" ]; then
- debug "Cannot find sendmail command in the system path. Disabling email notifications."
- SEND_EMAIL="NO"
+ # if we need sendmail, check for the sendmail command
+ if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then
+ sendmail="$(command -v sendmail 2>/dev/null)"
+ if [ -z "${sendmail}" ]; then
+ $log "Cannot find sendmail command in the system path. Disabling email notifications."
+ SEND_EMAIL="NO"
+ fi
fi
-fi
-# if we need logger, check for the logger command
-if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then
- logger="$(command -v logger 2>/dev/null)"
- if [ -z "${logger}" ]; then
- debug "Cannot find logger command in the system path. Disabling syslog notifications."
- SEND_SYSLOG="NO"
+ # if we need logger, check for the logger command
+ if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then
+ logger="$(command -v logger 2>/dev/null)"
+ if [ -z "${logger}" ]; then
+ $log "Cannot find logger command in the system path. Disabling syslog notifications."
+ SEND_SYSLOG="NO"
+ fi
fi
-fi
-# if we need aws, check for the aws command
-if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then
- aws="$(command -v aws 2>/dev/null)"
- if [ -z "${aws}" ]; then
- debug "Cannot find aws command in the system path. Disabling Amazon SNS notifications."
- SEND_AWSSNS="NO"
+ # if we need aws, check for the aws command
+ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then
+ aws="$(command -v aws 2>/dev/null)"
+ if [ -z "${aws}" ]; then
+ $log "Cannot find aws command in the system path. Disabling Amazon SNS notifications."
+ SEND_AWSSNS="NO"
+ fi
fi
-fi
-# if we need nc, check for the nc command
-if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then
- nc="$(command -v nc 2>/dev/null)"
- if [ -z "${nc}" ]; then
- debug "Cannot find nc command in the system path. Disabling IRC notifications."
- SEND_IRC="NO"
+ # if we need nc, check for the nc command
+ if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then
+ nc="$(command -v nc 2>/dev/null)"
+ if [ -z "${nc}" ]; then
+ $log "Cannot find nc command in the system path. Disabling IRC notifications."
+ SEND_IRC="NO"
+ fi
fi
-fi
+}
if [ ${dump_methods} ]; then
+ check_supported_targets debug
for name in "${!SEND_@}"; do
if [ "${!name}" = "YES" ]; then
echo "$name"
fi
done
- exit