diff options
Diffstat (limited to 'health')
-rw-r--r-- | health/health.c | 108 | ||||
-rw-r--r-- | health/health.h | 3 | ||||
-rw-r--r-- | health/health_config.c | 11 | ||||
-rw-r--r-- | health/health_log.c | 73 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 489 | ||||
-rwxr-xr-x | health/notifications/health_alarm_notify.conf | 4 |
6 files changed, 469 insertions, 219 deletions
diff --git a/health/health.c b/health/health.c index 390578799e..2ed15b3214 100644 --- a/health/health.c +++ b/health/health.c @@ -82,10 +82,13 @@ static bool prepare_command(BUFFER *wb, const char *edit_command, const char *machine_guid, uuid_t *transition_id, - const char *summary + const char *summary, + const char *context, + const char *component, + const char *type ) { char buf[8192]; - size_t n = 8192 - 1; + size_t n = sizeof(buf) - 1; buffer_strcat(wb, "exec"); @@ -195,6 +198,18 @@ static bool prepare_command(BUFFER *wb, return false; buffer_sprintf(wb, " '%s'", buf); + if (!sanitize_command_argument_string(buf, context, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, component, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, type, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + return true; } @@ -342,7 +357,9 @@ static void health_reload_host(RRDHOST *host) { if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; - netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Reloading health.", + rrdhost_hostname(host)); char *user_path = health_user_config_dir(); char *stock_path = health_stock_config_dir(); @@ -436,8 +453,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); // mark it as run, so that we will send the same alarm if it happens again goto done; @@ -454,10 +473,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // we have executed this alarm notification in the past if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) { // don't send the notification for the same status again - netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_id(ae), ae_name(ae) - , rrdcalc_status2string(ae->new_status)); - netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae) - , rrdcalc_status2string(ae->new_status)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending again notification for alarm '%s.%s' status %s", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), + rrdcalc_status2string(ae->new_status)); goto done; } } @@ -476,11 +495,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health not sending notification for alarm '%s.%s' status %s " + "(command API has disabled notifications)", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } - netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Sending notification for alarm '%s.%s' status %s.", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec); const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient); @@ -581,7 +605,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { edit_command, host->machine_guid, &ae->transition_id, - host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae)); + host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae), + string2str(ae->chart_context), + string2str(ae->component), + string2str(ae->type) + ); const char *command_to_run = buffer_tostring(wb); if (ok) { @@ -778,7 +806,8 @@ static void health_main_cleanup(void *ptr) { netdata_log_info("cleaning up..."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; - netdata_log_health("Health thread ended."); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Health thread ended."); } static void initialize_health(RRDHOST *host) @@ -790,7 +819,9 @@ static void initialize_health(RRDHOST *host) rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH); - netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Initializing health.", + rrdhost_hostname(host)); host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); @@ -803,7 +834,11 @@ static void initialize_health(RRDHOST *host) long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max); if(n < 10) { - netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s': health configuration has invalid max log entries %ld. " + "Using default %u", + rrdhost_hostname(host), n, host->health_log.max); + config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); } else @@ -811,7 +846,11 @@ static void initialize_health(RRDHOST *host) uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY); if (m < HEALTH_LOG_MINIMUM_HISTORY) { - netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY); + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Host '%s': health configuration has invalid health log history %u. " + "Using minimum %d", + rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY); + config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY); m = HEALTH_LOG_MINIMUM_HISTORY; } @@ -823,7 +862,9 @@ static void initialize_health(RRDHOST *host) } else host->health_log.health_log_history = m; - netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health log history is set to %u seconds (%u days)", + rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400); conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT, true); @@ -1049,7 +1090,7 @@ void *health_main(void *ptr) { if (unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; - netdata_log_health( + nd_log(NDLS_DAEMON, NDLP_NOTICE, "Postponing alarm checks for %"PRId64" seconds, " "because it seems that the system was just resumed from suspension.", (int64_t)hibernation_delay); @@ -1058,8 +1099,9 @@ void *health_main(void *ptr) { if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { static int logged=0; if (!logged) { - netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.", - HEALTH_CMDAPI_CMD_DISABLEALL); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Skipping health checks, because all alarms are disabled via a %s command.", + HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } } @@ -1081,7 +1123,7 @@ void *health_main(void *ptr) { rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); if (unlikely(apply_hibernation_delay)) { - netdata_log_health( + nd_log(NDLS_DAEMON, NDLP_DEBUG, "[%s]: Postponing health checks for %"PRId64" seconds.", rrdhost_hostname(host), (int64_t)hibernation_delay); @@ -1094,20 +1136,30 @@ void *health_main(void *ptr) { continue; } - netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Resuming health checks after delay.", + rrdhost_hostname(host)); + host->health.health_delay_up_to = 0; } // wait until cleanup of obsolete charts on children is complete if (host != localhost) { if (unlikely(host->trigger_chart_obsoletion_check == 1)) { - netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Waiting for chart obsoletion check.", + rrdhost_hostname(host)); + continue; } } if (!health_running_logged) { - netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Health is running.", + rrdhost_hostname(host)); + health_running_logged = true; } @@ -1161,6 +1213,7 @@ void *health_main(void *ptr) { rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); if (ae) { + health_log_alert(host, ae); health_alarm_log_add_entry(host, ae); rc->old_status = rc->status; rc->status = RRDCALC_STATUS_REMOVED; @@ -1432,9 +1485,13 @@ void *health_main(void *ptr) { ) ); + health_log_alert(host, ae); health_alarm_log_add_entry(host, ae); - netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Alert event for [%s.%s], value [%s], status [%s].", + rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), + rrdcalc_status2string(ae->new_status)); rc->last_status_change_value = rc->value; rc->last_status_change = now; @@ -1519,6 +1576,7 @@ void *health_main(void *ptr) { ) ); + health_log_alert(host, ae); ae->last_repeat = rc->last_repeat; if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { ae->flags |= HEALTH_ENTRY_RUN_ONCE; diff --git a/health/health.h b/health/health.h index f7e50b85d6..ff8fb4261e 100644 --- a/health/health.h +++ b/health/health.h @@ -105,4 +105,7 @@ void sql_refresh_hashes(void); void health_add_host_labels(void); void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); +void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function); +#define health_log_alert(host, ae) health_log_alert_transition_with_trace(host, ae, __LINE__, __FILE__, __FUNCTION__) + #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index f33207b5c8..27b2a71aa1 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -1368,7 +1368,10 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path CONFIG_BOOLEAN_YES); if (!stock_enabled) { - netdata_log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host)); + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Netdata will not load stock alarms.", + rrdhost_hostname(host)); + stock_path = user_path; } @@ -1376,6 +1379,10 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path health_rrdvars = health_rrdvariables_create(); recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0); - netdata_log_health("[%s]: Read health configuration.", rrdhost_hostname(host)); + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "[%s]: Read health configuration.", + rrdhost_hostname(host)); + sql_store_hashes = 0; } diff --git a/health/health_log.c b/health/health_log.c index 35f297007d..250937c58b 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -8,6 +8,79 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { sql_health_alarm_log_save(host, ae); } + +void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid), + ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname), + ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name), + ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context), + ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id), + ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id), + ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id), + ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id), + ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id), + ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name), + ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification), + ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component), + ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type), + ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec), + ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient), + ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->exec), + ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units), + ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary), + ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info), + ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value), + ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value), + ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)), + ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)), + ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code), + ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + errno = 0; + + ND_LOG_FIELD_PRIORITY priority = NDLP_INFO; + + switch(ae->new_status) { + case RRDCALC_STATUS_UNDEFINED: + if(ae->old_status >= RRDCALC_STATUS_CLEAR) + priority = NDLP_NOTICE; + else + priority = NDLP_DEBUG; + break; + + default: + case RRDCALC_STATUS_UNINITIALIZED: + case RRDCALC_STATUS_REMOVED: + priority = NDLP_DEBUG; + break; + + case RRDCALC_STATUS_CLEAR: + priority = NDLP_INFO; + break; + + case RRDCALC_STATUS_WARNING: + if(ae->old_status < RRDCALC_STATUS_WARNING) + priority = NDLP_WARNING; + break; + + case RRDCALC_STATUS_CRITICAL: + if(ae->old_status < RRDCALC_STATUS_CRITICAL) + priority = NDLP_CRIT; + break; + } + + netdata_logger(NDLS_HEALTH, priority, file, function, line, + "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s", + string2str(ae->name), string2str(ae->chart), string2str(host->hostname), + rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status) + ); +} + // ---------------------------------------------------------------------------- // health alarm log management diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 217bc64864..b1977b800b 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -42,6 +42,8 @@ # ----------------------------------------------------------------------------- # testing notifications +cmd_line="'${0}' $(printf "'%s' " "${@}")" + if { [ "${1}" = "test" ] || [ "${2}" = "test" ]; } && [ "${#}" -le 2 ]; then if [ "${2}" = "test" ]; then recipient="${1}" @@ -78,61 +80,139 @@ export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin" export LC_ALL=C # ----------------------------------------------------------------------------- +# logging PROGRAM_NAME="$(basename "${0}")" -LOG_LEVEL_ERR=1 -LOG_LEVEL_WARN=2 -LOG_LEVEL_INFO=3 -LOG_LEVEL="$LOG_LEVEL_INFO" +# these should be the same with syslog() priorities +NDLP_EMERG=0 # system is unusable +NDLP_ALERT=1 # action must be taken immediately +NDLP_CRIT=2 # critical conditions +NDLP_ERR=3 # error conditions +NDLP_WARN=4 # warning conditions +NDLP_NOTICE=5 # normal but significant condition +NDLP_INFO=6 # informational +NDLP_DEBUG=7 # debug-level messages + +# the max (numerically) log level we will log +LOG_LEVEL=$NDLP_INFO + +set_log_min_priority() { + case "${NETDATA_LOG_PRIORITY_LEVEL,,}" in + "emerg" | "emergency") + LOG_LEVEL=$NDLP_EMERG + ;; -set_log_severity_level() { - case ${NETDATA_LOG_SEVERITY_LEVEL,,} in - "info") LOG_LEVEL="$LOG_LEVEL_INFO";; - "warn" | "warning") LOG_LEVEL="$LOG_LEVEL_WARN";; - "err" | "error") LOG_LEVEL="$LOG_LEVEL_ERR";; - esac -} + "alert") + LOG_LEVEL=$NDLP_ALERT + ;; -set_log_severity_level + "crit" | "critical") + LOG_LEVEL=$NDLP_CRIT + ;; -logdate() { - date "+%Y-%m-%d %H:%M:%S" -} + "err" | "error") + LOG_LEVEL=$NDLP_ERR + ;; -log() { - local status="${1}" - shift + "warn" | "warning") + LOG_LEVEL=$NDLP_WARN + ;; - echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}" + "notice") + LOG_LEVEL=$NDLP_NOTICE + ;; + "info") + LOG_LEVEL=$NDLP_INFO + ;; + + "debug") + LOG_LEVEL=$NDLP_DEBUG + ;; + esac +} + +set_log_min_priority + +log() { + local level="${1}" + shift 1 + + [[ -n "$level" && -n "$LOG_LEVEL" && "$level" -gt "$LOG_LEVEL" ]] && return + + systemd-cat-native --log-as-netdata --newline="{NEWLINE}" <<EOFLOG +INVOCATION_ID=${NETDATA_INVOCATION_ID} +SYSLOG_IDENTIFIER=${PROGRAM_NAME} +PRIORITY=${level} +THREAD_TAG="alarm-notify" +ND_LOG_SOURCE=health +ND_NIDL_NODE=${host} +ND_NIDL_INSTANCE=${chart} +ND_NIDL_CONTEXT=${context} +ND_ALERT_NAME=${name} +ND_ALERT_ID=${alarm_id} +ND_ALERT_UNIQUE_ID=${unique_id} +ND_ALERT_EVENT_ID=${alarm_event_id} +ND_ALERT_TRANSITION_ID=${transition_id//-/} +ND_ALERT_CLASS=${classification} +ND_ALERT_COMPONENT=${component} +ND_ALERT_TYPE=${type} +ND_ALERT_RECIPIENT=${roles} +ND_ALERT_VALUE=${value} +ND_ALERT_VALUE_OLD=${old_value} +ND_ALERT_STATUS=${status} +ND_ALERT_STATUS_OLD=${old_status} +ND_ALERT_UNITS=${units} +ND_ALERT_SUMMARY=${summary} +ND_ALERT_INFO=${info} +ND_ALERT_DURATION=${duration} +ND_REQUEST=${cmd_line} +MESSAGE_ID=6db0018e83e34320ae2a659d78019fb7 +MESSAGE=[ALERT NOTIFICATION]: ${*//[$'\r\n']/{NEWLINE}} + +EOFLOG + # AN EMPTY LINE IS NEEDED ABOVE } info() { - [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_INFO" -gt "$LOG_LEVEL" ]] && return - log INFO "${@}" + log "$NDLP_INFO" "${@}" } warning() { - [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_WARN" -gt "$LOG_LEVEL" ]] && return - log WARNING "${@}" + log "$NDLP_WARN" "${@}" } error() { - [[ -n "$LOG_LEVEL" && "$LOG_LEVEL_ERR" -gt "$LOG_LEVEL" ]] && return - log ERROR "${@}" + log "$NDLP_ERR" "${@}" } fatal() { - log FATAL "${@}" + log "$NDLP_ALERT" "${@}" exit 1 } -debug=${NETDATA_ALARM_NOTIFY_DEBUG-0} debug() { - [ "${debug}" = "1" ] && log DEBUG "${@}" + log "$NDLP_DEBUG" "${@}" } +debug=0 +if [ "${NETDATA_ALARM_NOTIFY_DEBUG-0}" = "1" ]; then + debug=1 + LOG_LEVEL=$NDLP_DEBUG +fi + +# ----------------------------------------------------------------------------- +# check for BASH v4+ (required for associative arrays) + +if [ ${BASH_VERSINFO[0]} -lt 4 ]; then + echo >&2 "BASH version 4 or later is required (this is ${BASH_VERSION})." + exit 1 +fi + + +# ----------------------------------------------------------------------------- + docurl() { if [ -z "${curl}" ]; then error "${curl} is unset." @@ -199,17 +279,10 @@ ntfy # this is to be overwritten by the config file custom_sender() { - info "not sending custom notification for ${status} of '${host}.${chart}.${name}'" + info "custom notification mechanism is not configured; not sending ${notification_description}" } # ----------------------------------------------------------------------------- - -# check for BASH v4+ (required for associative arrays) -if [ ${BASH_VERSINFO[0]} -lt 4 ]; then - fatal "BASH version 4 or later is required (this is ${BASH_VERSION})." -fi - -# ----------------------------------------------------------------------------- # defaults to allow running this script by hand [ -z "${NETDATA_USER_CONFIG_DIR}" ] && NETDATA_USER_CONFIG_DIR="@configdir_POST@" @@ -228,8 +301,8 @@ if [[ ${1} = "unittest" ]]; then status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL elif [[ ${1} = "dump_methods" ]]; then - dump_methods=1 - status="WARNING" + dump_methods=1 + status="WARNING" else roles="${1}" # the roles that should be notified for this event args_host="${2}" # the host generated this event @@ -263,6 +336,9 @@ else child_machine_guid="${28}" # the machine_guid of the child transition_id="${29}" # the transition_id of the alert summary="${30}" # the summary text field of the alert + context="${31}" # the context of the chart + component="${32}" + type="${33}" fi # ----------------------------------------------------------------------------- @@ -276,18 +352,20 @@ else host="${args_host}" fi +notification_description="notification to '${roles}' for transition from ${old_status} to ${status}, of alert '${name}' = '${value_string}', of instance '${chart}', context '${context}' on host '${host}'" + # ----------------------------------------------------------------------------- # screen statuses we don't need to send a notification # don't do anything if this is not WARNING, CRITICAL or CLEAR if [ "${status}" != "WARNING" ] && [ "${status}" != "CRITICAL" ] && [ "${status}" != "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}'" + debug "not sending ${notification_description}" exit 1 fi # don't do anything if this is CLEAR, but it was not WARNING or CRITICAL if [ "${clear_alarm_always}" != "YES" ] && [ "${old_status}" != "WARNING" ] && [ "${old_status}" != "CRITICAL" ] && [ "${status}" = "CLEAR" ]; then - info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})" + debug "not sending ${notification_description}" exit 1 fi @@ -434,7 +512,7 @@ else debug "Loading config file '${CONFIG}'..." source "${CONFIG}" || error "Failed to load config file '${CONFIG}'." else - warning "Cannot find file '${CONFIG}'." + debug "Cannot find file '${CONFIG}'." fi done fi @@ -598,7 +676,16 @@ filter_recipient_by_criticality() { } # ----------------------------------------------------------------------------- -# verify the delivery methods supported +# check the configured targets + +# check email +if [ "${SEND_EMAIL}" = "AUTO" ]; then + if command -v curl >/dev/null 2>&1; then + SEND_EMAIL="YES" + else + SEND_EMAIL="NO" + fi +fi # check slack [ -z "${SLACK_WEBHOOK_URL}" ] && SEND_SLACK="NO" @@ -677,112 +764,121 @@ filter_recipient_by_criticality() { # check custom [ -z "${DEFAULT_RECIPIENT_CUSTOM}" ] && SEND_CUSTOM="NO" -if [ "${SEND_PUSHOVER}" = "YES" ] || - [ "${SEND_SLACK}" = "YES" ] || - [ "${SEND_ROCKETCHAT}" = "YES" ] || - [ "${SEND_ALERTA}" = "YES" ] || - [ "${SEND_PD}" = "YES" ] || - [ "${SEND_FLOCK}" = "YES" ] || - [ "${SEND_DISCORD}" = "YES" ] || - [ "${SEND_HIPCHAT}" = "YES" ] || - [ "${SEND_TWILIO}" = "YES" ] || - [ "${SEND_MESSAGEBIRD}" = "YES" ] || - [ "${SEND_KAVENEGAR}" = "YES" ] || - [ "${SEND_TELEGRAM}" = "YES" ] || - [ "${SEND_PUSHBULLET}" = "YES" ] || - [ "${SEND_KAFKA}" = "YES" ] || - [ "${SEND_FLEEP}" = "YES" ] || - [ "${SEND_PROWL}" = "YES" ] || - [ "${SEND_MATRIX}" = "YES" ] || - [ "${SEND_CUSTOM}" = "YES" ] || - [ "${SEND_MSTEAMS}" = "YES" ] || - [ "${SEND_DYNATRACE}" = "YES" ] || - [ "${SEND_OPSGENIE}" = "YES" ] || - [ "${SEND_GOTIFY}" = "YES" ] || - [ "${SEND_NTFY}" = "YES" ]; then - # if we need curl, check for the curl command - if [ -z "${curl}" ]; then - curl="$(command -v curl 2>/dev/null)" - fi - if [ -z "${curl}" ]; then - error "Cannot find curl command in the system path. Disabling all curl based notifications." - SEND_PUSHOVER="NO" - SEND_PUSHBULLET="NO" - SEND_TELEGRAM="NO" - SEND_SLACK="NO" - SEND_MSTEAMS="NO" - SEND_ROCKETCHAT="NO" - SEND_ALERTA="NO" - SEND_PD="NO" - SEND_FLOCK="NO" - SEND_DISCORD="NO" - SEND_TWILIO="NO" - SEND_HIPCHAT="NO" - SEND_MESSAGEBIRD="NO" - SEND_KAVENEGAR="NO" - SEND_KAFKA="NO" - SEND_FLEEP="NO" - SEND_PROWL="NO" - SEND_MATRIX="NO" - SEND_CUSTOM="NO" - SEND_DYNATRACE="NO" - SEND_OPSGENIE="NO" - SEND_GOTIFY="NO" - SEND_NTFY="NO" - fi -fi +# ----------------------------------------------------------------------------- +# check the availability of targets -if [ "${SEND_SMS}" = "YES" ]; then - if [ -z "${sendsms}" ]; then - sendsms="$(command -v sendsms 2>/dev/null)" +check_supported_targets() { + local log=${1} + shift + + if [ "${SEND_PUSHOVER}" = "YES" ] || + [ "${SEND_SLACK}" = "YES" ] || + [ "${SEND_ROCKETCHAT}" = "YES" ] || + [ "${SEND_ALERTA}" = "YES" ] || + [ "${SEND_PD}" = "YES" ] || + [ "${SEND_FLOCK}" = "YES" ] || + [ "${SEND_DISCORD}" = "YES" ] || + [ "${SEND_HIPCHAT}" = "YES" ] || + [ "${SEND_TWILIO}" = "YES" ] || + [ "${SEND_MESSAGEBIRD}" = "YES" ] || + [ "${SEND_KAVENEGAR}" = "YES" ] || + [ "${SEND_TELEGRAM}" = "YES" ] || + [ "${SEND_PUSHBULLET}" = "YES" ] || + [ "${SEND_KAFKA}" = "YES" ] || + [ "${SEND_FLEEP}" = "YES" ] || + [ "${SEND_PROWL}" = "YES" ] || + [ "${SEND_MATRIX}" = "YES" ] || + [ "${SEND_CUSTOM}" = "YES" ] || + [ "${SEND_MSTEAMS}" = "YES" ] || + [ "${SEND_DYNATRACE}" = "YES" ] || + [ "${SEND_OPSGENIE}" = "YES" ] || + [ "${SEND_GOTIFY}" = "YES" ] || + [ "${SEND_NTFY}" = "YES" ]; then + # if we need curl, check for the curl command + if [ -z "${curl}" ]; then + curl="$(command -v curl 2>/dev/null)" + fi + if [ -z "${curl}" ]; then + $log "Cannot find curl command in the system path. Disabling all curl based notifications." + SEND_PUSHOVER="NO" + SEND_PUSHBULLET="NO" + SEND_TELEGRAM="NO" + SEND_SLACK="NO" + SEND_MSTEAMS="NO" + SEND_ROCKETCHAT="NO" + SEND_ALERTA="NO" + SEND_PD="NO" + SEND_FLOCK="NO" + SEND_DISCORD="NO" + SEND_TWILIO="NO" + SEND_HIPCHAT="NO" + SEND_MESSAGEBIRD="NO" + SEND_KAVENEGAR="NO" + SEND_KAFKA="NO" + SEND_FLEEP="NO" + SEND_PROWL="NO" + SEND_MATRIX="NO" + SEND_CUSTOM="NO" + SEND_DYNATRACE="NO" + SEND_OPSGENIE="NO" + SEND_GOTIFY="NO" + SEND_NTFY="NO" + fi fi - if [ -z "${sendsms}" ]; then - SEND_SMS="NO" + + if [ "${SEND_SMS}" = "YES" ]; then + if [ -z "${sendsms}" ]; then + sendsms="$(command -v sendsms 2>/dev/null)" + fi + if [ -z "${sendsms}" ]; then + SEND_SMS="NO" + fi fi -fi -# if we need sendmail, check for the sendmail command -if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then - sendmail="$(command -v sendmail 2>/dev/null)" - if [ -z "${sendmail}" ]; then - debug "Cannot find sendmail command in the system path. Disabling email notifications." - SEND_EMAIL="NO" + # if we need sendmail, check for the sendmail command + if [ "${SEND_EMAIL}" = "YES" ] && [ -z "${sendmail}" ]; then + sendmail="$(command -v sendmail 2>/dev/null)" + if [ -z "${sendmail}" ]; then + $log "Cannot find sendmail command in the system path. Disabling email notifications." + SEND_EMAIL="NO" + fi fi -fi -# if we need logger, check for the logger command -if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then - logger="$(command -v logger 2>/dev/null)" - if [ -z "${logger}" ]; then - debug "Cannot find logger command in the system path. Disabling syslog notifications." - SEND_SYSLOG="NO" + # if we need logger, check for the logger command + if [ "${SEND_SYSLOG}" = "YES" ] && [ -z "${logger}" ]; then + logger="$(command -v logger 2>/dev/null)" + if [ -z "${logger}" ]; then + $log "Cannot find logger command in the system path. Disabling syslog notifications." + SEND_SYSLOG="NO" + fi fi -fi -# if we need aws, check for the aws command -if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then - aws="$(command -v aws 2>/dev/null)" - if [ -z "${aws}" ]; then - debug "Cannot find aws command in the system path. Disabling Amazon SNS notifications." - SEND_AWSSNS="NO" + # if we need aws, check for the aws command + if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then + aws="$(command -v aws 2>/dev/null)" + if [ -z "${aws}" ]; then + $log "Cannot find aws command in the system path. Disabling Amazon SNS notifications." + SEND_AWSSNS="NO" + fi fi -fi -# if we need nc, check for the nc command -if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then - nc="$(command -v nc 2>/dev/null)" - if [ -z "${nc}" ]; then - debug "Cannot find nc command in the system path. Disabling IRC notifications." - SEND_IRC="NO" + # if we need nc, check for the nc command + if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then + nc="$(command -v nc 2>/dev/null)" + if [ -z "${nc}" ]; then + $log "Cannot find nc command in the system path. Disabling IRC notifications." + SEND_IRC="NO" + fi fi -fi +} if [ ${dump_methods} ]; then + check_supported_targets debug for name in "${!SEND_@}"; do if [ "${!name}" = "YES" ]; then echo "$name" fi done - exit |