summary refs log tree commit diff stats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.c')
-rw-r--r-- health/health.c 1730
1 files changed, 112 insertions, 1618 deletions
diff --git a/health/health.c b/health/health.c
index 2eb782cb44..7039a193cc 100644
--- a/health/health.c
+++ b/health/health.c
@@ -1,1678 +1,172 @@
// SPDX-License-Identifier: GPL-3.0-or-later
#include "health.h"
+#include "health_internals.h"
-#define WORKER_HEALTH_JOB_RRD_LOCK 0
-#define WORKER_HEALTH_JOB_HOST_LOCK 1
-#define WORKER_HEALTH_JOB_DB_QUERY 2
-#define WORKER_HEALTH_JOB_CALC_EVAL 3
-#define WORKER_HEALTH_JOB_WARNING_EVAL 4
-#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
-#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
-#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
-#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
-#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
+struct health_plugin_globals health_globals = {
+ .initialization = {
+ .spinlock = NETDATA_SPINLOCK_INITIALIZER,
+ .done = false,
+ },
+ .config = {
+ .enabled = true,
+ .stock_enabled = true,
+ .use_summary_for_notifications = true,
-#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
-#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
-#endif
+ .health_log_entries_max = HEALTH_LOG_ENTRIES_DEFAULT,
+ .health_log_history = HEALTH_LOG_HISTORY_DEFAULT,
-unsigned int default_health_enabled = 1;
-char *silencers_filename;
-SIMPLE_PATTERN *conf_enabled_alarms = NULL;
-DICTIONARY *health_rrdvars;
+ .default_warn_repeat_every = 0,
+ .default_crit_repeat_every = 0,
-bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, uuid_t *transitions_id) {
- if(!rc->rrdset)
- return false;
-
- RRDHOST *host = rc->rrdset->rrdhost;
-
- rw_spinlock_read_lock(&host->health_log.spinlock);
-
- ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae ; ae = ae->next) {
- if(unlikely(ae->alarm_id == rc->id))
- break;
- }
-
- if(ae) {
- *global_id = ae->global_id;
- uuid_copy(*transitions_id, ae->transition_id);
- }
- else {
- *global_id = 0;
- uuid_clear(*transitions_id);
+ .run_at_least_every_seconds = 10,
+ .postpone_alarms_during_hibernation_for_seconds = 60,
+ },
+ .prototypes = {
+ .dict = NULL,
}
+};
- rw_spinlock_read_unlock(&host->health_log.spinlock);
-
- return ae != NULL;
+bool health_plugin_enabled(void) {
+ return health_globals.config.enabled;
}
-void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
- buffer_json_member_add_array(wb, key);
-
- if(flags & HEALTH_ENTRY_FLAG_PROCESSED)
- buffer_json_add_array_item_string(wb, "PROCESSED");
- if(flags & HEALTH_ENTRY_FLAG_UPDATED)
- buffer_json_add_array_item_string(wb, "UPDATED");
- if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
- buffer_json_add_array_item_string(wb, "EXEC_RUN");
- if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)
- buffer_json_add_array_item_string(wb, "EXEC_FAILED");
- if(flags & HEALTH_ENTRY_FLAG_SILENCED)
- buffer_json_add_array_item_string(wb, "SILENCED");
- if(flags & HEALTH_ENTRY_RUN_ONCE)
- buffer_json_add_array_item_string(wb, "RUN_ONCE");
- if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)
- buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS");
- if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING)
- buffer_json_add_array_item_string(wb, "RECURRING");
- if(flags & HEALTH_ENTRY_FLAG_SAVED)
- buffer_json_add_array_item_string(wb, "SAVED");
- if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED)
- buffer_json_add_array_item_string(wb, "ACLK_QUEUED");
- if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)
- buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION");
-
- buffer_json_array_close(wb);
+void health_plugin_disable(void) {
+ health_globals.config.enabled = false;
}
-static bool prepare_command(BUFFER *wb,
- const char *exec,
- const char *recipient,
- const char *registry_hostname,
- uint32_t unique_id,
- uint32_t alarm_id,
- uint32_t alarm_event_id,
- uint32_t when,
- const char *alert_name,
- const char *alert_chart_name,
- const char *new_status,
- const char *old_status,
- NETDATA_DOUBLE new_value,
- NETDATA_DOUBLE old_value,
- const char *alert_source,
- uint32_t duration,
- uint32_t non_clear_duration,
- const char *alert_units,
- const char *alert_info,
- const char *new_value_string,
- const char *old_value_string,
- const char *source,
- const char *error_msg,
- int n_warn,
- int n_crit,
- const char *warn_alarms,
- const char *crit_alarms,
- const char *classification,
- const char *edit_command,
- const char *machine_guid,
- uuid_t *transition_id,
- const char *summary,
- const char *context,
- const char *component,
- const char *type
-) {
- char buf[8192];
- size_t n = sizeof(buf) - 1;
-
- buffer_strcat(wb, "exec");
-
- if (!sanitize_command_argument_string(buf, exec, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, recipient, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, registry_hostname, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- buffer_sprintf(wb, " '%u'", unique_id);
-
- buffer_sprintf(wb, " '%u'", alarm_id);
-
- buffer_sprintf(wb, " '%u'", alarm_event_id);
-
- buffer_sprintf(wb, " '%u'", when);
-
- if (!sanitize_command_argument_string(buf, alert_name, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, alert_chart_name, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, new_status, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, old_status, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
- buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
-
- buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
-
- if (!sanitize_command_argument_string(buf, alert_source, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- buffer_sprintf(wb, " '%u'", duration);
-
- buffer_sprintf(wb, " '%u'", non_clear_duration);
-
- if (!sanitize_command_argument_string(buf, alert_units, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, alert_info, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, new_value_string, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, old_value_string, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, source, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
-
- if (!sanitize_command_argument_string(buf, error_msg, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+static void health_load_config_defaults(void) {
+ char filename[FILENAME_MAX + 1];
- buffer_sprintf(wb, " '%d'", n_warn);
+ health_globals.config.enabled =
+ config_get_boolean(CONFIG_SECTION_HEALTH,
+ "enabled",
+ health_globals.config.enabled);
- buffer_sprintf(wb, " '%d'", n_crit);
+ health_globals.config.stock_enabled =
+ config_get_boolean(CONFIG_SECTION_HEALTH,
+ "enable stock health configuration",
+ health_globals.config.stock_enabled);
- if (!sanitize_command_argument_string(buf, warn_alarms, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.use_summary_for_notifications =
+ config_get_boolean(CONFIG_SECTION_HEALTH,
+ "use summary for notifications",
+ health_globals.config.use_summary_for_notifications);
- if (!sanitize_command_argument_string(buf, crit_alarms, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.default_warn_repeat_every =
+ config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
- if (!sanitize_command_argument_string(buf, classification, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.default_crit_repeat_every =
+ config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
- if (!sanitize_command_argument_string(buf, edit_command, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.health_log_entries_max =
+ config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries",
+ health_globals.config.health_log_entries_max);
- if (!sanitize_command_argument_string(buf, machine_guid, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.health_log_history =
+ config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
- char tr_id[UUID_STR_LEN];
- uuid_unparse_lower(*transition_id, tr_id);
- if (!sanitize_command_argument_string(buf, tr_id, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
+ health_globals.config.default_exec =
+ string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
- if (!sanitize_command_argument_string(buf, summary, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.enabled_alerts =
+ simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"),
+ NULL, SIMPLE_PATTERN_EXACT, true);
- if (!sanitize_command_argument_string(buf, context, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.run_at_least_every_seconds =
+ (int)config_get_number(CONFIG_SECTION_HEALTH,
+ "run at least every seconds",
+ health_globals.config.run_at_least_every_seconds);
- if (!sanitize_command_argument_string(buf, component, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.postpone_alarms_during_hibernation_for_seconds =
+ config_get_number(CONFIG_SECTION_HEALTH,
+ "postpone alarms during hibernation for seconds",
+ health_globals.config.postpone_alarms_during_hibernation_for_seconds);
- if (!sanitize_command_argument_string(buf, type, n))
- return false;
- buffer_sprintf(wb, " '%s'", buf);
+ health_globals.config.default_recipient =
+ string_strdupz("root");
- return true;
-}
-
-// the queue of executed alarm notifications that haven't been waited for yet
-static struct {
- ALARM_ENTRY *head; // oldest
- ALARM_ENTRY *tail; // latest
-} alarm_notifications_in_progress = {NULL, NULL};
+ // ------------------------------------------------------------------------
+ // verify after loading
-typedef struct active_alerts {
- char *name;
- time_t last_status_change;
- RRDCALC_STATUS status;
-} active_alerts_t;
+ if(health_globals.config.run_at_least_every_seconds < 1)
+ health_globals.config.run_at_least_every_seconds = 1;
-static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
-{
- ae->prev_in_progress = NULL;
- ae->next_in_progress = NULL;
+ if(health_globals.config.health_log_entries_max < HEALTH_LOG_ENTRIES_MIN) {
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Health configuration has invalid max log entries %u, using minimum of %u",
+ health_globals.config.health_log_entries_max,
+ HEALTH_LOG_ENTRIES_MIN);
- if (NULL != alarm_notifications_in_progress.tail) {
- ae->prev_in_progress = alarm_notifications_in_progress.tail;
- alarm_notifications_in_progress.tail->next_in_progress = ae;
- }
- if (NULL == alarm_notifications_in_progress.head) {
- alarm_notifications_in_progress.head = ae;
+ health_globals.config.health_log_entries_max = HEALTH_LOG_ENTRIES_MIN;
+ config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries",
+ (long)health_globals.config.health_log_entries_max);
}
- alarm_notifications_in_progress.tail = ae;
+ else if(health_globals.config.health_log_entries_max > HEALTH_LOG_ENTRIES_MAX) {
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Health configuration has invalid max log entries %u, using maximum of %u",
+ health_globals.config.health_log_entries_max,
+ HEALTH_LOG_ENTRIES_MAX);
-}
+ health_globals.config.health_log_entries_max = HEALTH_LOG_ENTRIES_MAX;
+ config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries",
+ (long)health_globals.config.health_log_entries_max);
+ }
-static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
-{
- struct alarm_entry *prev = ae->prev_in_progress;
- struct alarm_entry *next = ae->next_in_progress;
+ if (health_globals.config.health_log_history < HEALTH_LOG_MINIMUM_HISTORY) {
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Health configuration has invalid health log history %u. Using minimum %d",
+ health_globals.config.health_log_history, HEALTH_LOG_MINIMUM_HISTORY);
- if (NULL != prev) {
- prev->next_in_progress = next;
- }
- if (NULL != next) {
- next->prev_in_progress = prev;
- }
- if (ae == alarm_notifications_in_progress.head) {
- alarm_notifications_in_progress.head = next;
- }
- if (ae == alarm_notifications_in_progress.tail) {
- alarm_notifications_in_progress.tail = prev;
+ health_globals.config.health_log_history = HEALTH_LOG_MINIMUM_HISTORY;
+ config_set_number(CONFIG_SECTION_HEALTH, "health log history", health_globals.config.health_log_history);
}
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Health log history is set to %u seconds (%u days)",
+ health_globals.config.health_log_history, health_globals.config.health_log_history / 86400);
}
-// ----------------------------------------------------------------------------
-// health initialization
-/**
- * User Config directory
- *
- * Get the config directory for health and return it.
- *
- * @return a pointer to the user config directory
- */
// Returns the value of the "health config" option from the directories
// section of the configuration.
// The string "<netdata_configured_user_config_dir>/health.d" is formatted
// into a local buffer and handed to config_get() as its third argument
// (presumably the fallback used when the option is not set - confirm
// against config_get()).
// NOTE(review): the buffer is a local; this relies on config_get() not
// returning a pointer into it - verify.
inline char *health_user_config_dir(void) {
    char default_dir[FILENAME_MAX + 1];

    snprintfz(default_dir, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);

    char *configured_dir = config_get(CONFIG_SECTION_DIRECTORIES, "health config", default_dir);
    return configured_dir;
}
-/**
- * Stock Config Directory
- *
- * Get the Stock config directory and return it.
- *
- * @return a pointer to the stock config directory.
- */
// Returns the value of the "stock health config" option from the
// directories section of the configuration.
// The string "<netdata_configured_stock_config_dir>/health.d" is formatted
// into a local buffer and handed to config_get() as its third argument
// (presumably the fallback used when the option is not set - confirm
// against config_get()).
// NOTE(review): the buffer is a local; this relies on config_get() not
// returning a pointer into it - verify.
inline char *health_stock_config_dir(void) {
    char default_dir[FILENAME_MAX + 1];

    snprintfz(default_dir, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);

    char *configured_dir = config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", default_dir);
    return configured_dir;
}
-/**
- * Silencers init
- *
- * Function used to initialize the silencer structure.
- */
-static void health_silencers_init(void) {
- FILE *fd = fopen(silencers_filename, "r");
- if (fd) {
- fseek(fd, 0 , SEEK_END);
- off_t length = (off_t) ftell(fd);
- fseek(fd, 0 , SEEK_SET);
-
- if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
- char *str = mallocz((length+1)* sizeof(char));
- if(str) {
- size_t copied;
- copied = fread(str, sizeof(char), length, fd);
- if (copied == (length* sizeof(char))) {
- str[length] = 0x00;
- json_parse(str, NULL, health_silencers_json_read_callback);
- netdata_log_info("Parsed health silencers file %s", silencers_filename);
- } else {
- netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename);
- }
- freez(str);
- }
- } else {
- netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
- silencers_filename,
- (int64_t)length,
- HEALTH_SILENCERS_MAX_FILE_LEN);
- }
- fclose(fd);
- } else {
- netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.",
- silencers_filename);
- }
-}
-
-/**
- * Health Init
- *
- * Initialize the health thread.
- */
-void health_init(void) {
- netdata_log_debug(D_HEALTH, "Health configuration initializing");
-
- if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
- netdata_log_debug(D_HEALTH, "Health is disabled.");
- return;
- }
-
- health_silencers_init();
-}
-
-// ----------------------------------------------------------------------------
-// re-load health configuration
-
-/**
- * Reload host
- *
- * Reload configuration for a specific host.
- *
- * @param host the structure of the host that the function will reload the configuration.
- */
-static void health_reload_host(RRDHOST *host) {
- if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
- return;
-
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "[%s]: Reloading health.",
- rrdhost_hostname(host));
-
- char *user_path = health_user_config_dir();
- char *stock_path = health_stock_config_dir();
-
- // free all running alarms
- rrdcalc_delete_all(host);
- rrdcalctemplate_delete_all(host);
-
- // invalidate all previous entries in the alarm log
- rw_spinlock_read_lock(&host->health_log.spinlock);
- ALARM_ENTRY *t;
- for(t = host->health_log.alarms ; t ; t = t->next) {
- if(t->new_status != RRDCALC_STATUS_REMOVED)
- t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
- }
- rw_spinlock_read_unlock(&host->health_log.spinlock);
-
- // reset all thresholds to all charts
- RRDSET *st;
- rrdset_foreach_read(st, host) {
- st->green = NAN;
- st->red = NAN;
- }
- rrdset_foreach_done(st);
-
- // load the new alarms
- health_readdir(host, user_path, stock_path, NULL);
-
- //Discard alarms with labels that do not apply to host
- rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
-
- // link the loaded alarms to their charts
- rrdset_foreach_write(st, host) {
- rrdcalc_link_matching_alerts_to_rrdset(st);
- rrdcalctemplate_link_matching_templates_to_rrdset(st);
- }
- rrdset_foreach_done(st);
-
-#ifdef ENABLE_ACLK
- if (netdata_cloud_enabled) {
- struct aclk_sync_cfg_t *wc = host->aclk_config;
- if (likely(wc)) {
- wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
- }
- }
-#endif
-}
-
-/**
- * Reload
- *
- * Reload the host configuration for all hosts.
- */
-void health_reload(void) {
- sql_refresh_hashes();
-
- RRDHOST *host;
- dfe_start_reentrant(rrdhost_root_index, host){
- health_reload_host(host);
- }
- dfe_done(host);
-}
-
-// ----------------------------------------------------------------------------
-// health main thread and friends
-
-static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
- if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
- if(n) return RRDCALC_STATUS_RAISED;
- return RRDCALC_STATUS_CLEAR;
-}
-
-#define ACTIVE_ALARMS_LIST_EXAMINE 500
-#define ACTIVE_ALARMS_LIST 15
-
-static inline int compare_active_alerts(const void * a, const void * b) {
- active_alerts_t *active_alerts_a = (active_alerts_t *)a;
- active_alerts_t *active_alerts_b = (active_alerts_t *)b;
-
- return (int) ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
-}
-
-static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
- ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
-
- if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
- // do not send notifications for internal statuses
- netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- goto done;
- }
-
- if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- // do not send notifications for disabled statuses
-
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
- rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
-
- // mark it as run, so that we will send the same alarm if it happens again
- goto done;
- }
-
- // find the previous notification for the same alarm
- // which we have run the exec script
- // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
- RRDCALC_STATUS last_executed_status = -3;
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);
-
- if (likely(ret == 1)) {
- // we have executed this alarm notification in the past
- if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
- // don't send the notification for the same status again
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
- rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
- rrdcalc_status2string(ae->new_status));
- goto done;
- }
- }
- else {
- // we have not executed this alarm notification in the past
- // so, don't send CLEAR notifications
- if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
- if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
- netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- goto done;
- }
- }
- }
- }
-
- // Check if alarm notifications are silenced
- if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "[%s]: Health not sending notification for alarm '%s.%s' status %s "
- "(command API has disabled notifications)",
- rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- goto done;
- }
-
- nd_log(NDLS_DAEMON, NDLP_DEBUG,
- "[%s]: Sending notification for alarm '%s.%s' status %s.",
- rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
-
- const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
- const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
-
- int n_warn=0, n_crit=0;
- RRDCALC *rc;
- EVAL_EXPRESSION *expr=NULL;
- BUFFER *warn_alarms, *crit_alarms;
- active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
-
- warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
- crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE, &netdata_buffers_statistics.buffers_health);
-
- foreach_rrdcalc_in_rrdhost_read(host, rc) {
- if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
- continue;
-
- if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
- break;
-
- if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
- active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
- active_alerts[n_warn+n_crit].status = rc->status;
- n_warn++;
- } else if (ae->alarm_id == rc->id)
- expr = rc->warning;
- } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
- active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
- active_alerts[n_warn+n_crit].status = rc->status;
- n_crit++;
- } else if (ae->alarm_id == rc->id)
- expr = rc->critical;
- } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if (ae->alarm_id == rc->id)
- expr = rc->warning;
- }
- }
- foreach_rrdcalc_in_rrdhost_done(rc);
-
- if (n_warn+n_crit>1)
- qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
-
- int count_w = 0, count_c = 0;
- while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
- if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
- if (count_w)
- buffer_strcat(warn_alarms, ",");
- buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
- buffer_strcat(warn_alarms, "=");
- buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
- count_w++;
- }
- else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
- if (count_c)
- buffer_strcat(crit_alarms, ",");
- buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
- buffer_strcat(crit_alarms, "=");
- buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
- count_c++;
- }
- }
-
- char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
-
- BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health);
- bool ok = prepare_command(wb,
- exec,
- recipient,
- rrdhost_registry_hostname(host),
- ae->unique_id,
- ae->alarm_id,
- ae->alarm_event_id,
- (unsigned long)ae->when,
- ae_name(ae),
- ae->chart?ae_chart_id(ae):"NOCHART",
- rrdcalc_status2string(ae->new_status),
- rrdcalc_status2string(ae->old_status),
- ae->new_value,
- ae->old_value,
- ae->source?ae_source(ae):"UNKNOWN",
- (uint32_t)ae->duration,
- (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
- ae_units(ae),
- ae_info(ae),
- ae_new_value_string(ae),
- ae_old_value_string(ae),
- (expr && expr->source)?expr->source:"NOSOURCE",
- (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
- n_warn,
- n_crit,
- buffer_tostring(warn_alarms),
- buffer_tostring(crit_alarms),
- ae->classification?ae_classification(ae):"Unknown",
- edit_command,
- host->machine_guid,
- &ae->transition_id,
- host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
- string2str(ae->chart_context),
- string2str(ae->component),
- string2str(ae->type)
- );
-
- const char *command_to_run = buffer_tostring(wb);
- if (ok) {
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
- ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
-
- netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
- enqueue_alarm_notify_in_progress(ae);
- health_alarm_log_save(host, ae);
- } else {
- netdata_log_error("Failed to format command arguments");
- }
-
- buffer_free(wb);
- freez(edit_command);
- buffer_free(warn_alarms);
- buffer_free(crit_alarms);
- freez(active_alerts);
-
- return; //health_alarm_wait_for_execution
-done:
- health_alarm_log_save(host, ae);
-}
-
-static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
- if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
- return;
-
- spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
- netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
- ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
-
- if(ae->exec_code != 0)
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
-
- unlink_alarm_notify_in_progress(ae);
-}
-
-static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
- ae->chart?ae_chart_id(ae):"NOCHART", ae_name(ae),
- ae->new_value,
- rrdcalc_status2string(ae->old_status),
- rrdcalc_status2string(ae->new_status)
- );
-
- health_alarm_execute(host, ae);
-}
-
-static inline void health_alarm_log_process(RRDHOST *host) {
- uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
- time_t now = now_realtime_sec();
-
- rw_spinlock_read_lock(&host->health_log.spinlock);
-
- ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(unlikely(
- !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
- )) {
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
-
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
- }
- }
-
- rw_spinlock_read_unlock(&host->health_log.spinlock);
-
- // remember this for the next iteration
- host->health_last_processed_id = first_waiting;
-
- //delete those that are updated, no in progress execution, and is not repeating
- rw_spinlock_write_lock(&host->health_log.spinlock);
-
- ALARM_ENTRY *prev = NULL, *next = NULL;
- for(ae = host->health_log.alarms; ae ; ae = next) {
- next = ae->next; // set it here, for the next iteration
-
- if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
- (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
- (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
- ||
- ((ae->new_status == RRDCALC_STATUS_REMOVED) &&
- (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
- (ae->when + 86400 < now_realtime_sec())))
- {
-
- if(host->health_log.alarms == ae) {
- host->health_log.alarms = next;
- // prev is also NULL here
- }
- else {
- prev->next = next;
- // prev should not be touched here - we need it for the next iteration
- // because we may have to also remove the next item
- }
-
- health_alarm_log_free_one_nochecks_nounlink(ae);
- }
- else
- prev = ae;
- }
-
- rw_spinlock_write_unlock(&host->health_log.spinlock);
-}
-
-static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
- if(unlikely(!rc->rrdset)) {
- netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- if(unlikely(rc->next_update > now)) {
- if (unlikely(*next_run > rc->next_update)) {
- // update the next_run time of the main loop
- // to run this alarm precisely the time required
- *next_run = rc->next_update;
- }
-
- netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
- return 0;
- }
-
- if(unlikely(!rc->update_every)) {
- netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- int update_every = rc->rrdset->update_eve