summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-09-05 19:31:06 +0300
committerGitHub <noreply@github.com>2022-09-05 19:31:06 +0300
commit5e1b95cf92168c4df74586fb4430dc284806da82 (patch)
treef42077d8b02eaf316683453a7474bd1f599a833d /health
parent544aef1fde6e79ac57d2dea85d3f063076d7f885 (diff)
Deduplicate all netdata strings (#13570)
* rrdfamily * rrddim * rrdset plugin and module names * rrdset units * rrdset type * rrdset family * rrdset title * rrdset title more * rrdset context * rrdcalctemplate context and removal of context hash from rrdset * strings statistics * rrdset name * rearranged members of rrdset * eliminate rrdset name hash; rrdcalc chart converted to STRING * rrdset id, eliminated rrdset hash * rrdcalc, alarm_entry, alert_config and some of rrdcalctemplate * rrdcalctemplate * rrdvar * eval_variable * rrddimvar and rrdsetvar * rrdhost hostname, os and tags * fix master commits * added thread cache; implemented string_dup without locks * faster thread cache * rrdset and rrddim now use dictionaries for indexing * rrdhost now uses dictionary * rrdfamily now uses DICTIONARY * rrdvar using dictionary instead of AVL * allocate the right size to rrdvar flag members * rrdhost remaining char * members to STRING * * better error handling on indexing * strings now use a read/write lock to allow parallel searches to the index * removed AVL support from dictionaries; implemented STRING with native Judy calls * string releases should be negative * only 31 bits are allowed for enum flags * proper locking on strings * string threading unittest and fixes * fix lgtm finding * fixed naming * stream chart/dimension definitions at the beginning of a streaming session * thread stack variable is undefined on thread cancel * rrdcontext garbage collect per host on startup * worker control in garbage collection * relaxed deletion of rrdmetrics * type checking on dictfe * netdata chart to monitor rrdcontext triggers * Group chart label updates * rrdcontext better handling of collected rrdsets * rrdpush incremental transmition of definitions should use as much buffer as possible * require 1MB per chart * empty the sender buffer before enabling metrics streaming * fill up to 50% of buffer * reset signaling metrics sending * use the shared variable for status * use separate host flag for enabling streaming of metrics * make sure the flag is clear * add logging for streaming * add logging for streaming on buffer overflow * circular_buffer proper sizing * removed obsolete logs * do not execute worker jobs if not necessary * better messages about compression disabling * proper use of flags and updating rrdset last access time every time the obsoletion flag is flipped * monitor stream sender used buffer ratio * Update exporting unit tests * no need to compare label value with strcmp * streaming send workers now monitor bandwidth * workers now use strings * streaming receiver monitors incoming bandwidth * parser shift of worker ids * minor fixes * Group chart label updates * Populate context with dimensions that have data * Fix chart id * better shift of parser worker ids * fix for streaming compression * properly count received bytes * ensure LZ4 compression ring buffer does not wrap prematurely * do not stream empty charts; do not process empty instances in rrdcontext * need_to_send_chart_definition() does not need an rrdset lock any more * rrdcontext objects are collected, after data have been written to the db * better logging of RRDCONTEXT transitions * always set all variables needed by the worker utilization charts * implemented double linked list for most objects; eliminated alarm indexes from rrdhost; and many more fixes * lockless strings design - string_dup() and string_freez() are totally lockless when they dont need to touch Judy - only Judy is protected with a read/write lock * STRING code re-organization for clarity * thread_cache improvements; double numbers precision on worker threads * STRING_ENTRY now shadown STRING, so no duplicate definition is required; string_length() renamed to string_strlen() to follow the paradigm of all other functions, STRING internal statistics are now only compiled with NETDATA_INTERNAL_CHECKS * rrdhost index by hostname now cleans up; aclk queries of archieved hosts do not index hosts * Add index to speed up database context searches * Removed last_updated optimization (was also buggy after latest merge with master) Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Vladimir Kobal <vlad@prokk.net>
Diffstat (limited to 'health')
-rw-r--r--health/health.c250
-rw-r--r--health/health.h52
-rw-r--r--health/health_config.c592
-rw-r--r--health/health_json.c97
-rw-r--r--health/health_log.c292
5 files changed, 667 insertions, 616 deletions
diff --git a/health/health.c b/health/health.c
index 9eb36a9c60..c1045c3d81 100644
--- a/health/health.c
+++ b/health/health.c
@@ -155,18 +155,11 @@ static void health_reload_host(RRDHOST *host) {
// free all running alarms
rrdhost_wrlock(host);
- while(host->templates)
- rrdcalctemplate_unlink_and_free(host, host->templates);
+ while(host->alarms_templates)
+ rrdcalctemplate_unlink_and_free(host, host->alarms_templates);
- RRDCALCTEMPLATE *rt,*next;
- for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
- next = rt->next;
- rrdcalctemplate_free(rt);
- }
- host->alarms_template_with_foreach = NULL;
-
- while(host->alarms)
- rrdcalc_unlink_and_free(host, host->alarms);
+ while(host->host_alarms)
+ rrdcalc_unlink_and_free(host, host->host_alarms);
RRDCALC *rc,*nc;
for(rc = host->alarms_with_foreach; rc ; rc = nc) {
@@ -266,13 +259,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
// do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
}
@@ -292,7 +285,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(t && t->new_status == ae->new_status) {
// don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
+ debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -303,7 +296,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -312,14 +305,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
- const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
- const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
+ const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
+ const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
int n_warn=0, n_crit=0;
RRDCALC *rc;
@@ -330,13 +323,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
+ if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
+ break;
+
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_warn++;
@@ -344,7 +340,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_crit++;
@@ -379,39 +375,39 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
}
- char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
+ char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
"' '" NETDATA_DOUBLE_FORMAT_ZERO
"' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
- host->registry_hostname,
+ rrdhost_registry_hostname(host),
ae->unique_id,
ae->alarm_id,
ae->alarm_event_id,
(unsigned long)ae->when,
- ae->name,
- ae->chart?ae->chart:"NOCHART",
- ae->family?ae->family:"NOFAMILY",
+ ae_name(ae),
+ ae->chart?ae_chart_name(ae):"NOCHART",
+ ae->family?ae_family(ae):"NOFAMILY",
rrdcalc_status2string(ae->new_status),
rrdcalc_status2string(ae->old_status),
ae->new_value,
ae->old_value,
- ae->source?ae->source:"UNKNOWN",
+ ae->source?ae_source(ae):"UNKNOWN",
(uint32_t)ae->duration,
(uint32_t)ae->non_clear_duration,
- ae->units?ae->units:"",
- ae->info?ae->info:"",
- ae->new_value_string,
- ae->old_value_string,
+ ae_units(ae),
+ ae_info(ae),
+ ae_new_value_string(ae),
+ ae_old_value_string(ae),
(expr && expr->source)?expr->source:"NOSOURCE",
(expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
n_warn,
n_crit,
buffer_tostring(warn_alarms),
buffer_tostring(crit_alarms),
- ae->classification?ae->classification:"Unknown",
+ ae->classification?ae_classification(ae):"Unknown",
edit_command,
host != localhost ? host->machine_guid:""
);
@@ -450,7 +446,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
- ae->chart?ae->chart:"NOCHART", ae->name,
+ ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
ae->new_value,
rrdcalc_status2string(ae->old_status),
rrdcalc_status2string(ae->new_status)
@@ -467,7 +463,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
@@ -508,7 +504,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
host->health_log.count--;
@@ -522,7 +518,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -533,27 +529,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
*next_run = rc->next_update;
}
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
+ debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
return 0;
}
if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -566,7 +562,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -577,7 +573,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
if(needed + update_every < first || needed - update_every > last) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -614,30 +610,30 @@ static void health_main_cleanup(void *ptr) {
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
-static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
+static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
SILENCER *s;
debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
- rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
+ rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
for (s = silencers->silencers; s!=NULL; s=s->next){
if (
- (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
- (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
+ (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
+ (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
(!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
- (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
- (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
+ (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
+ (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
) {
debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
+ debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
- , rc->name
- , (rc->rrdset)?rc->rrdset->context:""
- , rc->chart
+ , rrdcalc_name(rc)
+ , (rc->rrdset)?rrdset_context(rc->rrdset):""
+ , rrdcalc_chart_name(rc)
, host
- , (rc->rrdset)?rc->rrdset->family:""
+ , (rc->rrdset)?rrdset_family(rc->rrdset):""
);
}
return silencers->stype;
@@ -664,15 +660,15 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
} else {
- SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
+ SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
}
if (rrdcalc_flags_old != rc->rrdcalc_flags) {
info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
- host->hostname,
- rc->name,
+ rrdhost_hostname(host),
+ rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
(rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
(rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
@@ -812,7 +808,7 @@ void *health_main(void *ptr) {
info(
"Postponing health checks for %"PRId64" seconds, on host '%s'.",
(int64_t)hibernation_delay,
- host->hostname);
+ rrdhost_hostname(host));
host->health_delay_up_to = now + hibernation_delay;
}
@@ -821,7 +817,7 @@ void *health_main(void *ptr) {
if (unlikely(now < host->health_delay_up_to))
continue;
- info("Resuming health checks on host '%s'.", host->hostname);
+ info("Resuming health checks on host '%s'.", rrdhost_hostname(host));
host->health_delay_up_to = 0;
}
@@ -839,7 +835,7 @@ void *health_main(void *ptr) {
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
- for (rc = host->alarms; rc; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost(host, rc) {
if (update_disabled_silenced(host, rc))
continue;
@@ -852,10 +848,33 @@ void *health_main(void *ptr) {
if (!rrdcalc_isrepeating(rc)) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
+
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->value,
+ NAN,
+ rc->status,
+ RRDCALC_STATUS_REMOVED,
+ rc->source,
+ rc->units,
+ rc->info,
+ 0,
+ rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
+
if (ae) {
health_alarm_log(host, ae);
rc->old_status = rc->status;
@@ -890,7 +909,7 @@ void *health_main(void *ptr) {
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
+ int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
rc->after, rc->before, rc->group, NULL,
0, rc->options,
&rc->db_after,&rc->db_before,
@@ -903,7 +922,7 @@ void *health_main(void *ptr) {
rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
);
} else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
@@ -930,13 +949,13 @@ void *health_main(void *ptr) {
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
);
} else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->value
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
);
}
@@ -952,7 +971,7 @@ void *health_main(void *ptr) {
rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
);
} else {
@@ -960,9 +979,9 @@ void *health_main(void *ptr) {
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, rc->calculation->result,
- buffer_tostring(rc->calculation->error_msg), rc->source
+ buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
);
rc->value = rc->calculation->result;
@@ -980,7 +999,7 @@ void *health_main(void *ptr) {
if (unlikely(runnable && !netdata_exit)) {
rrdhost_rdlock(host);
- for (rc = host->alarms; rc; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost(host, rc) {
if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
continue;
@@ -1002,15 +1021,15 @@ void *health_main(void *ptr) {
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->warning->error_msg)
);
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
);
warning_status = rrdcalc_value2status(rc->warning->result);
}
@@ -1028,16 +1047,16 @@ void *health_main(void *ptr) {
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->critical->error_msg)
);
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
- rc->source
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
+ rrdcalc_source(rc)
);
critical_status = rrdcalc_value2status(rc->critical->result);
}
@@ -1113,15 +1132,36 @@ void *health_main(void *ptr) {
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->status,
+ status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
)
);
+
health_alarm_log(host, ae);
rc->last_status_change = now;
@@ -1138,7 +1178,7 @@ void *health_main(void *ptr) {
// process repeating alarms
RRDCALC *rc;
- for(rc = host->alarms; rc ; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost(host, rc) {
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
@@ -1164,16 +1204,38 @@ void *health_main(void *ptr) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
+
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->old_status,
+ rc->status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
)
);
+
ae->last_repeat = rc->last_repeat;
if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
ae->flags |= HEALTH_ENTRY_RUN_ONCE;
diff --git a/health/health.h b/health/health.h
index 3e77c12a70..aae608b522 100644
--- a/health/health.h
+++ b/health/health.h
@@ -14,6 +14,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010
#define HEALTH_ENTRY_RUN_ONCE 0x00000020
#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040
+#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080
#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000
@@ -35,7 +36,6 @@ extern void health_init(void);