diff options
author | Costa Tsaousis <costa@netdata.cloud> | 2022-09-05 19:31:06 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-05 19:31:06 +0300 |
commit | 5e1b95cf92168c4df74586fb4430dc284806da82 (patch) | |
tree | f42077d8b02eaf316683453a7474bd1f599a833d /health | |
parent | 544aef1fde6e79ac57d2dea85d3f063076d7f885 (diff) |
Deduplicate all netdata strings (#13570)
* rrdfamily
* rrddim
* rrdset plugin and module names
* rrdset units
* rrdset type
* rrdset family
* rrdset title
* rrdset title more
* rrdset context
* rrdcalctemplate context and removal of context hash from rrdset
* strings statistics
* rrdset name
* rearranged members of rrdset
* eliminate rrdset name hash; rrdcalc chart converted to STRING
* rrdset id, eliminated rrdset hash
* rrdcalc, alarm_entry, alert_config and some of rrdcalctemplate
* rrdcalctemplate
* rrdvar
* eval_variable
* rrddimvar and rrdsetvar
* rrdhost hostname, os and tags
* fix master commits
* added thread cache; implemented string_dup without locks
* faster thread cache
* rrdset and rrddim now use dictionaries for indexing
* rrdhost now uses dictionary
* rrdfamily now uses DICTIONARY
* rrdvar using dictionary instead of AVL
* allocate the right size to rrdvar flag members
* rrdhost remaining char * members to STRING *
* better error handling on indexing
* strings now use a read/write lock to allow parallel searches to the index
* removed AVL support from dictionaries; implemented STRING with native Judy calls
* string releases should be negative
* only 31 bits are allowed for enum flags
* proper locking on strings
* string threading unittest and fixes
* fix lgtm finding
* fixed naming
* stream chart/dimension definitions at the beginning of a streaming session
* thread stack variable is undefined on thread cancel
* rrdcontext garbage collect per host on startup
* worker control in garbage collection
* relaxed deletion of rrdmetrics
* type checking on dictfe
* netdata chart to monitor rrdcontext triggers
* Group chart label updates
* rrdcontext better handling of collected rrdsets
* rrdpush incremental transmition of definitions should use as much buffer as possible
* require 1MB per chart
* empty the sender buffer before enabling metrics streaming
* fill up to 50% of buffer
* reset signaling metrics sending
* use the shared variable for status
* use separate host flag for enabling streaming of metrics
* make sure the flag is clear
* add logging for streaming
* add logging for streaming on buffer overflow
* circular_buffer proper sizing
* removed obsolete logs
* do not execute worker jobs if not necessary
* better messages about compression disabling
* proper use of flags and updating rrdset last access time every time the obsoletion flag is flipped
* monitor stream sender used buffer ratio
* Update exporting unit tests
* no need to compare label value with strcmp
* streaming send workers now monitor bandwidth
* workers now use strings
* streaming receiver monitors incoming bandwidth
* parser shift of worker ids
* minor fixes
* Group chart label updates
* Populate context with dimensions that have data
* Fix chart id
* better shift of parser worker ids
* fix for streaming compression
* properly count received bytes
* ensure LZ4 compression ring buffer does not wrap prematurely
* do not stream empty charts; do not process empty instances in rrdcontext
* need_to_send_chart_definition() does not need an rrdset lock any more
* rrdcontext objects are collected, after data have been written to the db
* better logging of RRDCONTEXT transitions
* always set all variables needed by the worker utilization charts
* implemented double linked list for most objects; eliminated alarm indexes from rrdhost; and many more fixes
* lockless strings design - string_dup() and string_freez() are totally lockless when they dont need to touch Judy - only Judy is protected with a read/write lock
* STRING code re-organization for clarity
* thread_cache improvements; double numbers precision on worker threads
* STRING_ENTRY now shadown STRING, so no duplicate definition is required; string_length() renamed to string_strlen() to follow the paradigm of all other functions, STRING internal statistics are now only compiled with NETDATA_INTERNAL_CHECKS
* rrdhost index by hostname now cleans up; aclk queries of archieved hosts do not index hosts
* Add index to speed up database context searches
* Removed last_updated optimization (was also buggy after latest merge with master)
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Vladimir Kobal <vlad@prokk.net>
Diffstat (limited to 'health')
-rw-r--r-- | health/health.c | 250 | ||||
-rw-r--r-- | health/health.h | 52 | ||||
-rw-r--r-- | health/health_config.c | 592 | ||||
-rw-r--r-- | health/health_json.c | 97 | ||||
-rw-r--r-- | health/health_log.c | 292 |
5 files changed, 667 insertions, 616 deletions
diff --git a/health/health.c b/health/health.c index 9eb36a9c60..c1045c3d81 100644 --- a/health/health.c +++ b/health/health.c @@ -155,18 +155,11 @@ static void health_reload_host(RRDHOST *host) { // free all running alarms rrdhost_wrlock(host); - while(host->templates) - rrdcalctemplate_unlink_and_free(host, host->templates); + while(host->alarms_templates) + rrdcalctemplate_unlink_and_free(host, host->alarms_templates); - RRDCALCTEMPLATE *rt,*next; - for(rt = host->alarms_template_with_foreach; rt ; rt = next) { - next = rt->next; - rrdcalctemplate_free(rt); - } - host->alarms_template_with_foreach = NULL; - - while(host->alarms) - rrdcalc_unlink_and_free(host, host->alarms); + while(host->host_alarms) + rrdcalc_unlink_and_free(host, host->host_alarms); RRDCALC *rc,*nc; for(rc = host->alarms_with_foreach; rc ; rc = nc) { @@ -266,13 +259,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { // do not send notifications for internal statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); // mark it as run, so that we will send the same alarm if it happens again goto done; } @@ -292,7 +285,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // we have executed this alarm notification in the past if(t && t->new_status == ae->new_status) { // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name + debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); goto done; } @@ -303,7 +296,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" - , ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } } @@ -312,14 +305,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; - const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; - const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; + const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec); + const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient); int n_warn=0, n_crit=0; RRDCALC *rc; @@ -330,13 +323,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) { + foreach_rrdcalc_in_rrdhost(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE)) + break; + if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_warn++; @@ -344,7 +340,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_crit++; @@ -379,39 +375,39 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } - char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); + char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO "' '" NETDATA_DOUBLE_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, - host->registry_hostname, + rrdhost_registry_hostname(host), ae->unique_id, ae->alarm_id, ae->alarm_event_id, (unsigned long)ae->when, - ae->name, - ae->chart?ae->chart:"NOCHART", - ae->family?ae->family:"NOFAMILY", + ae_name(ae), + ae->chart?ae_chart_name(ae):"NOCHART", + ae->family?ae_family(ae):"NOFAMILY", rrdcalc_status2string(ae->new_status), rrdcalc_status2string(ae->old_status), ae->new_value, ae->old_value, - ae->source?ae->source:"UNKNOWN", + ae->source?ae_source(ae):"UNKNOWN", (uint32_t)ae->duration, (uint32_t)ae->non_clear_duration, - ae->units?ae->units:"", - ae->info?ae->info:"", - ae->new_value_string, - ae->old_value_string, + ae_units(ae), + ae_info(ae), + ae_new_value_string(ae), + ae_old_value_string(ae), (expr && expr->source)?expr->source:"NOSOURCE", (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", n_warn, n_crit, buffer_tostring(warn_alarms), buffer_tostring(crit_alarms), - ae->classification?ae->classification:"Unknown", + ae->classification?ae_classification(ae):"Unknown", edit_command, host != localhost ? host->machine_guid:"" ); @@ -450,7 +446,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", - ae->chart?ae->chart:"NOCHART", ae->name, + ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae), ae->new_value, rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status) @@ -467,7 +463,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { if(unlikely( !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) @@ -508,7 +504,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); host->health_log.count--; @@ -522,7 +518,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { if(unlikely(!rc->rrdset)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -533,27 +529,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) *next_run = rc->next_update; } - debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now)); + debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); return 0; } if(unlikely(!rc->update_every)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -566,7 +562,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) if(unlikely(now + update_every < first /* || now - update_every > last */)) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." - , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first , (unsigned long) last); return 0; } @@ -577,7 +573,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) if(needed + update_every < first || needed - update_every > last) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." - , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first , (unsigned long) last); return 0; } @@ -614,30 +610,30 @@ static void health_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { +static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) { SILENCER *s; debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", - rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); + rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):""); for (s = silencers->silencers; s!=NULL; s=s->next){ if ( - (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) && - (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) && + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) && (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && - (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) && - (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family))) + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset)))) ) { debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { - debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); + debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" - , rc->name - , (rc->rrdset)?rc->rrdset->context:"" - , rc->chart + , rrdcalc_name(rc) + , (rc->rrdset)?rrdset_context(rc->rrdset):"" + , rrdcalc_chart_name(rc) , host - , (rc->rrdset)?rc->rrdset->family:"" + , (rc->rrdset)?rrdset_family(rc->rrdset):"" ); } return silencers->stype; @@ -664,15 +660,15 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; } else { - SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers); + SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers); if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; } if (rrdcalc_flags_old != rc->rrdcalc_flags) { info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", - host->hostname, - rc->name, + rrdhost_hostname(host), + rrdcalc_name(rc), (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false", (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false", @@ -812,7 +808,7 @@ void *health_main(void *ptr) { info( "Postponing health checks for %"PRId64" seconds, on host '%s'.", (int64_t)hibernation_delay, - host->hostname); + rrdhost_hostname(host)); host->health_delay_up_to = now + hibernation_delay; } @@ -821,7 +817,7 @@ void *health_main(void *ptr) { if (unlikely(now < host->health_delay_up_to)) continue; - info("Resuming health checks on host '%s'.", host->hostname); + info("Resuming health checks on host '%s'.", rrdhost_hostname(host)); host->health_delay_up_to = 0; } @@ -839,7 +835,7 @@ void *health_main(void *ptr) { rrdhost_rdlock(host); // the first loop is to lookup values from the db - for (rc = host->alarms; rc; rc = rc->next) { + foreach_rrdcalc_in_rrdhost(host, rc) { if (update_disabled_silenced(host, rc)) continue; @@ -852,10 +848,33 @@ void *health_main(void *ptr) { if (!rrdcalc_isrepeating(rc)) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); + ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0); + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->value, + NAN, + rc->status, + RRDCALC_STATUS_REMOVED, + rc->source, + rc->units, + rc->info, + 0, + rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + if (ae) { health_alarm_log(host, ae); rc->old_status = rc->status; @@ -890,7 +909,7 @@ void *health_main(void *ptr) { /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; - int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1, rc->after, rc->before, rc->group, NULL, 0, rc->options, &rc->db_after,&rc->db_before, @@ -903,7 +922,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret ); } else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; @@ -930,13 +949,13 @@ void *health_main(void *ptr) { debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) ); } else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->value + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value ); } @@ -952,7 +971,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) ); } else { @@ -960,9 +979,9 @@ void *health_main(void *ptr) { debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), rc->source + buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc) ); rc->value = rc->calculation->result; @@ -980,7 +999,7 @@ void *health_main(void *ptr) { if (unlikely(runnable && !netdata_exit)) { rrdhost_rdlock(host); - for (rc = host->alarms; rc; rc = rc->next) { + foreach_rrdcalc_in_rrdhost(host, rc) { if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) continue; @@ -1002,15 +1021,15 @@ void *health_main(void *ptr) { debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->warning->error_msg) ); } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", - rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc) ); warning_status = rrdcalc_value2status(rc->warning->result); } @@ -1028,16 +1047,16 @@ void *health_main(void *ptr) { debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->critical->error_msg) ); } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", - rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), - rc->source + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg), + rrdcalc_source(rc) ); critical_status = rrdcalc_value2status(rc->critical->result); } @@ -1113,15 +1132,36 @@ void *health_main(void *ptr) { ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + status, + rc->source, + rc->units, + rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) ) ); + health_alarm_log(host, ae); rc->last_status_change = now; @@ -1138,7 +1178,7 @@ void *health_main(void *ptr) { // process repeating alarms RRDCALC *rc; - for(rc = host->alarms; rc ; rc = rc->next) { + foreach_rrdcalc_in_rrdhost(host, rc) { int repeat_every = 0; if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { @@ -1164,16 +1204,38 @@ void *health_main(void *ptr) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; + ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->old_status, + rc->status, + rc->source, + rc->units, + rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) ) ); + ae->last_repeat = rc->last_repeat; if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { ae->flags |= HEALTH_ENTRY_RUN_ONCE; diff --git a/health/health.h b/health/health.h index 3e77c12a70..aae608b522 100644 --- a/health/health.h +++ b/health/health.h @@ -14,6 +14,7 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_SILENCED 0x00000010 #define HEALTH_ENTRY_RUN_ONCE 0x00000020 #define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040 +#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080 #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 #define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000 @@ -35,7 +36,6 @@ extern void health_init(void); |