summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-09-19 23:46:13 +0300
committerGitHub <noreply@github.com>2022-09-19 23:46:13 +0300
commitcb7af25c09d8775d1967cb0553268075cda868d4 (patch)
tree9e86bc359bb2b1ec72d3a1382236703dc633ad63 /health
parent62246029160025a8d6503d9fbb617c7b029b9126 (diff)
RRD structures managed by dictionaries (#13646)
* rrdset - in progress * rrdset optimal constructor; rrdset conflict * rrdset final touches * re-organization of rrdset object members * prevent use-after-free * dictionary dfe supports also counting of iterations * rrddim managed by dictionary * rrd.h cleanup * DICTIONARY_ITEM now is referencing actual dictionary items in the code * removed rrdset linked list * Revert "removed rrdset linked list" This reverts commit 690d6a588b4b99619c2c5e10f84e8f868ae6def5. * removed rrdset linked list * added comments * Switch chart uuid to static allocation in rrdset Remove unused functions * rrdset_archive() and friends... * always create rrdfamily * enable ml_free_dimension * rrddim_foreach done with dfe * most custom rrddim loops replaced with rrddim_foreach * removed accesses to rrddim->dimensions * removed locks that are no longer needed * rrdsetvar is now managed by the dictionary * set rrdset is rrdsetvar, fixes https://github.com/netdata/netdata/pull/13646#issuecomment-1242574853 * conflict callback of rrdsetvar now properly checks if it has to reset the variable * dictionary registered callbacks accept as first parameter the DICTIONARY_ITEM * dictionary dfe now uses internal counter to report; avoided excess variables defined with dfe * dictionary walkthrough callbacks get dictionary acquired items * dictionary reference counters that can be dupped from zero * added advanced functions for get and del * rrdvar managed by dictionaries * thread safety for rrdsetvar * faster rrdvar initialization * rrdvar string lengths should match in all add, del, get functions * rrdvar internals hidden from the rest of the world * rrdvar is now acquired throughout netdata * hide the internal structures of rrdsetvar * rrdsetvar is now acquired through out netdata * rrddimvar managed by dictionary; rrddimvar linked list removed; rrddimvar structures hidden from the rest of netdata * better error handling * dont create variables if not initialized for health * dont create variables if not 
initialized for health again * rrdfamily is now managed by dictionaries; references of it are acquired dictionary items * type checking on acquired objects * rrdcalc renaming of functions * type checking for rrdfamily_acquired * rrdcalc managed by dictionaries * rrdcalc double free fix * host rrdvars is always needed * attempt to fix deadlock 1 * attempt to fix deadlock 2 * Remove unused variable * attempt to fix deadlock 3 * snprintfz * rrdcalc index in rrdset fix * Stop storing active charts and computing chart hashes * Remove store active chart function * Remove compute chart hash function * Remove sql_store_chart_hash function * Remove store_active_dimension function * dictionary delayed destruction * formatting and cleanup * zero dictionary base on rrdsetvar * added internal error to log delayed destructions of dictionaries * typo in rrddimvar * added debugging info to dictionary * debug info * fix for rrdcalc keys being empty * remove forgotten unlock * remove deadlock * Switch to metadata version 5 and drop chart_hash chart_hash_map chart_active dimension_active v_chart_hash * SQL cosmetic changes * do not busy wait while destroying a referenced dictionary * remove deadlock * code cleanup; re-organization; * fast cleanup and flushing of dictionaries * number formatting fixes * do not delete configured alerts when archiving a chart * rrddim obsolete linked list management outside dictionaries * removed duplicate contexts call * fix crash when rrdfamily is not initialized * dont keep rrddimvar referenced * properly cleanup rrdvar * removed some locks * Do not attempt to cleanup chart_hash / chart_hash_map * rrdcalctemplate managed by dictionary * register callbacks on the right dictionary * removed some more locks * rrdcalc secondary index replaced with linked-list; rrdcalc labels updates are now executed by health thread * when looking up for an alarm look using both chart id and chart name * host initialization a bit more modular * init rrdlabels on host 
update * preparation for dictionary views * improved comment * unused variables without internal checks * service threads isolation and worker info * more worker info in service thread * thread cancelability debugging with internal checks * strings data races addressed; fixes https://github.com/netdata/netdata/issues/13647 * dictionary modularization * Remove unused SQL statement definition * unit-tested thread safety of dictionaries; removed data race conditions on dictionaries and strings; dictionaries now can detect if the caller holds a write lock and automatically all the calls become their unsafe versions; all direct calls to the unsafe versions are eliminated * remove worker_is_idle() from the exit of service functions, because we lose the lock time between loops * rewritten dictionary to have 2 separate locks, one for indexing and another for traversal * Update collectors/cgroups.plugin/sys_fs_cgroup.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update collectors/cgroups.plugin/sys_fs_cgroup.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update collectors/proc.plugin/proc_net_dev.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * fix memory leak in rrdset cache_dir * minor dictionary changes * don't use index locks in single threaded * obsolete dict option * rrddim options and flags separation; rrdset_done() optimization to keep array of reference pointers to rrddim; * fix jump on uninitialized value in dictionary; remove double free of cache_dir * addressed codacy findings * removed debugging code * use the private refcount on dictionaries * make dictionary item destructors work on dictionary destruction; stricter control on dictionary API; proper cleanup sequence on rrddim; * more dictionary statistics * global statistics about dictionary operations, memory, items, callbacks * dictionary support for views - missing the public API * removed warning about unused parameter * chart and context name for cloud * chart and context name for cloud, again
* dictionary statistics fixed; first implementation of dictionary views - not currently used * only the master can globally delete an item * context needs netdata prefix * fix context and chart it of spins * fix for host variables when health is not enabled * run garbage collector on item insert too * Fix info message; remove extra "using" * update dict unittest for new placement of garbage collector * we need RRDHOST->rrdvars for maintaining custom host variables * Health initialization needs the host->host_uuid * split STRING to its own files; no code changes other than that * initialize health unconditionally * unit tests do not pollute the global scope with their variables * Skip initialization when creating archived hosts on startup. When a child connects it will initialize properly Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Vladimir Kobal <vlad@prokk.net>
Diffstat (limited to 'health')
-rw-r--r--health/health.c184
-rw-r--r--health/health.h4
-rw-r--r--health/health_config.c188
-rw-r--r--health/health_json.c31
-rw-r--r--health/health_log.c19
5 files changed, 149 insertions, 277 deletions
diff --git a/health/health.c b/health/health.c
index c1045c3d81..be8b50ebb3 100644
--- a/health/health.c
+++ b/health/health.c
@@ -153,63 +153,42 @@ static void health_reload_host(RRDHOST *host) {
char *stock_path = health_stock_config_dir();
// free all running alarms
- rrdhost_wrlock(host);
-
- while(host->alarms_templates)
- rrdcalctemplate_unlink_and_free(host, host->alarms_templates);
-
- while(host->host_alarms)
- rrdcalc_unlink_and_free(host, host->host_alarms);
-
- RRDCALC *rc,*nc;
- for(rc = host->alarms_with_foreach; rc ; rc = nc) {
- nc = rc->next;
- rrdcalc_free(rc);
- }
- host->alarms_with_foreach = NULL;
-
- rrdhost_unlock(host);
+ rrdcalc_delete_all(host);
+ rrdcalctemplate_delete_all(host);
// invalidate all previous entries in the alarm log
+ netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t->new_status != RRDCALC_STATUS_REMOVED)
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- rrdhost_rdlock(host);
// reset all thresholds to all charts
RRDSET *st;
rrdset_foreach_read(st, host) {
st->green = NAN;
st->red = NAN;
}
- rrdhost_unlock(host);
+ rrdset_foreach_done(st);
// load the new alarms
- rrdhost_wrlock(host);
health_readdir(host, user_path, stock_path, NULL);
//Discard alarms with labels that do not apply to host
- rrdcalc_labels_unlink_alarm_from_host(host);
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
// link the loaded alarms to their charts
- RRDDIM *rd;
rrdset_foreach_write(st, host) {
if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
continue;
- rrdsetcalc_link_matching(st);
- rrdcalctemplate_link_matching(st);
- //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
- rrdset_rdlock(st);
- rrddim_foreach_read(rd, st) {
- rrdcalc_link_to_rrddim(rd, st, host);
- }
- rrdset_unlock(st);
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
}
+ rrdset_foreach_done(st);
- rrdhost_unlock(host);
}
/**
@@ -323,7 +302,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- foreach_rrdcalc_in_rrdhost(host, rc) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
@@ -351,6 +330,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
}
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
if (n_warn+n_crit>1)
qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
@@ -477,13 +457,13 @@ static inline void health_alarm_log_process(RRDHOST *host) {
}
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
-
if (!cleanup_excess_log_entries)
return;
@@ -554,10 +534,8 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
}
int update_every = rc->rrdset->update_every;
- rrdset_rdlock(rc->rrdset);
- time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
- time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
- rrdset_unlock(rc->rrdset);
+ time_t first = rrdset_first_entry_t(rc->rrdset);
+ time_t last = rrdset_last_entry_t(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
debug(D_HEALTH
@@ -653,29 +631,29 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil
* @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
*/
static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
- uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
+ uint32_t rrdcalc_flags_old = rc->run_flags;
// Clear the flags
- rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
+ rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
if (unlikely(silencers->all_alarms)) {
- if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
} else {
SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
- if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
}
- if (rrdcalc_flags_old != rc->rrdcalc_flags) {
+ if (rrdcalc_flags_old != rc->run_flags) {
info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
rrdhost_hostname(host),
rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
+ (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
(rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
+ (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
);
}
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED)
return 1;
else
return 0;
@@ -683,36 +661,38 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
// Create alarms for dimensions that have been added to charts
// since the previous iteration.
-static void init_pending_foreach_alarms(RRDHOST *host) {
+static void health_execute_pending_updates(RRDHOST *host) {
RRDSET *st;
- RRDDIM *rd;
if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
return;
- rrdhost_wrlock(host);
-
- rrdset_foreach_write(st, host) {
- if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
+ rrdset_foreach_reentrant(st, host) {
+ if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
continue;
- rrdset_rdlock(st);
-
+ RRDDIM *rd;
rrddim_foreach_read(rd, st) {
- if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
+ if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS))
continue;
- rrdcalc_link_to_rrddim(rd, st, host);
+ RRDCALCTEMPLATE *rt;
+ foreach_rrdcalctemplate_read(host, rt) {
+ if(!rt->foreach_dimension_pattern)
+ continue;
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
+ if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
+ rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
+ }
+ foreach_rrdcalctemplate_done(rt);
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS);
+ }
+ rrddim_foreach_done(rd);
rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- rrdset_unlock(st);
}
-
+ rrdset_foreach_done(st);
rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
- rrdhost_unlock(host);
}
/**
@@ -759,7 +739,7 @@ void *health_main(void *ptr) {
time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
- rrdcalc_labels_unlink();
+ rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
unsigned int loop = 0;
#ifdef ENABLE_ACLK
@@ -829,13 +809,14 @@ void *health_main(void *ptr) {
if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
sql_health_alarm_log_cleanup(host);
- init_pending_foreach_alarms(host);
+ health_execute_pending_updates(host);
worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- rrdhost_rdlock(host);
// the first loop is to lookup values from the db
- foreach_rrdcalc_in_rrdhost(host, rc) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+
+ rrdcalc_update_info_using_rrdset_labels(rc);
if (update_disabled_silenced(host, rc))
continue;
@@ -876,7 +857,7 @@ void *health_main(void *ptr) {
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
if (ae) {
- health_alarm_log(host, ae);
+ health_alarm_log_add_entry(host, ae);
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
rc->last_status_change = now;
@@ -891,14 +872,14 @@ void *health_main(void *ptr) {
}
if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
- if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
+ if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
+ rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
continue;
}
runnable++;
rc->old_value = rc->value;
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
+ rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
// ------------------------------------------------------------
// if there is database lookup, do it
@@ -919,13 +900,13 @@ void *health_main(void *ptr) {
if (unlikely(ret != 200)) {
// database lookup failed
rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
);
} else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
+ rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
/* - RRDCALC_FLAG_DB_STALE not currently used
if (unlikely(old_db_timestamp == rc->db_before)) {
@@ -945,14 +926,14 @@ void *health_main(void *ptr) {
if (unlikely(value_is_null)) {
// collected value is null
rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
+ rc->run_flags |= RRDCALC_FLAG_DB_NAN;
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
);
} else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
+ rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
@@ -968,14 +949,14 @@ void *health_main(void *ptr) {
if (unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
);
} else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
+ rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
NETDATA_DOUBLE_FORMAT
@@ -985,25 +966,17 @@ void *health_main(void *ptr) {
);
rc->value = rc->calculation->result;
-
- if (rc->local) rc->local->last_updated = now;
- if (rc->family) rc->family->last_updated = now;
- if (rc->hostid) rc->hostid->last_updated = now;
- if (rc->hostname) rc->hostname->last_updated = now;
}
}
}
-
- rrdhost_unlock(host);
+ foreach_rrdcalc_in_rrdhost_done(rc);
if (unlikely(runnable && !netdata_exit)) {
- rrdhost_rdlock(host);
-
- foreach_rrdcalc_in_rrdhost(host, rc) {
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
continue;
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
continue;
}
RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
@@ -1017,7 +990,7 @@ void *health_main(void *ptr) {
if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
@@ -1025,7 +998,7 @@ void *health_main(void *ptr) {
buffer_tostring(rc->warning->error_msg)
);
} else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
+ rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
@@ -1043,7 +1016,7 @@ void *health_main(void *ptr) {
if (unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
@@ -1051,7 +1024,7 @@ void *health_main(void *ptr) {
buffer_tostring(rc->critical->error_msg)
);
} else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
+ rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
@@ -1156,13 +1129,13 @@ void *health_main(void *ptr) {
rc->info,
rc->delay_last,
(
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
(rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
)
);
- health_alarm_log(host, ae);
+ health_alarm_log_add_entry(host, ae);
rc->last_status_change = now;
rc->old_status = rc->status;
@@ -1175,20 +1148,20 @@ void *health_main(void *ptr) {
if (next_run > rc->next_update)
next_run = rc->next_update;
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
// process repeating alarms
- RRDCALC *rc;
- foreach_rrdcalc_in_rrdhost(host, rc) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->warn_repeat_every;
} else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->crit_repeat_every;
} else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
+ if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
repeat_every = 1;
} else if (rc->old_status == RRDCALC_STATUS_WARNING) {
@@ -1230,25 +1203,24 @@ void *health_main(void *ptr) {
rc->info,
rc->delay_last,
(
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
(rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
)
);
ae->last_repeat = rc->last_repeat;
- if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
+ if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
ae->flags |= HEALTH_ENTRY_RUN_ONCE;
}
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
+ rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
}
}
-
- rrdhost_unlock(host);
+ foreach_rrdcalc_in_rrdhost_done(rc);
}
if (unlikely(netdata_exit))
diff --git a/health/health.h b/health/health.h
index aae608b522..aa7a5c65ed 100644
--- a/health/health.h
+++ b/health/health.h
@@ -74,7 +74,7 @@ extern ALARM_ENTRY* health_create_alarm_entry(
int delay,
uint32_t flags);
-extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
+extern void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae);
extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
extern char *health_user_config_dir(void);
@@ -90,6 +90,4 @@ extern void health_label_log_save(RRDHOST *host);
extern char *health_edit_command_from_source(const char *source);
extern void sql_refresh_hashes(void);
-extern SIMPLE_PATTERN *health_pattern_from_foreach(const char *s);
-
#endif //NETDATA_HEALTH_H
diff --git a/health/health_config.c b/health/health_config.c
index eb09a5f81d..365c4d015d 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -33,122 +33,6 @@
#define HEALTH_HOST_LABEL_KEY "host labels"
#define HEALTH_FOREACH_KEY "foreach"
-static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
- if(!rc->chart) {
- error("Health configuration for alarm '%s' does not have a chart", rrdcalc_name(rc));
- return 0;
- }
-
- if(!rc->update_every) {
- error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->calculation && !rc->warning && !rc->critical) {
- error("Health configuration for alarm '%s.%s' is useless (no db lookup, no calculation, no warning and no critical expressions)", rrdcalc_chart_name(rc), rrdcalc_name(rc));
- return 0;
- }
-
- if (rrdcalc_exists(host, rrdcalc_chart_name(rc), rrdcalc_name(rc)))
- return 0;
-
- rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
-
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
- ", red " NETDATA_DOUBLE_FORMAT_AUTO
- ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
- rrdcalc_chart_name(rc),
- rrdcalc_name(rc),
- rc->id,
- (rc->exec)?rrdcalc_exec(rc):"DEFAULT",
- (rc->recipient)?rrdcalc_recipient(rc):"DEFAULT",
- rc->green,
- rc->red,
- (int)rc->group,
- rc->after,
- rc->before,
- rc->options,
- (rc->dimensions)?rrdcalc_dimensions(rc):"NONE",
- (rc->foreachdim)?rrdcalc_foreachdim(rc):"NONE",
- rc->update_every,
- (rc->calculation)?rc->calculation->parsed_as:"NONE",
- (rc->warning)?rc->warning->parsed_as:"NONE",
- (rc->critical)?rc->critical->parsed_as:"NONE",
- rrdcalc_source(rc),
- rc->delay_up_duration,
- rc->delay_down_duration,
- rc->delay_max_duration,
- rc->delay_multiplier,
- rc->warn_repeat_every,
- rc->crit_repeat_every
- );
-
- rrdcalc_add_to_host(host, rc);
-
- return 1;
-}
-
-static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
- if(unlikely(!rt->context)) {
- error("Health configuration for template '%s' does not have a context", rrdcalctemplate_name(rt));
- return 0;
- }
-
- if(unlikely(!rt->update_every)) {
- error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rrdcalctemplate_name(rt));
- return 0;
- }
-
- if(unlikely(!RRDCALCTEMPLATE_HAS_DB_LOOKUP(rt) && !rt->calculation && !rt->warning && !rt->critical)) {
- error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rrdcalctemplate_name(rt));
- return 0;
- }
-
- RRDCALCTEMPLATE *t;
- foreach_rrdcalctemplate_in_rrdhost(host, t) {
- if(unlikely(t->name == rt->name && !strcmp(t->family_match?rrdcalctemplate_family_match(t):"*", rt->family_match?rrdcalctemplate_family_match(rt):"*"))) {
- info("Health configuration template '%s' already exists for host '%s'.", rrdcalctemplate_name(rt), rrdhost_hostname(host));
- return 0;
- }
- }
-
- if(rt->foreachdim)
- DOUBLE_LINKED_LIST_PREPEND_UNSAFE(host->alarms_templates, rt, prev, next);
- else
- DOUBLE_LINKED_LIST_APPEND_UNSAFE(host->alarms_templates, rt, prev, next);
-
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
- ", red " NETDATA_DOUBLE_FORMAT_AUTO
- ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
- rrdcalctemplate_name(rt),
- (rt->context)?string2str(rt->context):"NONE",
- (rt->exec)?rrdcalctemplate_exec(rt):"DEFAULT",
- (rt->recipient)?rrdcalctemplate_recipient(rt):"DEFAULT",
- rt->green,
- rt->red,
- (int)rt->group,
- rt->after,
- rt->before,
- rt->options,
- (rt->dimensions)?rrdcalctemplate_dimensions(rt):"NONE",
- (rt->foreachdim)?rrdcalctemplate_foreachdim(rt):"NONE",
- rt->update_every,
- (rt->calculation)?rt->calculation->parsed_as:"NONE",
- (rt->warning)?rt->warning->parsed_as:"NONE",
- (rt->critical)?rt->critical->parsed_as:"NONE",
- rrdcalctemplate_source(rt),
- rt->delay_up_duration,
- rt->delay_down_duration,
- rt->delay_max_duration,
- rt->delay_multiplier,
- rt->warn_repeat_every,
- rt->crit_repeat_every
- );
-
-
- return 1;
-}
-
static inline int health_parse_delay(
size_t line, const char *filename, char *string,
int *delay_up_duration,
@@ -249,7 +133,7 @@ static inline uint32_t health_parse_options(const char *s) {
buf[count] = '\0';
if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
- options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION;
+ options |= RRDCALC_OPTION_NO_CLEAR_NOTIFICATION;
else
error("Ignoring unknown alarm option '%s'", buf);
}
@@ -308,13 +192,21 @@ static inline int health_parse_repeat(
*
* @param s the string that will be used to create the simple pattern.
*/
-SIMPLE_PATTERN *health_pattern_from_foreach(const char *s) {
+
+static void dimension_remove_pipe_comma(char *str) {
+ while(*str) {
+ if(*str == '|' || *str == ',') *str = ' ';
+ str++;
+ }
+}
+
+static SIMPLE_PATTERN *health_pattern_from_foreach(const char *s) {
char *convert= strdupz(s);
SIMPLE_PATTERN *val = NULL;
+
if(convert) {
dimension_remove_pipe_comma(convert);
val = simple_pattern_create(convert, NULL, SIMPLE_PATTERN_EXACT);
-
freez(convert);
}
@@ -324,7 +216,7 @@ SIMPLE_PATTERN *health_pattern_from_foreach(const char *s) {
static inline int health_parse_db_lookup(
size_t line, const char *filename, char *string,
RRDR_GROUPING *group_method, int *after, int *before, int *every,
- uint32_t *options, STRING **dimensions, STRING **foreachdim
+ RRDCALC_OPTIONS *options, STRING **dimensions, STRING **foreachdim
) {
debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string);
@@ -335,7 +227,7 @@ static inline int health_parse_db_lookup(
*after = 0;
*before = 0;
*every = 0;
- *options = 0;
+ *options = (*options) & RRDCALC_ALL_OPTIONS_EXCLUDING_THE_RRDR_ONES; // preserve rrdcalc options
char *s = string, *key;
@@ -644,16 +536,20 @@ static int health_readfile(const char *filename, void *data) {
if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
if(rc) {
- if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
- rrdcalc_free(rc);
- }
+ if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalc_free_unused_rrdcalc_loaded_from_config(rc);
+ else
+ rrdcalc_add_from_config(host, rc);
+