diff options
author | thiagoftsm <thiagoftsm@gmail.com> | 2019-07-01 11:55:16 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-01 11:55:16 +0000 |
commit | dd73f3e0cd0d9a1797f0c82de0fae5ccf97e9464 (patch) | |
tree | 239fdd722a21d0d3f1e8adecc788edeca5eb907f /health | |
parent | 266cbec7a8c8f626b9a73a8f6ea7e269b9ce837e (diff) |
Repeating alarm notifications (#6309)
* Alarm_repeat mergin the original!
* Alarm_repeat binary tree!
* Alarm_repeat binary tree finished!
* Alarm_repeat move function and format string
* Alarms bringing a new Binary tree
* Alarms fixing the last two
* Alarm_repeat useless var!
* Alarm fix format and repeat alarm!
* Alarm_backend steps!
* Alarm_repeat stopping to test cloud!
* Alarm_repeat stopping to test cloud 2!
* Alarm_repeat fixing when restart!
Diffstat (limited to 'health')
-rw-r--r-- | health/README.md | 22 | ||||
-rw-r--r-- | health/health.c | 84 | ||||
-rw-r--r-- | health/health.h | 4 | ||||
-rw-r--r-- | health/health_config.c | 136 | ||||
-rw-r--r-- | health/health_json.c | 4 | ||||
-rw-r--r-- | health/health_log.c | 57 |
6 files changed, 221 insertions, 86 deletions
diff --git a/health/README.md b/health/README.md index 54f6a3e1f7..81cc043d0e 100644 --- a/health/README.md +++ b/health/README.md @@ -11,7 +11,6 @@ packet dropped). Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.). - Each alarm can execute a single query to the database using statistical algorithms against past data, but alarms can be combined. So, if you need 2 queries in the database, you can combine 2 alarms together (both will run a query to the database, and the results can be combined). @@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X] their matching one) and a delay is in place. - All are reset to their defaults when the alarm switches state without a delay in place. +--- + +#### Alarm line `repeat` + +Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. + +Format: + +``` +repeat: [off] [warning DURATION] [critical DURATION] +``` + +* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has been enabled in health configuration. +* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode. +* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode. + +--- + #### Alarm line `option` The only possible value for the `option` line is @@ -567,12 +584,15 @@ template: disk_full_percent every: 1m warn: $this > 80 crit: $this > 95 + repeat: warning 120s critical 10s ``` `$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. +This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode. + ### Example 3 Predict if any disk will run out of space in the near future. diff --git a/health/health.c b/health/health.c index f92a1ba6b0..eec42d420d 100644 --- a/health/health.c +++ b/health/health.c @@ -255,17 +255,18 @@ static inline void health_alarm_log_process(RRDHOST *host) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) { - if(unlikely( - !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && - !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) + for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { + if(likely(!alarm_entry_isrepeating(host, ae))) { + if(unlikely( + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; - - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); + } } } @@ -295,6 +296,10 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; health_alarm_log_free_one_nochecks_nounlink(ae); + if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_log_free_one_nochecks_nounlink(ae); + host->health_log.count--; + } ae = t; host->health_log.count--; @@ -411,7 +416,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" - , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rc->name , (rc->rrdset)?rc->rrdset->context:"" , rc->chart @@ -756,20 +761,22 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - - ); - - rc->last_status_change = now; - rc->status = status; + if(likely(!rrdcalc_isrepeating(rc))) { + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + } + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } rc->last_updated = now; @@ -779,6 +786,35 @@ void *health_main(void *ptr) { next_run = rc->next_update; } + // process repeating alarms + RRDCALC *rc; + for(rc = host->alarms; rc ; rc = rc->next) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc))) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) + repeat_every = rc->warn_repeat_every; + else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) + repeat_every = rc->crit_repeat_every; + } + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + rc->last_repeat = now; + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + ae->last_repeat = rc->last_repeat; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_log_free_one_nochecks_nounlink(ae); + } + } + rrdhost_unlock(host); } diff --git a/health/health.h b/health/health.h index 1511f36486..cc0692a567 100644 --- a/health/health.h +++ b/health/health.h @@ -108,7 +108,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename); extern void health_alarm_log_load(RRDHOST *host); -extern void health_alarm_log( +extern ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -129,6 +129,8 @@ extern void health_alarm_log( int delay, uint32_t flags); +extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae); + extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); extern char *health_user_config_dir(void); extern char *health_stock_config_dir(void); diff --git a/health/health_config.c b/health/health_config.c index 35fde90bc2..17108dd326 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -23,6 +23,7 @@ #define HEALTH_INFO_KEY "info" #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" +#define HEALTH_REPEAT_KEY "repeat" static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { @@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->delay_up_duration, rc->delay_down_duration, rc->delay_max_duration, - rc->delay_multiplier + rc->delay_multiplier, + rc->warn_repeat_every, + rc->crit_repeat_every ); - rrdcalc_create_part2(host, rc); + rrdcalc_add_to_host(host, rc); return 1; } @@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rt->name, (rt->context)?rt->context:"NONE", (rt->exec)?rt->exec:"DEFAULT", @@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL rt->delay_up_duration, rt->delay_down_duration, rt->delay_max_duration, - rt->delay_multiplier + rt->delay_multiplier, + rt->warn_repeat_every, + rt->crit_repeat_every ); if(likely(last)) { @@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL return 1; } -static inline int health_parse_duration(char *string, int *result) { - // make sure it is a number - if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) { - *result = 0; - return 0; - } - - char *e = NULL; - calculated_number n = str2ld(string, &e); - if(e && *e) { - switch (*e) { - case 'Y': - *result = (int) (n * 86400 * 365); - break; - case 'M': - *result = (int) (n * 86400 * 30); - break; - case 'w': - *result = (int) (n * 86400 * 7); - break; - case 'd': - *result = (int) (n * 86400); - break; - case 'h': - *result = (int) (n * 3600); - break; - case 'm': - *result = (int) (n * 60); - break; - - default: - case 's': - *result = (int) (n); - break; - } - } - else - *result = (int)(n); - - return 1; -} - static inline int health_parse_delay( size_t line, const char *filename, char *string, int *delay_up_duration, @@ -202,14 +165,14 @@ static inline int health_parse_delay( while(*s && isspace(*s)) *s++ = '\0'; if(!strcasecmp(key, "up")) { - if (!health_parse_duration(value, delay_up_duration)) { + if (!config_parse_duration(value, delay_up_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } else given_up = 1; } else if(!strcasecmp(key, "down")) { - if (!health_parse_duration(value, delay_down_duration)) { + if (!config_parse_duration(value, delay_down_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -224,7 +187,7 @@ static inline int health_parse_delay( else given_multiplier = 1; } else if(!strcasecmp(key, "max")) { - if (!health_parse_duration(value, delay_max_duration)) { + if (!config_parse_duration(value, delay_max_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) { return options; } +static inline int health_parse_repeat( + size_t line, + const char *file, + char *string, + uint32_t *warn_repeat_every, + uint32_t *crit_repeat_every +) { + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!strcasecmp(key, "off")) { + *warn_repeat_every = 0; + *crit_repeat_every = 0; + return 1; + } + if(!strcasecmp(key, "warning")) { + if (!config_parse_duration(value, (int*)warn_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + else if(!strcasecmp(key, "critical")) { + if (!config_parse_duration(value, (int*)crit_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + } + + return 1; +} + + static inline int health_parse_db_lookup( size_t line, const char *filename, char *string, RRDR_GROUPING *group_method, int *after, int *before, int *every, @@ -322,7 +329,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if(!health_parse_duration(key, after)) { + if(!config_parse_duration(key, after)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", line, filename, key); return 0; @@ -343,7 +350,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, before)) { + if (!config_parse_duration(value, before)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -353,7 +360,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, every)) { + if (!config_parse_duration(value, every)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) { hash_info = 0, hash_recipient = 0, hash_delay = 0, - hash_options = 0; + hash_options = 0, + hash_repeat = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; @@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) { hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); hash_delay = simple_uhash(HEALTH_DELAY_KEY); hash_options = simple_uhash(HEALTH_OPTIONS_KEY); + hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); } FILE *fp = fopen(filename, "r"); @@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) { rc->value = NAN; rc->old_value = NAN; rc->delay_multiplier = 1.0; + rc->old_status = RRDCALC_STATUS_UNINITIALIZED; + rc->warn_repeat_every = host->health_default_warn_repeat_every; + rc->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rc->name)) error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); @@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) { rt->green = NAN; rt->red = NAN; rt->delay_multiplier = 1.0; + rt->warn_repeat_every = host->health_default_warn_repeat_every; + rt->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rt->name)) error("Health configuration renamed template '%s' to '%s'", value, rt->name); @@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) { &rc->options, &rc->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rc->update_every)) + if(!config_parse_duration(value, &rc->update_every)) error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rc->name, key, value); } @@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rc->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rc->warn_repeat_every, + &rc->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", line, filename, rc->name, key); @@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) { &rt->update_every, &rt->options, &rt->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rt->update_every)) + if(!config_parse_duration(value, &rt->update_every)) error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rt->name, key, value); } @@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rt->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rt->warn_repeat_every, + &rt->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", line, filename, rt->name, key); diff --git a/health/health_json.c b/health/health_json.c index 7811324472..e923b05c6c 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"delay_multiplier\": %f,\n" "\t\t\t\"delay\": %d,\n" "\t\t\t\"delay_up_to_timestamp\": %lu,\n" + "\t\t\t\"warn_repeat_every\": \"%u\",\n" + "\t\t\t\"crit_repeat_every\": \"%u\",\n" "\t\t\t\"value_string\": \"%s\",\n" , rc->chart, rc->name , (unsigned long)rc->id @@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->delay_multiplier , rc->delay_last , (unsigned long)rc->delay_up_to_timestamp + , rc->warn_repeat_every + , rc->crit_repeat_every , value_string ); diff --git a/health/health_log.c b/health/health_log.c index 009e426737..c91cde6cbf 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" "\t%d\t%d\t%d\t%d" "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO + "\t%016lx" "\n" , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' , host->hostname @@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { , ae->new_value , ae->old_value + , (uint64_t)ae->last_repeat ) < 0)) error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename); else { @@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena continue; } + // Check if we got last_repeat field + time_t last_repeat = 0; + if(entries > 27) { + char* alarm_name = pointers[13]; + last_repeat = (time_t)strtoul(pointers[27], NULL, 16); + + RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + if (!rc) { + for(rc = host->alarms; rc ; rc = rc->next) { + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc); + if(rdcmp != rc) { + error("Cannot insert the alarm index ID using log %s", rc->name); + } + } + + rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + } + + if(unlikely(rc)) { + if (rrdcalc_isrepeating(rc)) { + rc->last_repeat = last_repeat; + // We iterate through repeating alarm entries only to + // find the latest last_repeat timestamp. Otherwise, + // there is no need to keep them in memory. + continue; + } + } + } + if(unlikely(*pointers[0] == 'A')) { // make sure it is properly numbered if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { - error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id); + error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it." + , host->hostname, line, filename, unique_id); errored++; continue; } @@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena } else if(unlikely(*pointers[0] == 'U')) { // find the original - for(ae = host->health_log.alarms; ae; ae = ae->next) { + for(ae = host->health_log.alarms; ae ; ae = ae->next) { if(unlikely(unique_id == ae->unique_id)) { if(unlikely(*pointers[0] == 'A')) { error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later." - , host->hostname, line, filename, unique_id); + , host->hostname, line, filename, unique_id); *pointers[0] = 'U'; duplicate++; } @@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena ae->new_value = str2l(pointers[25]); ae->old_value = str2l(pointers[26]); + ae->last_repeat = last_repeat; + char value_string[100 + 1]; freez(ae->old_value_string); freez(ae->new_value_string); @@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) { // ---------------------------------------------------------------------------- // health alarm log management -inline void health_alarm_log( +inline ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -398,9 +432,24 @@ inline void health_alarm_log( ae->delay_up_to_timestamp = when + delay; ae->flags |= flags; + ae->last_repeat = 0; + if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) ae->non_clear_duration += ae->duration; + return ae; +} + +inline void health_alarm_log( + RRDHOST *host, + ALARM_ENTRY *ae +) { + debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); + + if(unlikely(alarm_entry_isrepeating(host, ae))) { + error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id); + return; + } // link it netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ae->next = host->health_log.alarms; |