summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorthiagoftsm <thiagoftsm@gmail.com>2019-07-01 11:55:16 +0000
committerGitHub <noreply@github.com>2019-07-01 11:55:16 +0000
commitdd73f3e0cd0d9a1797f0c82de0fae5ccf97e9464 (patch)
tree239fdd722a21d0d3f1e8adecc788edeca5eb907f /health
parent266cbec7a8c8f626b9a73a8f6ea7e269b9ce837e (diff)
Repeating alarm notifications (#6309)
* Alarm_repeat mergin the original! * Alarm_repeat binary tree! * Alarm_repeat binary tree finished! * Alarm_repeat move function and format string * Alarms bringing a new Binary tree * Alarms fixing the last two * Alarm_repeat useless var! * Alarm fix format and repeat alarm! * Alarm_backend steps! * Alarm_repeat stopping to test cloud! * Alarm_repeat stopping to test cloud 2! * Alarm_repeat fixing when restart!
Diffstat (limited to 'health')
-rw-r--r--health/README.md22
-rw-r--r--health/health.c84
-rw-r--r--health/health.h4
-rw-r--r--health/health_config.c136
-rw-r--r--health/health_json.c4
-rw-r--r--health/health_log.c57
6 files changed, 221 insertions, 86 deletions
diff --git a/health/README.md b/health/README.md
index 54f6a3e1f7..81cc043d0e 100644
--- a/health/README.md
+++ b/health/README.md
@@ -11,7 +11,6 @@ packet dropped).
Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.).
-
Each alarm can execute a single query to the database using statistical algorithms against past data,
but alarms can be combined. So, if you need 2 queries in the database, you can combine
2 alarms together (both will run a query to the database, and the results can be combined).
@@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X]
their matching one) and a delay is in place.
- All are reset to their defaults when the alarm switches state without a delay in place.
+---
+
+#### Alarm line `repeat`
+
+Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration.
+
+Format:
+
+```
+repeat: [off] [warning DURATION] [critical DURATION]
+```
+
+* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has been enabled in health configuration.
+* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode.
+* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode.
+
+---
+
#### Alarm line `option`
The only possible value for the `option` line is
@@ -567,12 +584,15 @@ template: disk_full_percent
every: 1m
warn: $this > 80
crit: $this > 95
+ repeat: warning 120s critical 10s
```
`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard.
So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage.
+This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode.
+
### Example 3
Predict if any disk will run out of space in the near future.
diff --git a/health/health.c b/health/health.c
index f92a1ba6b0..eec42d420d 100644
--- a/health/health.c
+++ b/health/health.c
@@ -255,17 +255,18 @@ static inline void health_alarm_log_process(RRDHOST *host) {
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) {
- if(unlikely(
- !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
+ for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(unlikely(
+ !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
+ !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
)) {
+ if(unlikely(ae->unique_id < first_waiting))
+ first_waiting = ae->unique_id;
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
-
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
+ if(likely(now >= ae->delay_up_to_timestamp))
+ health_process_notifications(host, ae);
+ }
}
}
@@ -295,6 +296,10 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
health_alarm_log_free_one_nochecks_nounlink(ae);
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ host->health_log.count--;
+ }
ae = t;
host->health_log.count--;
@@ -411,7 +416,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
- , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
+ , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rc->name
, (rc->rrdset)?rc->rrdset->context:""
, rc->chart
@@ -756,20 +761,22 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
- health_alarm_log(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
-
- );
-
- rc->last_status_change = now;
- rc->status = status;
+ if(likely(!rrdcalc_isrepeating(rc))) {
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ health_alarm_log(host, ae);
+ }
+ rc->last_status_change = now;
+ rc->old_status = rc->status;
+ rc->status = status;
}
rc->last_updated = now;
@@ -779,6 +786,35 @@ void *health_main(void *ptr) {
next_run = rc->next_update;
}
+ // process repeating alarms
+ RRDCALC *rc;
+ for(rc = host->alarms; rc ; rc = rc->next) {
+ int repeat_every = 0;
+ if(unlikely(rrdcalc_isrepeating(rc))) {
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
+ repeat_every = rc->warn_repeat_every;
+ else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
+ repeat_every = rc->crit_repeat_every;
+ }
+ if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ rc->last_repeat = now;
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ ae->last_repeat = rc->last_repeat;
+ health_process_notifications(host, ae);
+ debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ }
+ }
+
rrdhost_unlock(host);
}
diff --git a/health/health.h b/health/health.h
index 1511f36486..cc0692a567 100644
--- a/health/health.h
+++ b/health/health.h
@@ -108,7 +108,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename);
extern void health_alarm_log_load(RRDHOST *host);
-extern void health_alarm_log(
+extern ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@@ -129,6 +129,8 @@ extern void health_alarm_log(
int delay,
uint32_t flags);
+extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
+
extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
extern char *health_user_config_dir(void);
extern char *health_stock_config_dir(void);
diff --git a/health/health_config.c b/health/health_config.c
index 35fde90bc2..17108dd326 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -23,6 +23,7 @@
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
+#define HEALTH_REPEAT_KEY "repeat"
static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
if(!rc->chart) {
@@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
+ debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rc->chart?rc->chart:"NOCHART",
rc->name,
rc->id,
@@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->delay_up_duration,
rc->delay_down_duration,
rc->delay_max_duration,
- rc->delay_multiplier
+ rc->delay_multiplier,
+ rc->warn_repeat_every,
+ rc->crit_repeat_every
);
- rrdcalc_create_part2(host, rc);
+ rrdcalc_add_to_host(host, rc);
return 1;
}
@@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
}
}
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
+ debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rt->name,
(rt->context)?rt->context:"NONE",
(rt->exec)?rt->exec:"DEFAULT",
@@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
rt->delay_up_duration,
rt->delay_down_duration,
rt->delay_max_duration,
- rt->delay_multiplier
+ rt->delay_multiplier,
+ rt->warn_repeat_every,
+ rt->crit_repeat_every
);
if(likely(last)) {
@@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
return 1;
}
-static inline int health_parse_duration(char *string, int *result) {
- // make sure it is a number
- if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) {
- *result = 0;
- return 0;
- }
-
- char *e = NULL;
- calculated_number n = str2ld(string, &e);
- if(e && *e) {
- switch (*e) {
- case 'Y':
- *result = (int) (n * 86400 * 365);
- break;
- case 'M':
- *result = (int) (n * 86400 * 30);
- break;
- case 'w':
- *result = (int) (n * 86400 * 7);
- break;
- case 'd':
- *result = (int) (n * 86400);
- break;
- case 'h':
- *result = (int) (n * 3600);
- break;
- case 'm':
- *result = (int) (n * 60);
- break;
-
- default:
- case 's':
- *result = (int) (n);
- break;
- }
- }
- else
- *result = (int)(n);
-
- return 1;
-}
-
static inline int health_parse_delay(
size_t line, const char *filename, char *string,
int *delay_up_duration,
@@ -202,14 +165,14 @@ static inline int health_parse_delay(
while(*s && isspace(*s)) *s++ = '\0';
if(!strcasecmp(key, "up")) {
- if (!health_parse_duration(value, delay_up_duration)) {
+ if (!config_parse_duration(value, delay_up_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
else given_up = 1;
}
else if(!strcasecmp(key, "down")) {
- if (!health_parse_duration(value, delay_down_duration)) {
+ if (!config_parse_duration(value, delay_down_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -224,7 +187,7 @@ static inline int health_parse_delay(
else given_multiplier = 1;
}
else if(!strcasecmp(key, "max")) {
- if (!health_parse_duration(value, delay_max_duration)) {
+ if (!config_parse_duration(value, delay_max_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) {
return options;
}
+static inline int health_parse_repeat(
+ size_t line,
+ const char *file,
+ char *string,
+ uint32_t *warn_repeat_every,
+ uint32_t *crit_repeat_every
+) {
+
+ char *s = string;
+ while(*s) {
+ char *key = s;
+
+ while(*s && !isspace(*s)) s++;
+ while(*s && isspace(*s)) *s++ = '\0';
+
+ if(!*key) break;
+
+ char *value = s;
+ while(*s && !isspace(*s)) s++;
+ while(*s && isspace(*s)) *s++ = '\0';
+
+ if(!strcasecmp(key, "off")) {
+ *warn_repeat_every = 0;
+ *crit_repeat_every = 0;
+ return 1;
+ }
+ if(!strcasecmp(key, "warning")) {
+ if (!config_parse_duration(value, (int*)warn_repeat_every)) {
+ error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, file, value, key);
+ }
+ }
+ else if(!strcasecmp(key, "critical")) {
+ if (!config_parse_duration(value, (int*)crit_repeat_every)) {
+ error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, file, value, key);
+ }
+ }
+ }
+
+ return 1;
+}
+
+
static inline int health_parse_db_lookup(
size_t line, const char *filename, char *string,
RRDR_GROUPING *group_method, int *after, int *before, int *every,
@@ -322,7 +329,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if(!health_parse_duration(key, after)) {
+ if(!config_parse_duration(key, after)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
line, filename, key);
return 0;
@@ -343,7 +350,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if (!health_parse_duration(value, before)) {
+ if (!config_parse_duration(value, before)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -353,7 +360,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if (!health_parse_duration(value, every)) {
+ if (!config_parse_duration(value, every)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) {
hash_info = 0,
hash_recipient = 0,
hash_delay = 0,
- hash_options = 0;
+ hash_options = 0,
+ hash_repeat = 0;
char buffer[HEALTH_CONF_MAX_LINE + 1];
@@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) {
hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
hash_delay = simple_uhash(HEALTH_DELAY_KEY);
hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
+ hash_repeat = simple_uhash(HEALTH_REPEAT_KEY);
}
FILE *fp = fopen(filename, "r");
@@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) {
rc->value = NAN;
rc->old_value = NAN;
rc->delay_multiplier = 1.0;
+ rc->old_status = RRDCALC_STATUS_UNINITIALIZED;
+ rc->warn_repeat_every = host->health_default_warn_repeat_every;
+ rc->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rc->name))
error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
@@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) {
rt->green = NAN;
rt->red = NAN;
rt->delay_multiplier = 1.0;
+ rt->warn_repeat_every = host->health_default_warn_repeat_every;
+ rt->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rt->name))
error("Health configuration renamed template '%s' to '%s'", value, rt->name);
@@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) {
&rc->options, &rc->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- if(!health_parse_duration(value, &rc->update_every))
+ if(!config_parse_duration(value, &rc->update_every))
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rc->name, key, value);
}
@@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rc->options |= health_parse_options(value);
}
+ else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ health_parse_repeat(line, filename, value,
+ &rc->warn_repeat_every,
+ &rc->crit_repeat_every);
+ }
else {
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
line, filename, rc->name, key);
@@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) {
&rt->update_every, &rt->options, &rt->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- if(!health_parse_duration(value, &rt->update_every))
+ if(!config_parse_duration(value, &rt->update_every))
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rt->name, key, value);
}
@@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rt->options |= health_parse_options(value);
}
+ else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ health_parse_repeat(line, filename, value,
+ &rt->warn_repeat_every,
+ &rt->crit_repeat_every);
+ }
else {
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
line, filename, rt->name, key);
diff --git a/health/health_json.c b/health/health_json.c
index 7811324472..e923b05c6c 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"delay_multiplier\": %f,\n"
"\t\t\t\"delay\": %d,\n"
"\t\t\t\"delay_up_to_timestamp\": %lu,\n"
+ "\t\t\t\"warn_repeat_every\": \"%u\",\n"
+ "\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
@@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->delay_multiplier
, rc->delay_last
, (unsigned long)rc->delay_up_to_timestamp
+ , rc->warn_repeat_every
+ , rc->crit_repeat_every
, value_string
);
diff --git a/health/health_log.c b/health/health_log.c
index 009e426737..c91cde6cbf 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
"\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
"\t%d\t%d\t%d\t%d"
"\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
+ "\t%016lx"
"\n"
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
, host->hostname
@@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
, ae->new_value
, ae->old_value
+ , (uint64_t)ae->last_repeat
) < 0))
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
else {
@@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
continue;
}
+ // Check if we got last_repeat field
+ time_t last_repeat = 0;
+ if(entries > 27) {
+ char* alarm_name = pointers[13];
+ last_repeat = (time_t)strtoul(pointers[27], NULL, 16);
+
+ RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
+ if (!rc) {
+ for(rc = host->alarms; rc ; rc = rc->next) {
+ RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
+ if(rdcmp != rc) {
+ error("Cannot insert the alarm index ID using log %s", rc->name);
+ }
+ }
+
+ rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
+ }
+
+ if(unlikely(rc)) {
+ if (rrdcalc_isrepeating(rc)) {
+ rc->last_repeat = last_repeat;
+ // We iterate through repeating alarm entries only to
+ // find the latest last_repeat timestamp. Otherwise,
+ // there is no need to keep them in memory.
+ continue;
+ }
+ }
+ }
+
if(unlikely(*pointers[0] == 'A')) {
// make sure it is properly numbered
if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
- error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id);
+ error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it."
+ , host->hostname, line, filename, unique_id);
errored++;
continue;
}
@@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
}
else if(unlikely(*pointers[0] == 'U')) {
// find the original
- for(ae = host->health_log.alarms; ae; ae = ae->next) {
+ for(ae = host->health_log.alarms; ae ; ae = ae->next) {
if(unlikely(unique_id == ae->unique_id)) {
if(unlikely(*pointers[0] == 'A')) {
error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later."
- , host->hostname, line, filename, unique_id);
+ , host->hostname, line, filename, unique_id);
*pointers[0] = 'U';
duplicate++;
}
@@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
ae->new_value = str2l(pointers[25]);
ae->old_value = str2l(pointers[26]);
+ ae->last_repeat = last_repeat;
+
char value_string[100 + 1];
freez(ae->old_value_string);
freez(ae->new_value_string);
@@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
// ----------------------------------------------------------------------------
// health alarm log management
-inline void health_alarm_log(
+inline ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@@ -398,9 +432,24 @@ inline void health_alarm_log(
ae->delay_up_to_timestamp = when + delay;
ae->flags |= flags;
+ ae->last_repeat = 0;
+
if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
ae->non_clear_duration += ae->duration;
+ return ae;
+}
+
+inline void health_alarm_log(
+ RRDHOST *host,
+ ALARM_ENTRY *ae
+) {
+ debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
+
+ if(unlikely(alarm_entry_isrepeating(host, ae))) {
+ error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id);
+ return;
+ }
// link it
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
ae->next = host->health_log.alarms;