summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
author Emmanuel Vasilakis <mrzammler@mm.st> 2023-05-23 15:56:56 +0300
committer GitHub <noreply@github.com> 2023-05-23 15:56:56 +0300
commit c0c1e0e85a627d0509a37ea4e7ef00c2cf4aa29f (patch)
tree 6c1208d26ad8fb47fcb78c242cfb1b6ada5a5907 /health
parent a53850f534f45bd86971160415befd6957ddb6d6 (diff)
Better cleanup of health log table (#15045)
Diffstat (limited to 'health')
-rw-r--r-- health/health.c 81
-rw-r--r-- health/health.h 4
-rw-r--r-- health/health_json.c 170
3 files changed, 48 insertions, 207 deletions
diff --git a/health/health.c b/health/health.c
index 5c2b85bc5a..df4798a204 100644
--- a/health/health.c
+++ b/health/health.c
@@ -412,17 +412,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// find the previous notification for the same alarm
// which we have run the exec script
// exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
+ RRDCALC_STATUS last_executed_status = -3;
if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- uint32_t id = ae->alarm_id;
- ALARM_ENTRY *t;
- for(t = ae->next; t ; t = t->next) {
- if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
- break;
- }
+ int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);
- if(likely(t)) {
+ if (likely(ret == 1)) {
// we have executed this alarm notification in the past
- if(t && t->new_status == ae->new_status) {
+ if(last_executed_status == ae->new_status) {
// don't send the notification for the same status again
debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
@@ -561,6 +557,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
+ health_alarm_log_save(host, ae);
} else {
error("Failed to format command arguments");
}
@@ -628,35 +625,32 @@ static inline void health_alarm_log_process(RRDHOST *host) {
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
- bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
-
- if (!cleanup_excess_log_entries)
- return;
-
- // cleanup excess entries in the log
+ // delete entries that are not repeating, are updated and saved, and have no execution in progress (or saved REMOVED entries older than one hour)
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
- ALARM_ENTRY *last = NULL;
- unsigned int count = host->health_log.max * 2 / 3;
- for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
-
- if(ae && last && last->next == ae)
- last->next = NULL;
- else
- ae = NULL;
-
- while(ae) {
- debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
-
- ALARM_ENTRY *t = ae->next;
-
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
- health_alarm_wait_for_execution(ae);
+ ALARM_ENTRY *prev = host->health_log.alarms;
+ for(ae = host->health_log.alarms; ae ; ae = ae->next) {
+
+ if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
+ (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
+ (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+ !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
+ ||
+ ((ae->new_status == RRDCALC_STATUS_REMOVED) &&
+ (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+ (ae->when + 3600 < now_realtime_sec())))
+ {
+
+ if (ae == host->health_log.alarms) {
+ host->health_log.alarms = ae->next;
+ prev = ae->next;
+ } else {
+ prev->next = ae->next;
+ }
health_alarm_log_free_one_nochecks_nounlink(ae);
- host->health_log.count--;
- }
-
- ae = t;
+ ae = prev;
+ } else
+ prev = ae;
}
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
@@ -904,8 +898,24 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
return 0;
}
+static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { // Adds 6 to the host's ACLK alert_queue_removed counter when it is already >= 1; compiles to an empty body without ENABLE_ACLK (hence __maybe_unused on host).
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting) { // do nothing unless netdata_cloud_setting is non-zero
+ struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+ if (unlikely(!wc)) { // host has no ACLK sync config attached: nothing to adjust
+ return;
+ }
+
+ if (wc->alert_queue_removed >= 1) { // values below 1 are left untouched
+ wc->alert_queue_removed+=6; // NOTE(review): presumably a countdown, so adding 6 defers queueing of removed alerts; unit and consumer are not visible here - confirm
+ }
+ }
+#endif
+}
+
static void health_execute_delayed_initializations(RRDHOST *host) {
RRDSET *st;
+ bool must_postpone = false;
if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
@@ -941,8 +951,11 @@ static void health_execute_delayed_initializations(RRDHOST *host) {
rrdvar_store_for_chart(host, st);
}
rrddim_foreach_done(rd);
+ must_postpone = true;
}
rrdset_foreach_done(st);
+ if (must_postpone)
+ sql_health_postpone_queue_removed(host);
}
/**
diff --git a/health/health.h b/health/health.h
index 902e36c622..c36aabac7e 100644
--- a/health/health.h
+++ b/health/health.h
@@ -41,7 +41,6 @@ void health_reload(void);
void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
-void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf);
@@ -87,11 +86,10 @@ void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae);
void *health_cmdapi_thread(void *ptr);
-void health_label_log_save(RRDHOST *host);
-
char *health_edit_command_from_source(const char *source);
void sql_refresh_hashes(void);
void health_add_host_labels(void);
+void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); // defined in health/health_json.c; appends a JSON member named label to wb, wrapped by prefix and suffix (one branch emits the literal null - presumably when value is NULL/empty, confirm against the definition)
#endif //NETDATA_HEALTH_H
diff --git a/health/health_json.c b/health/health_json.c
index ba18bddba9..4f81998f07 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -13,136 +13,6 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const
buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
}
-void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
- char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
- char config_hash_id[GUID_LEN + 1];
- uuid_unparse_lower(ae->config_hash_id, config_hash_id);
-
- buffer_sprintf(wb,
- "\n\t{\n"
- "\t\t\"hostname\": \"%s\",\n"
- "\t\t\"utc_offset\": %d,\n"
- "\t\t\"timezone\": \"%s\",\n"
- "\t\t\"unique_id\": %u,\n"
- "\t\t\"alarm_id\": %u,\n"
- "\t\t\"alarm_event_id\": %u,\n"
- "\t\t\"config_hash_id\": \"%s\",\n"
- "\t\t\"name\": \"%s\",\n"
- "\t\t\"chart\": \"%s\",\n"
- "\t\t\"context\": \"%s\",\n"
- "\t\t\"family\": \"%s\",\n"
- "\t\t\"class\": \"%s\",\n"
- "\t\t\"component\": \"%s\",\n"
- "\t\t\"type\": \"%s\",\n"
- "\t\t\"processed\": %s,\n"
- "\t\t\"updated\": %s,\n"
- "\t\t\"exec_run\": %lu,\n"
- "\t\t\"exec_failed\": %s,\n"
- "\t\t\"exec\": \"%s\",\n"
- "\t\t\"recipient\": \"%s\",\n"
- "\t\t\"exec_code\": %d,\n"
- "\t\t\"source\": \"%s\",\n"
- "\t\t\"command\": \"%s\",\n"
- "\t\t\"units\": \"%s\",\n"
- "\t\t\"when\": %lu,\n"
- "\t\t\"duration\": %lu,\n"
- "\t\t\"non_clear_duration\": %lu,\n"
- "\t\t\"status\": \"%s\",\n"
- "\t\t\"old_status\": \"%s\",\n"
- "\t\t\"delay\": %d,\n"
- "\t\t\"delay_up_to_timestamp\": %lu,\n"
- "\t\t\"updated_by_id\": %u,\n"
- "\t\t\"updates_id\": %u,\n"
- "\t\t\"value_string\": \"%s\",\n"
- "\t\t\"old_value_string\": \"%s\",\n"
- "\t\t\"last_repeat\": \"%lu\",\n"
- "\t\t\"silenced\": \"%s\",\n"
- , rrdhost_hostname(host)
- , host->utc_offset
- , rrdhost_abbrev_timezone(host)
- , ae->unique_id
- , ae->alarm_id
- , ae->alarm_event_id
- , config_hash_id
- , ae_name(ae)
- , ae_chart_name(ae)
- , ae_chart_context(ae)
- , ae_family(ae)
- , ae->classification?ae_classification(ae):"Unknown"
- , ae->component?ae_component(ae):"Unknown"
- , ae->type?ae_type(ae):"Unknown"
- , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
- , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
- , (unsigned long)ae->exec_run_timestamp
- , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
- , ae->exec?ae_exec(ae):string2str(host->health.health_default_exec)
- , ae->recipient?ae_recipient(ae):string2str(host->health.health_default_recipient)
- , ae->exec_code
- , ae_source(ae)
- , edit_command
- , ae_units(ae)
- , (unsigned long)ae->when
- , (unsigned long)ae->duration
- , (unsigned long)ae->non_clear_duration
- , rrdcalc_status2string(ae->new_status)
- , rrdcalc_status2string(ae->old_status)
- , ae->delay
- , (unsigned long)ae->delay_up_to_timestamp
- , ae->updated_by_id
- , ae->updates_id
- , ae_new_value_string(ae)
- , ae_old_value_string(ae)
- , (unsigned long)ae->last_repeat
- , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
- );
-
- health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n");
-
- if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
- buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
- }
-
- buffer_strcat(wb, "\t\t\"value\":");
- buffer_print_netdata_double(wb, ae->new_value);
- buffer_strcat(wb, ",\n");
-
- buffer_strcat(wb, "\t\t\"old_value\":");
- buffer_print_netdata_double(wb, ae->old_value);
- buffer_strcat(wb, "\n");
-
- buffer_strcat(wb, "\t}");
-
- freez(edit_command);
-}
-
-void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
-
- buffer_strcat(wb, "[");
-
- unsigned int max = host->health_log.max;
- unsigned int count = 0;
-
- STRING *chart_string = string_strdupz(chart);
-
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
-
- ALARM_ENTRY *ae;
- for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
- if ((ae->unique_id > after) && (!chart || chart_string == ae->chart)) {
- if (likely(count))
- buffer_strcat(wb, ",");
- health_alarm_entry2json_nolock(wb, ae, host);
- count++;
- }
- }
-
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
-
- string_freez(chart_string);
-
- buffer_strcat(wb, "\n]\n");
-}
-
static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
(void)host;
buffer_sprintf(wb,
@@ -397,43 +267,3 @@ void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) {
buffer_strcat(wb, "\n\t}\n}\n");
}
-static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark)
-{
- ALARM_ENTRY *ae = host->health_log.alarms;
-
- while(ae) {
- if (ae->alarm_id == alarm_id && ae->unique_id > mark &&
- (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL))
- return 1;
- ae = ae->next;
- }
- return 0;
-}
-
-void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) {
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
-
- buffer_sprintf(wb, "[\n");
-
- unsigned int max = host->health_log.max;
- unsigned int count = 0;
- ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) {
- if (!ae->updated_by_id &&
- ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) ||
- ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) &&
- ae->new_status == RRDCALC_STATUS_REMOVED))) {
-
- if (have_recent_alarm(host, ae->alarm_id, ae->unique_id))
- continue;
-
- if (likely(count))
- buffer_strcat(wb, ",");
- health_alarm_entry2json_nolock(wb, ae, host);
- count++;
- }
- }
- buffer_strcat(wb, "]");
-
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
-}