diff options
author | Emmanuel Vasilakis <mrzammler@mm.st> | 2023-05-23 15:56:56 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-23 15:56:56 +0300 |
commit | c0c1e0e85a627d0509a37ea4e7ef00c2cf4aa29f (patch) | |
tree | 6c1208d26ad8fb47fcb78c242cfb1b6ada5a5907 /health | |
parent | a53850f534f45bd86971160415befd6957ddb6d6 (diff) |
Better cleanup of health log table (#15045)
Diffstat (limited to 'health')
-rw-r--r-- | health/health.c | 81 | ||||
-rw-r--r-- | health/health.h | 4 | ||||
-rw-r--r-- | health/health_json.c | 170 |
3 files changed, 48 insertions, 207 deletions
diff --git a/health/health.c b/health/health.c index 5c2b85bc5a..df4798a204 100644 --- a/health/health.c +++ b/health/health.c @@ -412,17 +412,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // find the previous notification for the same alarm // which we have run the exec script // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set + RRDCALC_STATUS last_executed_status = -3; if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { - uint32_t id = ae->alarm_id; - ALARM_ENTRY *t; - for(t = ae->next; t ; t = t->next) { - if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) - break; - } + int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status); - if(likely(t)) { + if (likely(ret == 1)) { // we have executed this alarm notification in the past - if(t && t->new_status == ae->new_status) { + if(last_executed_status == ae->new_status) { // don't send the notification for the same status again debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); @@ -561,6 +557,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); + health_alarm_log_save(host, ae); } else { error("Failed to format command arguments"); } @@ -628,35 +625,32 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration host->health_last_processed_id = first_waiting; - bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; - - if (!cleanup_excess_log_entries) - return; - - // cleanup excess entries in the log + //delete those that are updated, no in progress execution, and is not repeating netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); - ALARM_ENTRY *last = NULL; - unsigned int count = host->health_log.max * 2 / 3; - for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ; - - if(ae && last && last->next == ae) - last->next = NULL; - else - ae = NULL; - - while(ae) { - debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id); - - ALARM_ENTRY *t = ae->next; - - if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { - health_alarm_wait_for_execution(ae); + ALARM_ENTRY *prev = host->health_log.alarms; + for(ae = host->health_log.alarms; ae ; ae = ae->next) { + + if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) && + (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + || + ((ae->new_status == RRDCALC_STATUS_REMOVED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + (ae->when + 3600 < now_realtime_sec()))) + { + + if (ae == host->health_log.alarms) { + host->health_log.alarms = ae->next; + prev = ae->next; + } else { + prev->next = ae->next; + } health_alarm_log_free_one_nochecks_nounlink(ae); - host->health_log.count--; - } - - ae = t; + ae = prev; + } else + prev = ae; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -904,8 +898,24 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + return; + } + + if (wc->alert_queue_removed >= 1) { + wc->alert_queue_removed+=6; + } + } +#endif +} + static void health_execute_delayed_initializations(RRDHOST *host) { RRDSET *st; + bool must_postpone = false; if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); @@ -941,8 +951,11 @@ static void health_execute_delayed_initializations(RRDHOST *host) { rrdvar_store_for_chart(host, st); } rrddim_foreach_done(rd); + must_postpone = true; } rrdset_foreach_done(st); + if (must_postpone) + sql_health_postpone_queue_removed(host); } /** diff --git a/health/health.h b/health/health.h index 902e36c622..c36aabac7e 100644 --- a/health/health.h +++ b/health/health.h @@ -41,7 +41,6 @@ void health_reload(void); void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); @@ -87,11 +86,10 @@ void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); void *health_cmdapi_thread(void *ptr); -void health_label_log_save(RRDHOST *host); - char *health_edit_command_from_source(const char *source); void sql_refresh_hashes(void); void health_add_host_labels(void); +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); #endif //NETDATA_HEALTH_H diff --git a/health/health_json.c b/health/health_json.c index ba18bddba9..4f81998f07 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -13,136 +13,6 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { - char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); - char config_hash_id[GUID_LEN + 1]; - uuid_unparse_lower(ae->config_hash_id, config_hash_id); - - buffer_sprintf(wb, - "\n\t{\n" - "\t\t\"hostname\": \"%s\",\n" - "\t\t\"utc_offset\": %d,\n" - "\t\t\"timezone\": \"%s\",\n" - "\t\t\"unique_id\": %u,\n" - "\t\t\"alarm_id\": %u,\n" - "\t\t\"alarm_event_id\": %u,\n" - "\t\t\"config_hash_id\": \"%s\",\n" - "\t\t\"name\": \"%s\",\n" - "\t\t\"chart\": \"%s\",\n" - "\t\t\"context\": \"%s\",\n" - "\t\t\"family\": \"%s\",\n" - "\t\t\"class\": \"%s\",\n" - "\t\t\"component\": \"%s\",\n" - "\t\t\"type\": \"%s\",\n" - "\t\t\"processed\": %s,\n" - "\t\t\"updated\": %s,\n" - "\t\t\"exec_run\": %lu,\n" - "\t\t\"exec_failed\": %s,\n" - "\t\t\"exec\": \"%s\",\n" - "\t\t\"recipient\": \"%s\",\n" - "\t\t\"exec_code\": %d,\n" - "\t\t\"source\": \"%s\",\n" - "\t\t\"command\": \"%s\",\n" - "\t\t\"units\": \"%s\",\n" - "\t\t\"when\": %lu,\n" - "\t\t\"duration\": %lu,\n" - "\t\t\"non_clear_duration\": %lu,\n" - "\t\t\"status\": \"%s\",\n" - "\t\t\"old_status\": \"%s\",\n" - "\t\t\"delay\": %d,\n" - "\t\t\"delay_up_to_timestamp\": %lu,\n" - "\t\t\"updated_by_id\": %u,\n" - "\t\t\"updates_id\": %u,\n" - "\t\t\"value_string\": \"%s\",\n" - "\t\t\"old_value_string\": \"%s\",\n" - "\t\t\"last_repeat\": \"%lu\",\n" - "\t\t\"silenced\": \"%s\",\n" - , rrdhost_hostname(host) - , host->utc_offset - , rrdhost_abbrev_timezone(host) - , ae->unique_id - , ae->alarm_id - , ae->alarm_event_id - , config_hash_id - , ae_name(ae) - , ae_chart_name(ae) - , ae_chart_context(ae) - , ae_family(ae) - , ae->classification?ae_classification(ae):"Unknown" - , ae->component?ae_component(ae):"Unknown" - , ae->type?ae_type(ae):"Unknown" - , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false" - , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false" - , (unsigned long)ae->exec_run_timestamp - , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false" - , ae->exec?ae_exec(ae):string2str(host->health.health_default_exec) - , ae->recipient?ae_recipient(ae):string2str(host->health.health_default_recipient) - , ae->exec_code - , ae_source(ae) - , edit_command - , ae_units(ae) - , (unsigned long)ae->when - , (unsigned long)ae->duration - , (unsigned long)ae->non_clear_duration - , rrdcalc_status2string(ae->new_status) - , rrdcalc_status2string(ae->old_status) - , ae->delay - , (unsigned long)ae->delay_up_to_timestamp - , ae->updated_by_id - , ae->updates_id - , ae_new_value_string(ae) - , ae_old_value_string(ae) - , (unsigned long)ae->last_repeat - , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" - ); - - health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n"); - - if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { - buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); - } - - buffer_strcat(wb, "\t\t\"value\":"); - buffer_print_netdata_double(wb, ae->new_value); - buffer_strcat(wb, ",\n"); - - buffer_strcat(wb, "\t\t\"old_value\":"); - buffer_print_netdata_double(wb, ae->old_value); - buffer_strcat(wb, "\n"); - - buffer_strcat(wb, "\t}"); - - freez(edit_command); -} - -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { - - buffer_strcat(wb, "["); - - unsigned int max = host->health_log.max; - unsigned int count = 0; - - STRING *chart_string = string_strdupz(chart); - - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - ALARM_ENTRY *ae; - for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) { - if ((ae->unique_id > after) && (!chart || chart_string == ae->chart)) { - if (likely(count)) - buffer_strcat(wb, ","); - health_alarm_entry2json_nolock(wb, ae, host); - count++; - } - } - - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - - string_freez(chart_string); - - buffer_strcat(wb, "\n]\n"); -} - static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { (void)host; buffer_sprintf(wb, @@ -397,43 +267,3 @@ void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) { buffer_strcat(wb, "\n\t}\n}\n"); } -static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark) -{ - ALARM_ENTRY *ae = host->health_log.alarms; - - while(ae) { - if (ae->alarm_id == alarm_id && ae->unique_id > mark && - (ae->new_status != RRDCALC_STATUS_WARNING && ae->new_status != RRDCALC_STATUS_CRITICAL)) - return 1; - ae = ae->next; - } - return 0; -} - -void health_active_log_alarms_2json(RRDHOST *host, BUFFER *wb) { - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); - - buffer_sprintf(wb, "[\n"); - - unsigned int max = host->health_log.max; - unsigned int count = 0; - ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && count < max ; ae = ae->next) { - if (!ae->updated_by_id && - ((ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) || - ((ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) && - ae->new_status == RRDCALC_STATUS_REMOVED))) { - - if (have_recent_alarm(host, ae->alarm_id, ae->unique_id)) - continue; - - if (likely(count)) - buffer_strcat(wb, ","); - health_alarm_entry2json_nolock(wb, ae, host); - count++; - } - } - buffer_strcat(wb, "]"); - - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); -} |