diff options
author | Emmanuel Vasilakis <mrzammler@mm.st> | 2023-01-18 10:42:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-01-18 10:42:30 +0200 |
commit | 3d5f9e64a012b8d0f66ccf483e0e0e6eb3186787 (patch) | |
tree | a564b1d7a15427c601d7ead9a410a825e6007894 | |
parent | b9cafae62c973a651dbf1bb7448fc42e2eb61527 (diff) |
Revert health to run in a single thread (#14244)
* revert health to single thread
* remove getting now
* use a health struct
* remove commented code
* cleanup health log from metadata
* don't check for METADATA_UPDATE
-rw-r--r-- | aclk/aclk.c | 2 | ||||
-rw-r--r-- | aclk/aclk.h | 2 | ||||
-rw-r--r-- | daemon/service.c | 2 | ||||
-rw-r--r-- | daemon/static_threads.c | 9 | ||||
-rw-r--r-- | database/rrd.h | 24 | ||||
-rw-r--r-- | database/rrdcalc.c | 4 | ||||
-rw-r--r-- | database/rrddimvar.c | 4 | ||||
-rw-r--r-- | database/rrdhost.c | 27 | ||||
-rw-r--r-- | database/rrdset.c | 4 | ||||
-rw-r--r-- | database/rrdsetvar.c | 4 | ||||
-rw-r--r-- | database/sqlite/sqlite_aclk_alert.c | 10 | ||||
-rw-r--r-- | database/sqlite/sqlite_health.c | 14 | ||||
-rw-r--r-- | database/sqlite/sqlite_metadata.c | 13 | ||||
-rw-r--r-- | health/health.c | 893 | ||||
-rw-r--r-- | health/health.h | 7 | ||||
-rw-r--r-- | health/health_config.c | 10 | ||||
-rw-r--r-- | health/health_json.c | 10 | ||||
-rw-r--r-- | health/health_log.c | 74 | ||||
-rw-r--r-- | streaming/receiver.c | 4 |
19 files changed, 554 insertions, 563 deletions
diff --git a/aclk/aclk.c b/aclk/aclk.c index 9579912521..04b81b9eb0 100644 --- a/aclk/aclk.c +++ b/aclk/aclk.c @@ -49,6 +49,8 @@ float last_backoff_value = 0; time_t aclk_block_until = 0; +int aclk_alert_reloaded = 0; //1 on health log exchange, and again on health_reload + #ifdef ENABLE_ACLK mqtt_wss_client mqttwss_client; diff --git a/aclk/aclk.h b/aclk/aclk.h index 6aed548b74..56b24add92 100644 --- a/aclk/aclk.h +++ b/aclk/aclk.h @@ -26,6 +26,8 @@ extern time_t aclk_block_until; extern int disconnect_req; +extern int aclk_alert_reloaded; + #ifdef ENABLE_ACLK void *aclk_main(void *ptr); diff --git a/daemon/service.c b/daemon/service.c index fd19a3cc6f..3f5e8c55a3 100644 --- a/daemon/service.c +++ b/daemon/service.c @@ -201,7 +201,7 @@ static void svc_rrd_cleanup_obsolete_charts_from_all_hosts() { && ( ( host->child_last_chart_command - && host->child_last_chart_command + host->health_delay_up_to < now_realtime_sec() + && host->child_last_chart_command + host->health.health_delay_up_to < now_realtime_sec() ) || (host->child_connect_time + TIME_TO_RUN_OBSOLETIONS_ON_CHILD_CONNECT < now_realtime_sec()) ) diff --git a/daemon/static_threads.c b/daemon/static_threads.c index 51dbf64061..ff43fc0ccc 100644 --- a/daemon/static_threads.c +++ b/daemon/static_threads.c @@ -37,6 +37,15 @@ const struct netdata_static_thread static_threads_common[] = { .start_routine = cpuidlejitter_main }, { + .name = "HEALTH", + .config_section = NULL, + .config_name = NULL, + .enabled = 1, + .thread = NULL, + .init_routine = NULL, + .start_routine = health_main + }, + { .name = "ANALYTICS", .config_section = NULL, .config_name = NULL, diff --git a/database/rrd.h b/database/rrd.h index 77cfb5aee6..1d6a869d22 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -895,6 +895,17 @@ typedef struct alarm_log { netdata_rwlock_t alarm_log_rwlock; } ALARM_LOG; +typedef struct health { + unsigned int health_enabled; // 1 when this host has health enabled + time_t health_delay_up_to; // a timestamp to 
delay alarms processing up to + STRING *health_default_exec; // the full path of the alarms notifications program + STRING *health_default_recipient; // the default recipient for all alarms + char *health_log_filename; // the alarms event log filename + size_t health_log_entries_written; // the number of alarm events written to the alarms event log + FILE *health_log_fp; // the FILE pointer to the open alarms event log file + uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications + uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications +} HEALTH; // ---------------------------------------------------------------------------- // RRD HOST @@ -1012,17 +1023,8 @@ struct rrdhost { // ------------------------------------------------------------------------ // health monitoring options - unsigned int health_enabled; // 1 when this host has health enabled - bool health_spawn; // true when health thread is running - unsigned int aclk_alert_reloaded; // 1 on thread start and health reload, 0 after removed are sent - time_t health_delay_up_to; // a timestamp to delay alarms processing up to - STRING *health_default_exec; // the full path of the alarms notifications program - STRING *health_default_recipient; // the default recipient for all alarms - char *health_log_filename; // the alarms event log filename - size_t health_log_entries_written; // the number of alarm events written to the alarms event log - FILE *health_log_fp; // the FILE pointer to the open alarms event log file - uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications - uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications + // health variables + HEALTH health; // all RRDCALCs are primarily allocated and linked here DICTIONARY 
*rrdcalc_root_index; diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 45338a8260..153d58513e 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -739,7 +739,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host continue; if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=')) { - info("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", + log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'", rrdcalc_name(rc), rrdhost_hostname(host), rrdcalc_host_labels(rc)); @@ -755,7 +755,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts() { RRDHOST *host; rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) + if (unlikely(!host->health.health_enabled)) continue; if (host->rrdlabels) diff --git a/database/rrddimvar.c b/database/rrddimvar.c index 449ceeb937..be5235239f 100644 --- a/database/rrddimvar.c +++ b/database/rrddimvar.c @@ -65,7 +65,7 @@ static inline void rrddimvar_free_variables_unsafe(RRDDIMVAR *rs) { // HOST VARIABLES FOR THIS DIMENSION - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id_dim_id); rs->rrdvar_host_chart_id_dim_id = NULL; @@ -152,7 +152,7 @@ static inline void rrddimvar_update_variables_unsafe(RRDDIMVAR *rs) { // - $chart-name.id // - $chart-name.name - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rs->rrdvar_host_chart_id_dim_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value); rs->rrdvar_host_chart_id_dim_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_name, rs->type, RRDVAR_FLAG_NONE, rs->value); rs->rrdvar_host_chart_name_dim_id = rrdvar_add_and_acquire("host", 
host->rrdvars, key_chart_name_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value); diff --git a/database/rrdhost.c b/database/rrdhost.c index 94e5a4770f..6dd7db91be 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -281,8 +281,8 @@ int is_legacy = 1; rrdhost_init_hostname(host, hostname, false); - host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries); - host->health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled; + host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries); + host->health.health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled; if (likely(!archived)) { rrdfunctions_init(host); @@ -366,9 +366,6 @@ int is_legacy = 1; rrdcalc_rrdhost_index_init(host); metaqueue_host_update_info(host); - if (health_enabled) - health_thread_spawn(host); - if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { #ifdef ENABLE_DBENGINE char dbenginepath[FILENAME_MAX + 1]; @@ -516,12 +513,12 @@ int is_legacy = 1; , rrdhost_has_rrdpush_sender_enabled(host)?"enabled":"disabled" , host->rrdpush_send_destination?host->rrdpush_send_destination:"" , host->rrdpush_send_api_key?host->rrdpush_send_api_key:"" - , host->health_enabled?"enabled":"disabled" + , host->health.health_enabled?"enabled":"disabled" , host->cache_dir , host->varlib_dir - , host->health_log_filename - , string2str(host->health_default_exec) - , string2str(host->health_default_recipient) + , host->health.health_log_filename + , string2str(host->health.health_default_exec) + , string2str(host->health.health_default_recipient) ); if(!archived) @@ -566,7 +563,7 @@ static void rrdhost_update(RRDHOST *host netdata_spinlock_lock(&host->rrdhost_update_lock); - host->health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled; + host->health.health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 
0 : health_enabled; { struct rrdhost_system_info *old = host->system_info; @@ -651,9 +648,6 @@ static void rrdhost_update(RRDHOST *host info("Host %s is not in archived mode anymore", rrdhost_hostname(host)); } - if (health_enabled) - health_thread_spawn(host); - netdata_spinlock_unlock(&host->rrdhost_update_lock); } @@ -1175,9 +1169,9 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) { freez(host->rrdpush_send_api_key); freez(host->rrdpush_send_destination); rrdpush_destinations_free(host); - string_freez(host->health_default_exec); - string_freez(host->health_default_recipient); - freez(host->health_log_filename); + string_freez(host->health.health_default_exec); + string_freez(host->health.health_default_recipient); + freez(host->health.health_log_filename); string_freez(host->registry_hostname); simple_pattern_free(host->rrdpush_send_charts_matching); netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock); @@ -1376,7 +1370,6 @@ void reload_host_labels(void) { health_label_log_save(localhost); rrdpush_send_host_labels(localhost); - health_reload(); } // ---------------------------------------------------------------------------- diff --git a/database/rrdset.c b/database/rrdset.c index 1aaff8a6b9..140a63953f 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -168,7 +168,7 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v // chart variables - we need this for data collection to work (collector given chart variables) - not only health rrdsetvar_index_init(st); - if (host->health_enabled) { + if (host->health.health_enabled) { st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); st->rrdvars = rrdvariables_create(); rrddimvar_index_init(st); @@ -366,7 +366,7 @@ static void rrdset_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo st->last_accessed_time_s = now_realtime_sec(); - if(host->health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | 
RRDSET_REACT_CHART_ACTIVATED))) { + if(host->health.health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_CHART_ACTIVATED))) { rrdset_flag_set(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); } diff --git a/database/rrdsetvar.c b/database/rrdsetvar.c index 22cf8a1f01..9684310cdf 100644 --- a/database/rrdsetvar.c +++ b/database/rrdsetvar.c @@ -43,7 +43,7 @@ static inline void rrdsetvar_free_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) { // ------------------------------------------------------------------------ // HOST - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id); rs->rrdvar_host_chart_id = NULL; @@ -93,7 +93,7 @@ static inline void rrdsetvar_update_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) { // ------------------------------------------------------------------------ // HOST - if(host->rrdvars && host->health_enabled) { + if(host->rrdvars && host->health.health_enabled) { rs->rrdvar_host_chart_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id, rs->type, options, rs->value); rs->rrdvar_host_chart_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_name, rs->type, options, rs->value); } diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index 60cce7165a..6151d9c885 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -314,7 +314,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d alarm_log.utc_offset = wc->host->utc_offset; alarm_log.timezone = strdupz(rrdhost_abbrev_timezone(wc->host)); alarm_log.exec_path = sqlite3_column_bytes(res, 14) > 0 ? 
strdupz((char *)sqlite3_column_text(res, 14)) : - strdupz((char *)string2str(wc->host->health_default_exec)); + strdupz((char *)string2str(wc->host->health.health_default_exec)); alarm_log.conf_source = strdupz((char *)sqlite3_column_text(res, 16)); char *edit_command = sqlite3_column_bytes(res, 16) > 0 ? @@ -531,7 +531,7 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a alarm_log.node_id = wc->node_id; alarm_log.log_entries = log_entries; alarm_log.status = wc->alert_updates == 0 ? 2 : 1; - alarm_log.enabled = (int)host->health_enabled; + alarm_log.enabled = (int)host->health.health_enabled; wc->alert_sequence_id = last_sequence; @@ -544,6 +544,8 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a freez(claim_id); buffer_free(sql); + + aclk_alert_reloaded = 1; #endif return; @@ -709,7 +711,7 @@ void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start (struct aclk_database_worker_config *)host->dbsync_worker : (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); - if (unlikely(!host->health_enabled)) { + if (unlikely(!host->health.health_enabled)) { log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); return; } @@ -849,7 +851,7 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN alarm_log->utc_offset = host->utc_offset; alarm_log->timezone = strdupz(rrdhost_abbrev_timezone(host)); - alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health_default_exec)); + alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health.health_default_exec)); alarm_log->conf_source = ae->source ? 
strdupz(ae_source(ae)) : strdupz((char *)""); alarm_log->command = strdupz((char *)edit_command); diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index c189305b8d..dc4020deff 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -337,7 +337,7 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { } ae->flags |= HEALTH_ENTRY_FLAG_SAVED; - host->health_log_entries_written++; + host->health.health_log_entries_written++; failed: if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) @@ -369,7 +369,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { if(rotate_every < 100) rotate_every = 100; } - if(likely(host->health_log_entries_written < rotate_every)) { + if(likely(host->health.health_log_entries_written < rotate_every)) { return; } @@ -382,7 +382,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health_log_entries_written - rotate_every))); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { @@ -398,7 +398,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to cleanup health log table"); - host->health_log_entries_written = rotate_every; + host->health.health_log_entries_written = rotate_every; sql_aclk_alert_clean_dead_entries(host); } @@ -431,13 +431,13 @@ void sql_health_alarm_log_count(RRDHOST *host) { rc = sqlite3_step_monitored(res); if (likely(rc == SQLITE_ROW)) - host->health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); + host->health.health_log_entries_written = (size_t) 
sqlite3_column_int64(res, 0); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to count health log entries from db"); - info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health_log_entries_written); + info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written); } #define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \ @@ -612,7 +612,7 @@ void sql_health_alarm_log_load(RRDHOST *host) { ssize_t errored = 0, loaded = 0; char command[MAX_HEALTH_SQL_SIZE + 1]; - host->health_log_entries_written = 0; + host->health.health_log_entries_written = 0; if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c index 28cd18e40e..2733806ca3 100644 --- a/database/sqlite/sqlite_metadata.c +++ b/database/sqlite/sqlite_metadata.c @@ -328,7 +328,7 @@ static int sql_store_host_info(RRDHOST *host) if (unlikely(rc != SQLITE_OK)) goto bind_fail; - rc = sqlite3_bind_int(res, ++param, (int ) host->health_enabled); + rc = sqlite3_bind_int(res, ++param, (int ) host->health.health_enabled); if (unlikely(rc != SQLITE_OK)) goto bind_fail; @@ -686,6 +686,16 @@ skip_run: error_report("Failed to finalize the prepared statement when reading dimensions"); } +static void cleanup_health_log(void) +{ + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host) { + if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) + continue; + sql_health_alarm_log_cleanup(host); + } + dfe_done(host); +} // // EVENT LOOP STARTS HERE @@ -845,6 +855,7 @@ static void start_metadata_cleanup(uv_work_t *req) 
worker_is_busy(UV_EVENT_METADATA_CLEANUP); struct metadata_wc *wc = req->data; check_dimension_metadata(wc); + cleanup_health_log(); worker_is_idle(); } diff --git a/health/health.c b/health/health.c index 0ad1f122a0..36fcd2f2b1 100644 --- a/health/health.c +++ b/health/health.c @@ -162,7 +162,7 @@ char *silencers_filename; SIMPLE_PATTERN *conf_enabled_alarms = NULL; // the queue of executed alarm notifications that haven't been waited for yet -static __thread struct { +static struct { ALARM_ENTRY *head; // oldest ALARM_ENTRY *tail; // latest } alarm_notifications_in_progress = {NULL, NULL}; @@ -302,7 +302,7 @@ void health_init(void) { * @param host the structure of the host that the function will reload the configuration. */ static void health_reload_host(RRDHOST *host) { - if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) + if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; log_health("[%s]: Reloading health.", rrdhost_hostname(host)); @@ -346,7 +346,6 @@ static void health_reload_host(RRDHOST *host) { rrdcalctemplate_link_matching_templates_to_rrdset(st); } rrdset_foreach_done(st); - host->aclk_alert_reloaded = 1; } /** @@ -364,6 +363,12 @@ void health_reload(void) { health_reload_host(host); rrd_unlock(); + +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + aclk_alert_reloaded = 1; + } +#endif } // ---------------------------------------------------------------------------- @@ -445,8 +450,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec); - const char *recipient = (ae->recipient) ? 
ae_recipient(ae) : string2str(host->health_default_recipient); + const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec); + const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient); int n_warn=0, n_crit=0; RRDCALC *rc; @@ -720,7 +725,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) } static inline int check_if_resumed_from_suspension(void) { - static __thread usec_t last_realtime = 0, last_monotonic = 0; + static usec_t last_realtime = 0, last_monotonic = 0; usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec(); int ret = 0; @@ -736,18 +741,19 @@ static inline int check_if_resumed_from_suspension(void) { return ret; } -static void health_thread_cleanup(void *ptr) { +static void health_main_cleanup(void *ptr) { worker_unregister(); - struct health_state *h = ptr; - h->host->health_spawn = 0; + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; + static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + info("cleaning up..."); + static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; - log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host)); - debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host)); + log_health("Health thread ended."); } static void initialize_health(RRDHOST *host, int is_localhost) { - if(!host->health_enabled || + if(!host->health.health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) || !service_running(SERVICE_HEALTH)) return; @@ -756,8 +762,8 @@ static void initialize_health(RRDHOST *host, int is_localhost) { log_health("[%s]: Initializing health.", rrdhost_hostname(host)); - host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); - host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); + 
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); + host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); host->health_log.next_log_id = 1; host->health_log.next_alarm_id = 1; @@ -792,13 +798,13 @@ static void initialize_health(RRDHOST *host, int is_localhost) { error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename); } snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir); - host->health_log_filename = strdupz(filename); + host->health.health_log_filename = strdupz(filename); snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir); - host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); - host->health_default_recipient = string_strdupz("root"); + host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); + host->health.health_default_recipient = string_strdupz("root"); - if (!file_is_migrated(host->health_log_filename)) { + if (!file_is_migrated(host->health.health_log_filename)) { int rc = sql_create_health_log_table(host); if (unlikely(rc)) { log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host)); @@ -807,7 +813,7 @@ static void initialize_health(RRDHOST *host, int is_localhost) { } else { health_alarm_log_load(host); - add_migrated_file(host->health_log_filename, 0); + add_migrated_file(host->health.health_log_filename, 0); } } else { // TODO: This needs to go to the metadata thread @@ -834,16 +840,14 @@ static void initialize_health(RRDHOST *host, int is_localhost) { //Discard alarms with labels that do not apply to host rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); - - health_silencers_init(); } -static void health_sleep(time_t 
next_run, unsigned int loop __maybe_unused, RRDHOST *host) { +static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) { time_t now = now_realtime_sec(); if(now < next_run) { worker_is_idle(); debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); - while (now < next_run && host->health_enabled && service_running(SERVICE_HEALTH)) { + while (now < next_run && service_running(SERVICE_HEALTH)) { sleep_usec(USEC_PER_SEC); now = now_realtime_sec(); } @@ -1001,534 +1005,522 @@ void *health_main(void *ptr) { worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init"); worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init"); - struct health_state *h = ptr; - netdata_thread_cleanup_push(health_thread_cleanup, ptr); - - RRDHOST *host = h->host; - initialize_health(host, host == localhost); + netdata_thread_cleanup_push(health_main_cleanup, ptr); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; - int cleanup_sql_every_loop = 7200 / min_run_every; - - time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); bool health_running_logged = false; - rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); + rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts(); unsigned int loop = 0; #ifdef ENABLE_ACLK unsigned int marked_aclk_reload_loop = 0; #endif - while(service_running(SERVICE_HEALTH) && host->health_enabled) { + while(service_running(SERVICE_HEALTH)) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - now = now_realtime_sec(); + time_t now = now_realtime_sec(); int runnable = 0, apply_hibernation_delay = 0; time_t next_run = now + min_run_every; RRDCALC *rc; + RRDHOST *host; if 
(unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; log_health( - "[%s]: Postponing alarm checks for %"PRId64" seconds, " + "Postponing alarm checks for %"PRId64" seconds, " "because it seems that the system was just resumed from suspension.", - rrdhost_hostname(host), (int64_t)hibernation_delay); } if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { - static __thread int logged=0; + static int logged=0; if (!logged) { - log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.", - rrdhost_hostname(host), + log_health("Skipping health checks, because all alarms are disabled via a %s command.", HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } } #ifdef ENABLE_ACLK - if (host->aclk_alert_reloaded && !marked_aclk_reload_loop) + if (aclk_alert_reloaded && !marked_aclk_reload_loop) marked_aclk_reload_loop = loop; #endif - if (unlikely(apply_hibernation_delay)) { - log_health( - "[%s]: Postponing health checks for %"PRId64" seconds.", - rrdhost_hostname(host), - (int64_t)hibernation_delay); + worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); + rrd_rdlock(); - host->health_delay_up_to = now + hibernation_delay; - next_run = now + hibernation_delay; - health_sleep(next_run, loop, host); - } + rrdhost_foreach_read(host) { - if (unlikely(host->health_delay_up_to)) { - if (unlikely(now < host->health_delay_up_to)) { - next_run = host->health_delay_up_to; - health_sleep(next_run, loop, host); + if (unlikely(!host->health.health_enabled)) continue; - } - - log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); - host->health_delay_up_to = 0; - } - // wait until cleanup of obsolete charts on children is complete - if (host != localhost) { - if (unlikely(host->trigger_chart_obsoletion_check == 1)) { - log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); - health_sleep(next_run, loop, host); - continue; + if (unlikely(!rrdhost_flag_check(host, 
RRDHOST_FLAG_INITIALIZED_HEALTH))) { + rrd_unlock(); + initialize_health(host, host == localhost); + rrd_rdlock(); |