diff options
author | Emmanuel Vasilakis <mrzammler@mm.st> | 2022-10-19 18:30:12 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-19 18:30:12 +0300 |
commit | 42e85b5a092b18881501b0fe76f91e4969dca088 (patch) | |
tree | a93d3dacbe095e56cf8648fe8d72081badeb0948 /database | |
parent | b5e1e3690c9a8e59abe3a3a247ce7906f3259053 (diff) |
Health thread per host (#13712)
* Rebased
* rebased
* health_execute_pending_updates -> health_execute_delayed_initializations
* fix labels for current host only
* missing bracket
* misc fixes, reload health for disconnected hosts
* remove volatile, add comment
Diffstat (limited to 'database')
-rw-r--r-- | database/rrd.h | 22 | ||||
-rw-r--r-- | database/rrdhost.c | 93 | ||||
-rw-r--r-- | database/sqlite/sqlite_aclk_alert.c | 2 | ||||
-rw-r--r-- | database/sqlite/sqlite_health.c | 2 |
4 files changed, 26 insertions, 93 deletions
```diff
diff --git a/database/rrd.h b/database/rrd.h
index 0c8ef0ddf4..114953389c 100644
--- a/database/rrd.h
+++ b/database/rrd.h
@@ -948,16 +948,18 @@ struct rrdhost {
     // ------------------------------------------------------------------------
     // health monitoring options
-    unsigned int health_enabled; // 1 when this host has health enabled
-    time_t health_delay_up_to; // a timestamp to delay alarms processing up to
-    STRING *health_default_exec; // the full path of the alarms notifications program
-    STRING *health_default_recipient; // the default recipient for all alarms
-    char *health_log_filename; // the alarms event log filename
-    size_t health_log_entries_written; // the number of alarm events written to the alarms event log
-    FILE *health_log_fp; // the FILE pointer to the open alarms event log file
-    uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
-    uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
-
+    unsigned int health_enabled; // 1 when this host has health enabled
+    bool health_spawn; // true when health thread is running
+    netdata_thread_t health_thread; // the health thread
+    unsigned int aclk_alert_reloaded; // 1 on thread start and health reload, 0 after removed are sent
+    time_t health_delay_up_to; // a timestamp to delay alarms processing up to
+    STRING *health_default_exec; // the full path of the alarms notifications program
+    STRING *health_default_recipient; // the default recipient for all alarms
+    char *health_log_filename; // the alarms event log filename
+    size_t health_log_entries_written; // the number of alarm events written to the alarms event log
+    FILE *health_log_fp; // the FILE pointer to the open alarms event log file
+    uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
+    uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
     // all RRDCALCs are primarily allocated and linked here
     DICTIONARY *rrdcalc_root_index;
diff --git a/database/rrdhost.c b/database/rrdhost.c
index 46f2d6a72e..817728f807 100644
--- a/database/rrdhost.c
+++ b/database/rrdhost.c
@@ -231,82 +231,6 @@ static void rrdhost_initialize_rrdpush_sender(RRDHOST *host,
     rrdhost_option_clear(host, RRDHOST_OPTION_SENDER_ENABLED);
 }
-static void rrdhost_initialize_health(RRDHOST *host,
-                                      int is_localhost
-                                      ) {
-    if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return;
-    rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
-
-    rrdfamily_index_init(host);
-    rrdcalctemplate_index_init(host);
-    rrdcalc_rrdhost_index_init(host);
-
-    host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
-    host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
-
-    host->health_log.next_log_id = 1;
-    host->health_log.next_alarm_id = 1;
-    host->health_log.max = 1000;
-    host->health_log.next_log_id = (uint32_t)now_realtime_sec();
-    host->health_log.next_alarm_id = 0;
-
-    long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
-    if(n < 10) {
-        error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
-        config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
-    }
-    else
-        host->health_log.max = (unsigned int)n;
-
-    netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
-
-    char filename[FILENAME_MAX + 1];
-
-    if(!is_localhost) {
-        int r = mkdir(host->varlib_dir, 0775);
-        if (r != 0 && errno != EEXIST)
-            error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
-    }
-
-    {
-        snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
-        int r = mkdir(filename, 0775);
-        if(r != 0 && errno != EEXIST)
-            error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
-    }
-
-    snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
-    host->health_log_filename = strdupz(filename);
-
-    snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
-    host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
-    host->health_default_recipient = string_strdupz("root");
-
-    // ------------------------------------------------------------------------
-    // load health configuration
-
-    health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
-
-    if (!file_is_migrated(host->health_log_filename)) {
-        int rc = sql_create_health_log_table(host);
-        if (unlikely(rc)) {
-            error_report("Failed to create health log table in the database");
-            health_alarm_log_load(host);
-            health_alarm_log_open(host);
-        }
-        else {
-            health_alarm_log_load(host);
-            add_migrated_file(host->health_log_filename, 0);
-        }
-    } else {
-        // TODO: This needs to go to the metadata thread
-        // Health should wait before accessing the table (needs to be created by the metadata thread
-        sql_create_health_log_table(host);
-        sql_health_alarm_log_load(host);
-    }
-}
-
-
 RRDHOST *rrdhost_create(const char *hostname,
                         const char *registry_hostname,
                         const char *guid,
@@ -423,7 +347,12 @@ int is_legacy = 1;
     else
         error_report("Host machine GUID %s is not valid", host->machine_guid);
-    rrdhost_initialize_health(host, is_localhost);
+    rrdfamily_index_init(host);
+    rrdcalctemplate_index_init(host);
+    rrdcalc_rrdhost_index_init(host);
+
+    if (health_enabled)
+        health_thread_spawn(host);
     if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
 #ifdef ENABLE_DBENGINE
@@ -640,7 +569,9 @@ void rrdhost_update(RRDHOST *host
                 rrdpush_api_key,
                 rrdpush_send_charts_matching);
-        rrdhost_initialize_health(host, host == localhost);
+        rrdfamily_index_init(host);
+        rrdcalctemplate_index_init(host);
+        rrdcalc_rrdhost_index_init(host);
         rrd_hosts_available++;
         ml_new_host(host);
@@ -648,6 +579,9 @@ void rrdhost_update(RRDHOST *host
         info("Host %s is not in archived mode anymore", rrdhost_hostname(host));
     }
+    if (health_enabled)
+        health_thread_spawn(host);
+
     return;
 }
@@ -916,8 +850,6 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) {
         }
     }
-    health_init();
-
 unittest:
     metadata_sync_init();
     debug(D_RRDHOST, "Initializing localhost with hostname '%s'", hostname);
@@ -1119,6 +1051,7 @@ void rrdhost_free(RRDHOST *host, bool force) {
     freez(host->exporting_flags);
+    health_thread_stop(host);
     health_alarm_log_free(host);
 #ifdef ENABLE_DBENGINE
diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c
index 8723642a12..f7d4febac1 100644
--- a/database/sqlite/sqlite_aclk_alert.c
+++ b/database/sqlite/sqlite_aclk_alert.c
@@ -532,8 +532,6 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a
     freez(claim_id);
     buffer_free(sql);
-
-    aclk_alert_reloaded = 1;
 #endif
     return;
diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c
index b832a2df83..c189305b8d 100644
--- a/database/sqlite/sqlite_health.c
+++ b/database/sqlite/sqlite_health.c
@@ -806,7 +806,7 @@ void sql_health_alarm_log_load(RRDHOST *host) {
     if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id))
         host->health_log.next_alarm_id = host->health_max_alarm_id + 1;
-    info("HEALTH [%s]: Table health_log_%s, loaded %zd alarm entries, errors in %zd entries.", rrdhost_hostname(host), uuid_str, loaded, errored);
+    log_health("[%s]: Table health_log_%s, loaded %zd alarm entries, errors in %zd entries.", rrdhost_hostname(host), uuid_str, loaded, errored);
     ret = sqlite3_finalize(res);
     if (unlikely(ret != SQLITE_OK))
```