summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- aclk/aclk.c 2
-rw-r--r-- aclk/aclk.h 2
-rw-r--r-- daemon/service.c 2
-rw-r--r-- daemon/static_threads.c 9
-rw-r--r-- database/rrd.h 24
-rw-r--r-- database/rrdcalc.c 4
-rw-r--r-- database/rrddimvar.c 4
-rw-r--r-- database/rrdhost.c 27
-rw-r--r-- database/rrdset.c 4
-rw-r--r-- database/rrdsetvar.c 4
-rw-r--r-- database/sqlite/sqlite_aclk_alert.c 10
-rw-r--r-- database/sqlite/sqlite_health.c 14
-rw-r--r-- database/sqlite/sqlite_metadata.c 13
-rw-r--r-- health/health.c 893
-rw-r--r-- health/health.h 7
-rw-r--r-- health/health_config.c 10
-rw-r--r-- health/health_json.c 10
-rw-r--r-- health/health_log.c 74
-rw-r--r-- streaming/receiver.c 4
19 files changed, 554 insertions, 563 deletions
diff --git a/aclk/aclk.c b/aclk/aclk.c
index 9579912521..04b81b9eb0 100644
--- a/aclk/aclk.c
+++ b/aclk/aclk.c
@@ -49,6 +49,8 @@ float last_backoff_value = 0;
time_t aclk_block_until = 0;
+int aclk_alert_reloaded = 0; //1 on health log exchange, and again on health_reload
+
#ifdef ENABLE_ACLK
mqtt_wss_client mqttwss_client;
diff --git a/aclk/aclk.h b/aclk/aclk.h
index 6aed548b74..56b24add92 100644
--- a/aclk/aclk.h
+++ b/aclk/aclk.h
@@ -26,6 +26,8 @@ extern time_t aclk_block_until;
extern int disconnect_req;
+extern int aclk_alert_reloaded;
+
#ifdef ENABLE_ACLK
void *aclk_main(void *ptr);
diff --git a/daemon/service.c b/daemon/service.c
index fd19a3cc6f..3f5e8c55a3 100644
--- a/daemon/service.c
+++ b/daemon/service.c
@@ -201,7 +201,7 @@ static void svc_rrd_cleanup_obsolete_charts_from_all_hosts() {
&& (
(
host->child_last_chart_command
- && host->child_last_chart_command + host->health_delay_up_to < now_realtime_sec()
+ && host->child_last_chart_command + host->health.health_delay_up_to < now_realtime_sec()
)
|| (host->child_connect_time + TIME_TO_RUN_OBSOLETIONS_ON_CHILD_CONNECT < now_realtime_sec())
)
diff --git a/daemon/static_threads.c b/daemon/static_threads.c
index 51dbf64061..ff43fc0ccc 100644
--- a/daemon/static_threads.c
+++ b/daemon/static_threads.c
@@ -37,6 +37,15 @@ const struct netdata_static_thread static_threads_common[] = {
.start_routine = cpuidlejitter_main
},
{
+ .name = "HEALTH",
+ .config_section = NULL,
+ .config_name = NULL,
+ .enabled = 1,
+ .thread = NULL,
+ .init_routine = NULL,
+ .start_routine = health_main
+ },
+ {
.name = "ANALYTICS",
.config_section = NULL,
.config_name = NULL,
diff --git a/database/rrd.h b/database/rrd.h
index 77cfb5aee6..1d6a869d22 100644
--- a/database/rrd.h
+++ b/database/rrd.h
@@ -895,6 +895,17 @@ typedef struct alarm_log {
netdata_rwlock_t alarm_log_rwlock;
} ALARM_LOG;
+typedef struct health {
+ unsigned int health_enabled; // 1 when this host has health enabled
+ time_t health_delay_up_to; // a timestamp to delay alarms processing up to
+ STRING *health_default_exec; // the full path of the alarms notifications program
+ STRING *health_default_recipient; // the default recipient for all alarms
+ char *health_log_filename; // the alarms event log filename
+ size_t health_log_entries_written; // the number of alarm events written to the alarms event log
+ FILE *health_log_fp; // the FILE pointer to the open alarms event log file
+ uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
+ uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
+} HEALTH;
// ----------------------------------------------------------------------------
// RRD HOST
@@ -1012,17 +1023,8 @@ struct rrdhost {
// ------------------------------------------------------------------------
// health monitoring options
- unsigned int health_enabled; // 1 when this host has health enabled
- bool health_spawn; // true when health thread is running
- unsigned int aclk_alert_reloaded; // 1 on thread start and health reload, 0 after removed are sent
- time_t health_delay_up_to; // a timestamp to delay alarms processing up to
- STRING *health_default_exec; // the full path of the alarms notifications program
- STRING *health_default_recipient; // the default recipient for all alarms
- char *health_log_filename; // the alarms event log filename
- size_t health_log_entries_written; // the number of alarm events written to the alarms event log
- FILE *health_log_fp; // the FILE pointer to the open alarms event log file
- uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
- uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
+ // health variables
+ HEALTH health;
// all RRDCALCs are primarily allocated and linked here
DICTIONARY *rrdcalc_root_index;
diff --git a/database/rrdcalc.c b/database/rrdcalc.c
index 45338a8260..153d58513e 100644
--- a/database/rrdcalc.c
+++ b/database/rrdcalc.c
@@ -739,7 +739,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(RRDHOST *host
continue;
if(!rrdlabels_match_simple_pattern_parsed(host->rrdlabels, rc->host_labels_pattern, '=')) {
- info("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'",
+ log_health("Health configuration for alarm '%s' cannot be applied, because the host %s does not have the label(s) '%s'",
rrdcalc_name(rc),
rrdhost_hostname(host),
rrdcalc_host_labels(rc));
@@ -755,7 +755,7 @@ void rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts() {
RRDHOST *host;
rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
+ if (unlikely(!host->health.health_enabled))
continue;
if (host->rrdlabels)
diff --git a/database/rrddimvar.c b/database/rrddimvar.c
index 449ceeb937..be5235239f 100644
--- a/database/rrddimvar.c
+++ b/database/rrddimvar.c
@@ -65,7 +65,7 @@ static inline void rrddimvar_free_variables_unsafe(RRDDIMVAR *rs) {
// HOST VARIABLES FOR THIS DIMENSION
- if(host->rrdvars && host->health_enabled) {
+ if(host->rrdvars && host->health.health_enabled) {
rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id_dim_id);
rs->rrdvar_host_chart_id_dim_id = NULL;
@@ -152,7 +152,7 @@ static inline void rrddimvar_update_variables_unsafe(RRDDIMVAR *rs) {
// - $chart-name.id
// - $chart-name.name
- if(host->rrdvars && host->health_enabled) {
+ if(host->rrdvars && host->health.health_enabled) {
rs->rrdvar_host_chart_id_dim_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value);
rs->rrdvar_host_chart_id_dim_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id_dim_name, rs->type, RRDVAR_FLAG_NONE, rs->value);
rs->rrdvar_host_chart_name_dim_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_name_dim_id, rs->type, RRDVAR_FLAG_NONE, rs->value);
diff --git a/database/rrdhost.c b/database/rrdhost.c
index 94e5a4770f..6dd7db91be 100644
--- a/database/rrdhost.c
+++ b/database/rrdhost.c
@@ -281,8 +281,8 @@ int is_legacy = 1;
rrdhost_init_hostname(host, hostname, false);
- host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries);
- host->health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled;
+ host->rrd_history_entries = align_entries_to_pagesize(memory_mode, entries);
+ host->health.health_enabled = ((memory_mode == RRD_MEMORY_MODE_NONE)) ? 0 : health_enabled;
if (likely(!archived)) {
rrdfunctions_init(host);
@@ -366,9 +366,6 @@ int is_legacy = 1;
rrdcalc_rrdhost_index_init(host);
metaqueue_host_update_info(host);
- if (health_enabled)
- health_thread_spawn(host);
-
if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
#ifdef ENABLE_DBENGINE
char dbenginepath[FILENAME_MAX + 1];
@@ -516,12 +513,12 @@ int is_legacy = 1;
, rrdhost_has_rrdpush_sender_enabled(host)?"enabled":"disabled"
, host->rrdpush_send_destination?host->rrdpush_send_destination:""
, host->rrdpush_send_api_key?host->rrdpush_send_api_key:""
- , host->health_enabled?"enabled":"disabled"
+ , host->health.health_enabled?"enabled":"disabled"
, host->cache_dir
, host->varlib_dir
- , host->health_log_filename
- , string2str(host->health_default_exec)
- , string2str(host->health_default_recipient)
+ , host->health.health_log_filename
+ , string2str(host->health.health_default_exec)
+ , string2str(host->health.health_default_recipient)
);
if(!archived)
@@ -566,7 +563,7 @@ static void rrdhost_update(RRDHOST *host
netdata_spinlock_lock(&host->rrdhost_update_lock);
- host->health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled;
+ host->health.health_enabled = (mode == RRD_MEMORY_MODE_NONE) ? 0 : health_enabled;
{
struct rrdhost_system_info *old = host->system_info;
@@ -651,9 +648,6 @@ static void rrdhost_update(RRDHOST *host
info("Host %s is not in archived mode anymore", rrdhost_hostname(host));
}
- if (health_enabled)
- health_thread_spawn(host);
-
netdata_spinlock_unlock(&host->rrdhost_update_lock);
}
@@ -1175,9 +1169,9 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
freez(host->rrdpush_send_api_key);
freez(host->rrdpush_send_destination);
rrdpush_destinations_free(host);
- string_freez(host->health_default_exec);
- string_freez(host->health_default_recipient);
- freez(host->health_log_filename);
+ string_freez(host->health.health_default_exec);
+ string_freez(host->health.health_default_recipient);
+ freez(host->health.health_log_filename);
string_freez(host->registry_hostname);
simple_pattern_free(host->rrdpush_send_charts_matching);
netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock);
@@ -1376,7 +1370,6 @@ void reload_host_labels(void) {
health_label_log_save(localhost);
rrdpush_send_host_labels(localhost);
- health_reload();
}
// ----------------------------------------------------------------------------
diff --git a/database/rrdset.c b/database/rrdset.c
index 1aaff8a6b9..140a63953f 100644
--- a/database/rrdset.c
+++ b/database/rrdset.c
@@ -168,7 +168,7 @@ static void rrdset_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, v
// chart variables - we need this for data collection to work (collector given chart variables) - not only health
rrdsetvar_index_init(st);
- if (host->health_enabled) {
+ if (host->health.health_enabled) {
st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
st->rrdvars = rrdvariables_create();
rrddimvar_index_init(st);
@@ -366,7 +366,7 @@ static void rrdset_react_callback(const DICTIONARY_ITEM *item __maybe_unused, vo
st->last_accessed_time_s = now_realtime_sec();
- if(host->health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_CHART_ACTIVATED))) {
+ if(host->health.health_enabled && (ctr->react_action & (RRDSET_REACT_NEW | RRDSET_REACT_CHART_ACTIVATED))) {
rrdset_flag_set(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
rrdhost_flag_set(st->rrdhost, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
}
diff --git a/database/rrdsetvar.c b/database/rrdsetvar.c
index 22cf8a1f01..9684310cdf 100644
--- a/database/rrdsetvar.c
+++ b/database/rrdsetvar.c
@@ -43,7 +43,7 @@ static inline void rrdsetvar_free_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) {
// ------------------------------------------------------------------------
// HOST
- if(host->rrdvars && host->health_enabled) {
+ if(host->rrdvars && host->health.health_enabled) {
rrdvar_release_and_del(host->rrdvars, rs->rrdvar_host_chart_id);
rs->rrdvar_host_chart_id = NULL;
@@ -93,7 +93,7 @@ static inline void rrdsetvar_update_rrdvars_unsafe(RRDSET *st, RRDSETVAR *rs) {
// ------------------------------------------------------------------------
// HOST
- if(host->rrdvars && host->health_enabled) {
+ if(host->rrdvars && host->health.health_enabled) {
rs->rrdvar_host_chart_id = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_id, rs->type, options, rs->value);
rs->rrdvar_host_chart_name = rrdvar_add_and_acquire("host", host->rrdvars, key_chart_name, rs->type, options, rs->value);
}
diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c
index 60cce7165a..6151d9c885 100644
--- a/database/sqlite/sqlite_aclk_alert.c
+++ b/database/sqlite/sqlite_aclk_alert.c
@@ -314,7 +314,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d
alarm_log.utc_offset = wc->host->utc_offset;
alarm_log.timezone = strdupz(rrdhost_abbrev_timezone(wc->host));
alarm_log.exec_path = sqlite3_column_bytes(res, 14) > 0 ? strdupz((char *)sqlite3_column_text(res, 14)) :
- strdupz((char *)string2str(wc->host->health_default_exec));
+ strdupz((char *)string2str(wc->host->health.health_default_exec));
alarm_log.conf_source = strdupz((char *)sqlite3_column_text(res, 16));
char *edit_command = sqlite3_column_bytes(res, 16) > 0 ?
@@ -531,7 +531,7 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a
alarm_log.node_id = wc->node_id;
alarm_log.log_entries = log_entries;
alarm_log.status = wc->alert_updates == 0 ? 2 : 1;
- alarm_log.enabled = (int)host->health_enabled;
+ alarm_log.enabled = (int)host->health.health_enabled;
wc->alert_sequence_id = last_sequence;
@@ -544,6 +544,8 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a
freez(claim_id);
buffer_free(sql);
+
+ aclk_alert_reloaded = 1;
#endif
return;
@@ -709,7 +711,7 @@ void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start
(struct aclk_database_worker_config *)host->dbsync_worker :
(struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id);
- if (unlikely(!host->health_enabled)) {
+ if (unlikely(!host->health.health_enabled)) {
log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id);
return;
}
@@ -849,7 +851,7 @@ void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_EN
alarm_log->utc_offset = host->utc_offset;
alarm_log->timezone = strdupz(rrdhost_abbrev_timezone(host));
- alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health_default_exec));
+ alarm_log->exec_path = ae->exec ? strdupz(ae_exec(ae)) : strdupz((char *)string2str(host->health.health_default_exec));
alarm_log->conf_source = ae->source ? strdupz(ae_source(ae)) : strdupz((char *)"");
alarm_log->command = strdupz((char *)edit_command);
diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c
index c189305b8d..dc4020deff 100644
--- a/database/sqlite/sqlite_health.c
+++ b/database/sqlite/sqlite_health.c
@@ -337,7 +337,7 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) {
}
ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
- host->health_log_entries_written++;
+ host->health.health_log_entries_written++;
failed:
if (unlikely(sqlite3_finalize(res) != SQLITE_OK))
@@ -369,7 +369,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) {
if(rotate_every < 100) rotate_every = 100;
}
- if(likely(host->health_log_entries_written < rotate_every)) {
+ if(likely(host->health.health_log_entries_written < rotate_every)) {
return;
}
@@ -382,7 +382,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) {
char uuid_str[GUID_LEN + 1];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health_log_entries_written - rotate_every)));
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every)));
rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
if (unlikely(rc != SQLITE_OK)) {
@@ -398,7 +398,7 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) {
if (unlikely(rc != SQLITE_OK))
error_report("Failed to finalize the prepared statement to cleanup health log table");
- host->health_log_entries_written = rotate_every;
+ host->health.health_log_entries_written = rotate_every;
sql_aclk_alert_clean_dead_entries(host);
}
@@ -431,13 +431,13 @@ void sql_health_alarm_log_count(RRDHOST *host) {
rc = sqlite3_step_monitored(res);
if (likely(rc == SQLITE_ROW))
- host->health_log_entries_written = (size_t) sqlite3_column_int64(res, 0);
+ host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0);
rc = sqlite3_finalize(res);
if (unlikely(rc != SQLITE_OK))
error_report("Failed to finalize the prepared statement to count health log entries from db");
- info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health_log_entries_written);
+ info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written);
}
#define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \
@@ -612,7 +612,7 @@ void sql_health_alarm_log_load(RRDHOST *host) {
ssize_t errored = 0, loaded = 0;
char command[MAX_HEALTH_SQL_SIZE + 1];
- host->health_log_entries_written = 0;
+ host->health.health_log_entries_written = 0;
if (unlikely(!db_meta)) {
if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE)
diff --git a/database/sqlite/sqlite_metadata.c b/database/sqlite/sqlite_metadata.c
index 28cd18e40e..2733806ca3 100644
--- a/database/sqlite/sqlite_metadata.c
+++ b/database/sqlite/sqlite_metadata.c
@@ -328,7 +328,7 @@ static int sql_store_host_info(RRDHOST *host)
if (unlikely(rc != SQLITE_OK))
goto bind_fail;
- rc = sqlite3_bind_int(res, ++param, (int ) host->health_enabled);
+ rc = sqlite3_bind_int(res, ++param, (int ) host->health.health_enabled);
if (unlikely(rc != SQLITE_OK))
goto bind_fail;
@@ -686,6 +686,16 @@ skip_run:
error_report("Failed to finalize the prepared statement when reading dimensions");
}
+static void cleanup_health_log(void)
+{
+ RRDHOST *host;
+ dfe_start_reentrant(rrdhost_root_index, host) {
+ if (rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED))
+ continue;
+ sql_health_alarm_log_cleanup(host);
+ }
+ dfe_done(host);
+}
//
// EVENT LOOP STARTS HERE
@@ -845,6 +855,7 @@ static void start_metadata_cleanup(uv_work_t *req)
worker_is_busy(UV_EVENT_METADATA_CLEANUP);
struct metadata_wc *wc = req->data;
check_dimension_metadata(wc);
+ cleanup_health_log();
worker_is_idle();
}
diff --git a/health/health.c b/health/health.c
index 0ad1f122a0..36fcd2f2b1 100644
--- a/health/health.c
+++ b/health/health.c
@@ -162,7 +162,7 @@ char *silencers_filename;
SIMPLE_PATTERN *conf_enabled_alarms = NULL;
// the queue of executed alarm notifications that haven't been waited for yet
-static __thread struct {
+static struct {
ALARM_ENTRY *head; // oldest
ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
@@ -302,7 +302,7 @@ void health_init(void) {
* @param host the structure of the host that the function will reload the configuration.
*/
static void health_reload_host(RRDHOST *host) {
- if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
+ if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
log_health("[%s]: Reloading health.", rrdhost_hostname(host));
@@ -346,7 +346,6 @@ static void health_reload_host(RRDHOST *host) {
rrdcalctemplate_link_matching_templates_to_rrdset(st);
}
rrdset_foreach_done(st);
- host->aclk_alert_reloaded = 1;
}
/**
@@ -364,6 +363,12 @@ void health_reload(void) {
health_reload_host(host);
rrd_unlock();
+
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting) {
+ aclk_alert_reloaded = 1;
+ }
+#endif
}
// ----------------------------------------------------------------------------
@@ -445,8 +450,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
- const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
+ const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
+ const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
int n_warn=0, n_crit=0;
RRDCALC *rc;
@@ -720,7 +725,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
}
static inline int check_if_resumed_from_suspension(void) {
- static __thread usec_t last_realtime = 0, last_monotonic = 0;
+ static usec_t last_realtime = 0, last_monotonic = 0;
usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
int ret = 0;
@@ -736,18 +741,19 @@ static inline int check_if_resumed_from_suspension(void) {
return ret;
}
-static void health_thread_cleanup(void *ptr) {
+static void health_main_cleanup(void *ptr) {
worker_unregister();
- struct health_state *h = ptr;
- h->host->health_spawn = 0;
+ struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
+ static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
+ info("cleaning up...");
+ static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host));
- debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host));
+ log_health("Health thread ended.");
}
static void initialize_health(RRDHOST *host, int is_localhost) {
- if(!host->health_enabled ||
+ if(!host->health.health_enabled ||
rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) ||
!service_running(SERVICE_HEALTH))
return;
@@ -756,8 +762,8 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
log_health("[%s]: Initializing health.", rrdhost_hostname(host));
- host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
- host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
+ host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
+ host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
host->health_log.next_log_id = 1;
host->health_log.next_alarm_id = 1;
@@ -792,13 +798,13 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
}
snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
- host->health_log_filename = strdupz(filename);
+ host->health.health_log_filename = strdupz(filename);
snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
- host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
- host->health_default_recipient = string_strdupz("root");
+ host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
+ host->health.health_default_recipient = string_strdupz("root");
- if (!file_is_migrated(host->health_log_filename)) {
+ if (!file_is_migrated(host->health.health_log_filename)) {
int rc = sql_create_health_log_table(host);
if (unlikely(rc)) {
log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host));
@@ -807,7 +813,7 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
}
else {
health_alarm_log_load(host);
- add_migrated_file(host->health_log_filename, 0);
+ add_migrated_file(host->health.health_log_filename, 0);
}
} else {
// TODO: This needs to go to the metadata thread
@@ -834,16 +840,14 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
//Discard alarms with labels that do not apply to host
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
-
- health_silencers_init();
}
-static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) {
+static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
time_t now = now_realtime_sec();
if(now < next_run) {
worker_is_idle();
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
- while (now < next_run && host->health_enabled && service_running(SERVICE_HEALTH)) {
+ while (now < next_run && service_running(SERVICE_HEALTH)) {
sleep_usec(USEC_PER_SEC);
now = now_realtime_sec();
}
@@ -1001,534 +1005,522 @@ void *health_main(void *ptr) {
worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
- struct health_state *h = ptr;
- netdata_thread_cleanup_push(health_thread_cleanup, ptr);
-
- RRDHOST *host = h->host;
- initialize_health(host, host == localhost);
+ netdata_thread_cleanup_push(health_main_cleanup, ptr);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
if(min_run_every < 1) min_run_every = 1;
- int cleanup_sql_every_loop = 7200 / min_run_every;
-
- time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
bool health_running_logged = false;
- rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
+ rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
unsigned int loop = 0;
#ifdef ENABLE_ACLK
unsigned int marked_aclk_reload_loop = 0;
#endif
- while(service_running(SERVICE_HEALTH) && host->health_enabled) {
+ while(service_running(SERVICE_HEALTH)) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
- now = now_realtime_sec();
+ time_t now = now_realtime_sec();
int runnable = 0, apply_hibernation_delay = 0;
time_t next_run = now + min_run_every;
RRDCALC *rc;
+ RRDHOST *host;
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
log_health(
- "[%s]: Postponing alarm checks for %"PRId64" seconds, "
+ "Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
- rrdhost_hostname(host),
(int64_t)hibernation_delay);
}
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
- static __thread int logged=0;
+ static int logged=0;
if (!logged) {
- log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.",
- rrdhost_hostname(host),
+ log_health("Skipping health checks, because all alarms are disabled via a %s command.",
HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
#ifdef ENABLE_ACLK
- if (host->aclk_alert_reloaded && !marked_aclk_reload_loop)
+ if (aclk_alert_reloaded && !marked_aclk_reload_loop)
marked_aclk_reload_loop = loop;
#endif
- if (unlikely(apply_hibernation_delay)) {
- log_health(
- "[%s]: Postponing health checks for %"PRId64" seconds.",
- rrdhost_hostname(host),
- (int64_t)hibernation_delay);
+ worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
+ rrd_rdlock();
- host->health_delay_up_to = now + hibernation_delay;
- next_run = now + hibernation_delay;
- health_sleep(next_run, loop, host);
- }
+ rrdhost_foreach_read(host) {
- if (unlikely(host->health_delay_up_to)) {
- if (unlikely(now < host->health_delay_up_to)) {
- next_run = host->health_delay_up_to;
- health_sleep(next_run, loop, host);
+ if (unlikely(!host->health.health_enabled))
continue;
- }
-
- log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
- host->health_delay_up_to = 0;
- }
- // wait until cleanup of obsolete charts on children is complete
- if (host != localhost) {
- if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
- health_sleep(next_run, loop, host);
- continue;
+ if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) {
+ rrd_unlock();
+ initialize_health(host, host == localhost);
+ rrd_rdlock();
}
- }
-
- if (!health_running_logged) {
- log_health("[%s]: Health is running.", rrdhost_hostname(host));
- health_running_logged = true;
- }
- if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
- sql_health_alarm_log_cleanup(host);
+ health_execute_delayed_initializations(host);
- health_execute_delayed_initializations(host);
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
- worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
-
- // the first loop is to lookup values from the db
- foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if (unlikely(apply_hibernation_delay)) {
+ log_health(
+ "[%s]: Postponing health checks for %"PRId64" seconds.",