summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2023-11-22 08:27:25 +0000
committerGitHub <noreply@github.com>2023-11-22 10:27:25 +0200
commit3e508c8f95ab0bdf8b6d74501437210d7b8d2919 (patch)
tree965caf50e73854f638bc9fbc4aebfbd4690619e5 /health/health.c
parent8f31356a0c0cb5956b9a31ffd5abb45d85de1656 (diff)
New logging layer (#16357)
* cleanup of logging - wip * first working iteration * add errno annotator * replace old logging functions with netdata_logger() * cleanup * update error_limit * fix remanining error_limit references * work on fatal() * started working on structured logs * full cleanup * default logging to files; fix all plugins initialization * fix formatting of numbers * cleanup and reorg * fix coverity issues * cleanup obsolete code * fix formatting of numbers * fix log rotation * fix for older systems * add detection of systemd journal via stderr * finished on access.log * remove left-over transport * do not add empty fields to the logs * journal get compact uuids; X-Transaction-ID header is added in web responses * allow compiling on systems without memfd sealing * added libnetdata/uuid directory * move datetime formatters to libnetdata * add missing files * link the makefiles in libnetdata * added uuid_parse_flexi() to parse UUIDs with and without hyphens; the web server now read X-Transaction-ID and uses it for functions and web responses * added stream receiver, sender, proc plugin and pluginsd log stack * iso8601 advanced usage; line_splitter module in libnetdata; code cleanup * add message ids to streaming inbound and outbound connections * cleanup line_splitter between lines to avoid logging garbage; when killing children, kill them with SIGABRT if internal checks is enabled * send SIGABRT to external plugins only if we are not shutting down * fix cross cleanup in pluginsd parser * fatal when there is a stack error in logs * compile netdata with -fexceptions * do not kill external plugins with SIGABRT * metasync info logs to debug level * added severity to logs * added json output; added options per log output; added documentation; fixed issues mentioned * allow memfd only on linux * moved journal low level functions to journal.c/h * move health logs to daemon.log with proper priorities * fixed a couple of bugs; health log in journal * updated docs * systemd-cat-native command to push structured logs to journal from the command line * fix makefiles * restored NETDATA_LOG_SEVERITY_LEVEL * fix makefiles * systemd-cat-native can also work as the logger of Netdata scripts * do not require a socket to systemd-journal to log-as-netdata * alarm notify logs in native format * properly compare log ids * fatals log alerts; alarm-notify.sh working * fix overflow warning * alarm-notify.sh now logs the request (command line) * anotate external plugins logs with the function cmd they run * added context, component and type to alarm-notify.sh; shell sanitization removes control character and characters that may be expanded by bash * reformatted alarm-notify logs * unify cgroup-network-helper.sh * added quotes around params * charts.d.plugin switched logging to journal native * quotes for logfmt * unify the status codes of streaming receivers and senders * alarm-notify: dont log anything, if there is nothing to do * all external plugins log to stderr when running outside netdata; alarm-notify now shows an error when notifications menthod are needed but are not available * migrate cgroup-name.sh to new logging * systemd-cat-native now supports messages with newlines * socket.c logs use priority * cleanup log field types * inherit the systemd set INVOCATION_ID if found * allow systemd-cat-native to send messages to a systemd-journal-remote URL * log2journal command that can convert structured logs to journal export format * various fixes and documentation of log2journal * updated log2journal docs * updated log2journal docs * updated documentation of fields * allow compiling without libcurl * do not use socket as format string * added version information to newly added tools * updated documentation and help messages * fix the namespace socket path * print errno with error * do not timeout * updated docs * updated docs * updated docs * log2journal updated docs and params * when talking to a remote journal, systemd-cat-native batches the messages * enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote * Revert "enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote" This reverts commit b079d53c11f6687cd64d804fdd7b24c0492bf245. * note about uncompressed traffic * log2journal: code reorg and cleanup to make modular * finished rewriting log2journal * more comments * rewriting rules support * increased limits * updated docs * updated docs * fix old log call * use journal only when stderr is connected to journal * update netdata.spec for libcurl, libpcre2 and log2journal * pcre2-devel * do not require pcre2 in centos < 8, amazonlinux < 2023, open suse * log2journal only on systems pcre2 is available * ignore log2journal in .gitignore * avoid log2journal on centos 7, amazonlinux 2 and opensuse * add pcre2-8 to static build * undo last commit * Bundle to static Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Add build deps for deb packages Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Add dependencies; build from source Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Test build for amazon linux and centos expect to fail for suse Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * fix minor oversight Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Reorg code * Add the install from source (deps) as a TODO * Not enable the build on suse ecosystem Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> --------- Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Diffstat (limited to 'health/health.c')
-rw-r--r--health/health.c108
1 files changed, 83 insertions, 25 deletions
diff --git a/health/health.c b/health/health.c
index 390578799e..2ed15b3214 100644
--- a/health/health.c
+++ b/health/health.c
@@ -82,10 +82,13 @@ static bool prepare_command(BUFFER *wb,
const char *edit_command,
const char *machine_guid,
uuid_t *transition_id,
- const char *summary
+ const char *summary,
+ const char *context,
+ const char *component,
+ const char *type
) {
char buf[8192];
- size_t n = 8192 - 1;
+ size_t n = sizeof(buf) - 1;
buffer_strcat(wb, "exec");
@@ -195,6 +198,18 @@ static bool prepare_command(BUFFER *wb,
return false;
buffer_sprintf(wb, " '%s'", buf);
+ if (!sanitize_command_argument_string(buf, context, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, component, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, type, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
return true;
}
@@ -342,7 +357,9 @@ static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
- netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Reloading health.",
+ rrdhost_hostname(host));
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
@@ -436,8 +453,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
@@ -454,10 +473,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
// don't send the notification for the same status again
- netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
+ rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -476,11 +495,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s "
+ "(command API has disabled notifications)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Sending notification for alarm '%s.%s' status %s.",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
@@ -581,7 +605,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
edit_command,
host->machine_guid,
&ae->transition_id,
- host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae));
+ host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
+ string2str(ae->chart_context),
+ string2str(ae->component),
+ string2str(ae->type)
+ );
const char *command_to_run = buffer_tostring(wb);
if (ok) {
@@ -778,7 +806,8 @@ static void health_main_cleanup(void *ptr) {
netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- netdata_log_health("Health thread ended.");
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Health thread ended.");
}
static void initialize_health(RRDHOST *host)
@@ -790,7 +819,9 @@ static void initialize_health(RRDHOST *host)
rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
- netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Initializing health.",
+ rrdhost_hostname(host));
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
@@ -803,7 +834,11 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid max log entries %ld. "
+ "Using default %u",
+ rrdhost_hostname(host), n, host->health_log.max);
+
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
@@ -811,7 +846,11 @@ static void initialize_health(RRDHOST *host)
uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
if (m < HEALTH_LOG_MINIMUM_HISTORY) {
- netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid health log history %u. "
+ "Using minimum %d",
+ rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+
config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
m = HEALTH_LOG_MINIMUM_HISTORY;
}
@@ -823,7 +862,9 @@ static void initialize_health(RRDHOST *host)
} else
host->health_log.health_log_history = m;
- netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health log history is set to %u seconds (%u days)",
+ rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
@@ -1049,7 +1090,7 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_NOTICE,
"Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
(int64_t)hibernation_delay);
@@ -1058,8 +1099,9 @@ void *health_main(void *ptr) {
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
static int logged=0;
if (!logged) {
- netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Skipping health checks, because all alarms are disabled via a %s command.",
+ HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
@@ -1081,7 +1123,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
if (unlikely(apply_hibernation_delay)) {
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
"[%s]: Postponing health checks for %"PRId64" seconds.",
rrdhost_hostname(host),
(int64_t)hibernation_delay);
@@ -1094,20 +1136,30 @@ void *health_main(void *ptr) {
continue;
}
- netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Resuming health checks after delay.",
+ rrdhost_hostname(host));
+
host->health.health_delay_up_to = 0;
}
// wait until cleanup of obsolete charts on children is complete
if (host != localhost) {
if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Waiting for chart obsoletion check.",
+ rrdhost_hostname(host));
+
continue;
}
}
if (!health_running_logged) {
- netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health is running.",
+ rrdhost_hostname(host));
+
health_running_logged = true;
}
@@ -1161,6 +1213,7 @@ void *health_main(void *ptr) {
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
if (ae) {
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
@@ -1432,9 +1485,13 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
- netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Alert event for [%s.%s], value [%s], status [%s].",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae),
+ rrdcalc_status2string(ae->new_status));
rc->last_status_change_value = rc->value;
rc->last_status_change = now;
@@ -1519,6 +1576,7 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
ae->last_repeat = rc->last_repeat;
if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
ae->flags |= HEALTH_ENTRY_RUN_ONCE;