summaryrefslogtreecommitdiffstats
path: root/health/health_log.c
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2023-11-22 08:27:25 +0000
committerGitHub <noreply@github.com>2023-11-22 10:27:25 +0200
commit3e508c8f95ab0bdf8b6d74501437210d7b8d2919 (patch)
tree965caf50e73854f638bc9fbc4aebfbd4690619e5 /health/health_log.c
parent8f31356a0c0cb5956b9a31ffd5abb45d85de1656 (diff)
New logging layer (#16357)
* cleanup of logging - wip * first working iteration * add errno annotator * replace old logging functions with netdata_logger() * cleanup * update error_limit * fix remanining error_limit references * work on fatal() * started working on structured logs * full cleanup * default logging to files; fix all plugins initialization * fix formatting of numbers * cleanup and reorg * fix coverity issues * cleanup obsolete code * fix formatting of numbers * fix log rotation * fix for older systems * add detection of systemd journal via stderr * finished on access.log * remove left-over transport * do not add empty fields to the logs * journal get compact uuids; X-Transaction-ID header is added in web responses * allow compiling on systems without memfd sealing * added libnetdata/uuid directory * move datetime formatters to libnetdata * add missing files * link the makefiles in libnetdata * added uuid_parse_flexi() to parse UUIDs with and without hyphens; the web server now read X-Transaction-ID and uses it for functions and web responses * added stream receiver, sender, proc plugin and pluginsd log stack * iso8601 advanced usage; line_splitter module in libnetdata; code cleanup * add message ids to streaming inbound and outbound connections * cleanup line_splitter between lines to avoid logging garbage; when killing children, kill them with SIGABRT if internal checks is enabled * send SIGABRT to external plugins only if we are not shutting down * fix cross cleanup in pluginsd parser * fatal when there is a stack error in logs * compile netdata with -fexceptions * do not kill external plugins with SIGABRT * metasync info logs to debug level * added severity to logs * added json output; added options per log output; added documentation; fixed issues mentioned * allow memfd only on linux * moved journal low level functions to journal.c/h * move health logs to daemon.log with proper priorities * fixed a couple of bugs; health log in journal * updated docs * systemd-cat-native 
command to push structured logs to journal from the command line * fix makefiles * restored NETDATA_LOG_SEVERITY_LEVEL * fix makefiles * systemd-cat-native can also work as the logger of Netdata scripts * do not require a socket to systemd-journal to log-as-netdata * alarm notify logs in native format * properly compare log ids * fatals log alerts; alarm-notify.sh working * fix overflow warning * alarm-notify.sh now logs the request (command line) * anotate external plugins logs with the function cmd they run * added context, component and type to alarm-notify.sh; shell sanitization removes control character and characters that may be expanded by bash * reformatted alarm-notify logs * unify cgroup-network-helper.sh * added quotes around params * charts.d.plugin switched logging to journal native * quotes for logfmt * unify the status codes of streaming receivers and senders * alarm-notify: dont log anything, if there is nothing to do * all external plugins log to stderr when running outside netdata; alarm-notify now shows an error when notifications menthod are needed but are not available * migrate cgroup-name.sh to new logging * systemd-cat-native now supports messages with newlines * socket.c logs use priority * cleanup log field types * inherit the systemd set INVOCATION_ID if found * allow systemd-cat-native to send messages to a systemd-journal-remote URL * log2journal command that can convert structured logs to journal export format * various fixes and documentation of log2journal * updated log2journal docs * updated log2journal docs * updated documentation of fields * allow compiling without libcurl * do not use socket as format string * added version information to newly added tools * updated documentation and help messages * fix the namespace socket path * print errno with error * do not timeout * updated docs * updated docs * updated docs * log2journal updated docs and params * when talking to a remote journal, systemd-cat-native batches the messages * 
enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote * Revert "enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote" This reverts commit b079d53c11f6687cd64d804fdd7b24c0492bf245. * note about uncompressed traffic * log2journal: code reorg and cleanup to make modular * finished rewriting log2journal * more comments * rewriting rules support * increased limits * updated docs * updated docs * fix old log call * use journal only when stderr is connected to journal * update netdata.spec for libcurl, libpcre2 and log2journal * pcre2-devel * do not require pcre2 in centos < 8, amazonlinux < 2023, open suse * log2journal only on systems pcre2 is available * ignore log2journal in .gitignore * avoid log2journal on centos 7, amazonlinux 2 and opensuse * add pcre2-8 to static build * undo last commit * Bundle to static Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Add build deps for deb packages Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Add dependencies; build from source Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Test build for amazon linux and centos expect to fail for suse Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * fix minor oversight Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Reorg code * Add the install from source (deps) as a TODO * Not enable the build on suse ecosystem Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> --------- Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Diffstat (limited to 'health/health_log.c')
-rw-r--r--health/health_log.c73
1 files changed, 73 insertions, 0 deletions
diff --git a/health/health_log.c b/health/health_log.c
index 35f297007d..250937c58b 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -8,6 +8,79 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
sql_health_alarm_log_save(host, ae);
}
+
+/*
+ * Log an alert status transition through netdata_logger() on the
+ * NDLS_HEALTH log source, together with a stack of structured fields
+ * describing the alert entry.
+ *
+ * host      - the node the alert belongs to; its hostname is logged.
+ * ae        - the alarm entry describing the transition; its identifiers,
+ *             texts, old/new values and old/new statuses are logged.
+ * line,
+ * file,
+ * function  - forwarded unchanged to netdata_logger() as the call-site
+ *             trace information.
+ *
+ * The log priority is derived from the new status and, for some statuses,
+ * from the old status as well (see the switch below).
+ */
+void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) {
+    ND_LOG_STACK lgs[] = {
+        ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid),
+        ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname),
+        ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name),
+        ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context),
+        ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id),
+        ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id),
+        ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id),
+        ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id),
+        ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id),
+        ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name),
+        ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification),
+        ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component),
+        ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type),
+        ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec),
+        ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient),
+        // the source field must carry the alert's source, not a second copy of exec
+        ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->source),
+        ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units),
+        ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary),
+        ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info),
+        ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value),
+        ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value),
+        ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)),
+        ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)),
+        ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration),
+        ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code),
+        ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC),
+        ND_LOG_FIELD_END(),
+    };
+    ND_LOG_STACK_PUSH(lgs);
+
+    // NOTE(review): presumably cleared so the logger does not report a stale
+    // errno left over from an unrelated earlier call - confirm
+    errno = 0;
+
+    // statuses that do not override it below are logged at INFO
+    ND_LOG_FIELD_PRIORITY priority = NDLP_INFO;
+
+    switch(ae->new_status) {
+        case RRDCALC_STATUS_UNDEFINED:
+            // NOTICE only when the alert previously had a status of CLEAR or above
+            if(ae->old_status >= RRDCALC_STATUS_CLEAR)
+                priority = NDLP_NOTICE;
+            else
+                priority = NDLP_DEBUG;
+            break;
+
+        case RRDCALC_STATUS_UNINITIALIZED:
+        case RRDCALC_STATUS_REMOVED:
+        default:
+            priority = NDLP_DEBUG;
+            break;
+
+        case RRDCALC_STATUS_CLEAR:
+            priority = NDLP_INFO;
+            break;
+
+        case RRDCALC_STATUS_WARNING:
+            // only a raise to WARNING is logged as a warning;
+            // otherwise the priority stays at INFO
+            if(ae->old_status < RRDCALC_STATUS_WARNING)
+                priority = NDLP_WARNING;
+            break;
+
+        case RRDCALC_STATUS_CRITICAL:
+            // only a raise to CRITICAL is logged as critical;
+            // otherwise the priority stays at INFO
+            if(ae->old_status < RRDCALC_STATUS_CRITICAL)
+                priority = NDLP_CRIT;
+            break;
+    }
+
+    netdata_logger(NDLS_HEALTH, priority, file, function, line,
+                   "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s",
+                   string2str(ae->name), string2str(ae->chart), string2str(host->hostname),
+                   rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status)
+    );
+}
+
// ----------------------------------------------------------------------------
// health alarm log management