summary refs log tree commit diff stats
path: root/health
diff options
context:
space:
mode:
author Costa Tsaousis <costa@netdata.cloud> 2022-10-05 14:13:46 +0300
committer GitHub <noreply@github.com> 2022-10-05 14:13:46 +0300
commit 8fc3b351a2e7fc96eced8f924de2e9cec9842128 (patch)
tree bde41c66573ccaf8876c280e00742cc6096b587c /health
parent 6850878e697d66dc90b9af1e750b22238c63c292 (diff)
Allow netdata plugins to expose functions for querying more information about specific charts (#13720)
* function renames and code cleanup in popen.c; no actual code changes * netdata popen() now opens both child process stdin and stdout and returns FILE * for both * pass both input and output to parser structures * updated rrdset to call custom functions * RRDSET FUNCTION leading calls for both sync and async operation * put RRDSET functions to a separate file * added format and timeout at function definition * support for synchronous (internal plugins) and asynchronous (external plugins and children) functions * /api/v1/function endpoint * functions are now attached to the host and there is a dictionary view per chart * functions implemented at plugins.d * remove the defer until keyword hook from plugins.d when it is done * stream sender implementation of functions * sanitization of all functions so that certain characters are only allowed * stricter sanitization * common max size * 1st working plugins.d example * always init inflight dictionary * properly destroy dictionaries to avoid parallel insertion of items * add more debugging on disconnection reasons * add more debugging on disconnection reasons again * streaming receiver respects newlines * don't use the same fp for both streaming receive and send * don't free dbengine memory with internal checks * make sender proceed in the buffer * added timing info and garbage collection at plugins.d * added info about routing nodes * added info about routing nodes with delay * added more info about delays * added more info about delays again * signal sending thread to wake up * streaming version labeling and commented code to support capabilities * added functions to /api/v1/data, /api/v1/charts, /api/v1/chart, /api/v1/info * redirect top output to stdout * address coverity findings * fix resource leaks of popen * log attempts to connect to individual destinations * better messages * properly parse destinations * try to find a function from the most matching to the least matching * log added streaming destinations * 
rotate destinations bypassing a node in the middle that does not accept our connection * break the loops properly * use typedef to define callbacks * capabilities negotiation during streaming * functions exposed upstream based on capabilities; compression disabled per node persisting reconnects; always try to connect with all capabilities * restore functionality to lookup functions * better logging of capabilities * remove old versions from capabilities when a newer version is there * fix formatting * optimization for plugins.d rrdlabels to avoid creating and destructing dictionaries all the time * delayed health initialization for rrddim and rrdset * cleanup health initialization * fix for popen() not returning the right value * add health worker jobs for initializing rrdset and rrddim * added content type support for functions; apps.plugin permanent function to display all the processes * fixes for functions parameters parsing in apps.plugin * fix for process matching in apps.plugin * first working function for apps.plugin * Dashboard ACL is disabled for functions; Function errors are all in JSON format * apps.plugin function processes returns json table * use json_escape_string() to escape message * fix formatting * apps.plugin exposes all its metrics to function processes * fix json formatting when filtering out some rows * reopen the internal pipe of rrdpush in case of errors * misplaced statement * do not use buffer->len * support for GLOBAL functions (functions that are not linked to a chart) * added /api/v1/functions endpoint; removed format from the FUNCTIONS api; * swagger documentation about the new api end points * added plugins.d documentation about functions * never re-close a file * remove unnecessary ifdef * fixed issues identified by codacy * fix for null label value * make edit-config copy-and-paste friendly * Revert "make edit-config copy-and-paste friendly" This reverts commit 54500c0e0a97f65a0c66c4d34e966f6a9056698e. 
* reworked sender handshake to fix coverity findings * timeout is zero, for both send_timeout() and recv_timeout() * properly detect that parent closed the socket * support caching of function responses; limit function response to 10MB; added protection from malformed function responses * disabled excessive logging * added units to apps.plugin function processes and normalized all values to be human readable * shorter field names * fixed issues reported * fixed apps.plugin error response; tested that pluginsd can properly handle faulty responses * use double linked list macros for double linked list management * faster apps.plugin function printing by minimizing file operations * added memory percentage * fix compatibility issues with older compilers and FreeBSD * rrdpush sender code cleanup; rrdhost structure cleanup from sender flags and variables; * fix leftover variable in ifdef * apps.plugin: do not call detach from the thread; exit immediately when input is broken * exclude AR charts from health * flush cleaner; prefer sender output * clarity * do not fill the cbuffer if not connected * fix * don't enable host->sender if streaming is not enabled; send host label updates to parent; * functions are only available through ACLK * Prepared statement reports only in dev mode * fix AR chart detection * fix for streaming not enabling itself * more cleanup of sender and receiver structures * moved read-only flags and configuration options to rrdhost->options * fixed merge with master * fix for incomplete rename * prevent service thread from working on charts that are being collected Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'health')
-rw-r--r-- health/health.c 79
1 files changed, 53 insertions, 26 deletions
diff --git a/health/health.c b/health/health.c
index be8b50ebb3..05dd2ca010 100644
--- a/health/health.c
+++ b/health/health.c
@@ -2,6 +2,22 @@
#include "health.h"
+#define WORKER_HEALTH_JOB_RRD_LOCK 0
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
+#endif
+
+
unsigned int default_health_enabled = 1;
char *silencers_filename;
@@ -659,22 +675,48 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
return 0;
}
-// Create alarms for dimensions that have been added to charts
-// since the previous iteration.
-static void health_execute_pending_updates(RRDHOST *host) {
+static void health_execute_delayed_initializations(RRDHOST *host) {
RRDSET *st;
- if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
- return;
+ if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
rrdset_foreach_reentrant(st, host) {
- if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
+ if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
+
+ if(unlikely(rrdset_is_ar_chart(st)))
continue;
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
+
+ if(!st->rrdfamily)
+ st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
+
+ if(!st->rrdvars)
+ st->rrdvars = rrdvariables_create();
+
+ rrddimvar_index_init(st);
+
+ rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "collected_total_raw", RRDVAR_TYPE_TOTAL, &st->last_collected_total, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
+
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
+
RRDDIM *rd;
rrddim_foreach_read(rd, st) {
- if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS))
- continue;
+ if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
+
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
+
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
RRDCALCTEMPLATE *rt;
foreach_rrdcalctemplate_read(host, rt) {
@@ -685,14 +727,10 @@ static void health_execute_pending_updates(RRDHOST *host) {
rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
}
foreach_rrdcalctemplate_done(rt);
-
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS);
}
rrddim_foreach_done(rd);
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
}
rrdset_foreach_done(st);
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
}
/**
@@ -705,19 +743,6 @@ static void health_execute_pending_updates(RRDHOST *host) {
* @return It always returns NULL
*/
-#define WORKER_HEALTH_JOB_RRD_LOCK 0
-#define WORKER_HEALTH_JOB_HOST_LOCK 1
-#define WORKER_HEALTH_JOB_DB_QUERY 2
-#define WORKER_HEALTH_JOB_CALC_EVAL 3
-#define WORKER_HEALTH_JOB_WARNING_EVAL 4
-#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
-#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
-#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
-
-#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
-#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
-#endif
-
void *health_main(void *ptr) {
worker_register("HEALTH");
worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
@@ -728,6 +753,8 @@ void *health_main(void *ptr) {
worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
netdata_thread_cleanup_push(health_main_cleanup, ptr);
@@ -809,7 +836,7 @@ void *health_main(void *ptr) {
if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
sql_health_alarm_log_cleanup(host);
- health_execute_pending_updates(host);
+ health_execute_delayed_initializations(host);
worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);