summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-05-09 16:34:31 +0300
committerGitHub <noreply@github.com>2022-05-09 16:34:31 +0300
commiteb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch)
tree353938a0f71da7b04d4f9b67769d2a38ba6db2cb /health/health.c
parent0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff)
Workers utilization charts (#12807)
* initial version of worker utilization * working example * without mutexes * monitoring DBENGINE, ACLKSYNC, WEB workers * added charts to monitor worker usage * fixed charts units * updated contexts * updated priorities * added documentation * converted threads to stacked chart * One query per query thread * Revert "One query per query thread" This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3. * fixed priority for web charts * read worker cpu utilization from proc * read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency * disabled web server cpu utilization monitoring - it is now monitored by worker utilization * tight integration of worker utilization to web server * monitoring statsd worker threads * code cleanup and renaming of variables * contrained worker and statistics conflict to just one variable * support for rendering jobs per type * better priorities and removed the total jobs chart * added busy time in ms per job type * added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads * isolated worker thread families * added cgroups.plugin workers * remove unneeded dimensions when then expected worker is just one * plugins.d and streaming monitoring * rebased; support worker_is_busy() to be called one after another * added diskspace plugin monitoring * added tc.plugin monitoring * added ML threads monitoring * dont create dimensions and charts that are not needed * fix crash when job types are added on the fly * added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to the POSIX * the right name is heartbeat for this chart * monitor streaming senders * added streaming senders to global stats * prevent division by zero * added clock_init() to external C plugins * added freebsd and macos plugins * added freebsd and macos to global statistics * dont use new as a variable; address compiler warnings on FreeBSD and MacOS * refactored contexts to be unique; added health threads monitoring Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'health/health.c')
-rw-r--r--health/health.c41
1 files changed, 41 insertions, 0 deletions
diff --git a/health/health.c b/health/health.c
index 4359c90f51..df3802497e 100644
--- a/health/health.c
+++ b/health/health.c
@@ -573,6 +573,8 @@ static inline int check_if_resumed_from_suspension(void) {
}
static void health_main_cleanup(void *ptr) {
+ worker_unregister();
+
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@@ -695,7 +697,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
*
* @return It always returns NULL
*/
+
+#define WORKER_HEALTH_JOB_RRD_LOCK 0
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
+#endif
+
void *health_main(void *ptr) {
+ worker_register("HEALTH");
+ worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
+ worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+
netdata_thread_cleanup_push(health_main_cleanup, ptr);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
@@ -743,6 +769,7 @@ void *health_main(void *ptr) {
marked_aclk_reload_loop = loop;
#endif
+ worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
rrd_rdlock();
RRDHOST *host;
@@ -772,6 +799,7 @@ void *health_main(void *ptr) {
init_pending_foreach_alarms(host);
+ worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
@@ -786,6 +814,7 @@ void *health_main(void *ptr) {
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
if (!rrdcalc_isrepeating(rc)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
@@ -820,6 +849,8 @@ void *health_main(void *ptr) {
// if there is database lookup, do it
if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
+ worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
+
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
@@ -876,6 +907,8 @@ void *health_main(void *ptr) {
// if there is calculation expression, run it
if (unlikely(rc->calculation)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
+
if (unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
rc->value = NAN;
@@ -924,6 +957,8 @@ void *health_main(void *ptr) {
// check the warning expression
if (likely(rc->warning)) {
+ worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
+
if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
@@ -948,6 +983,8 @@ void *health_main(void *ptr) {
// check the critical expression
if (likely(rc->critical)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
+
if (unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
@@ -1005,6 +1042,7 @@ void *health_main(void *ptr) {
// check if the new status and the old differ
if (status != rc->status) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
// apply trigger hysteresis
@@ -1086,6 +1124,7 @@ void *health_main(void *ptr) {
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
@@ -1118,6 +1157,7 @@ void *health_main(void *ptr) {
// execute notifications
// and cleanup
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
health_alarm_log_process(host);
if (unlikely(netdata_exit)) {
@@ -1156,6 +1196,7 @@ void *health_main(void *ptr) {
now = now_realtime_sec();
if(now < next_run) {
+ worker_is_idle();
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
now = now_realtime_sec();