Workers utilization charts (#12807)

* initial version of worker utilization * working example * without mutexes * monitoring DBENGINE, ACLKSYNC, WEB workers * added charts to monitor worker usage * fixed charts units * updated contexts * updated priorities * added documentation * converted threads to stacked chart * One query per query thread * Revert "One query per query thread" This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3. * fixed priority for web charts * read worker cpu utilization from proc * read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency * disabled web server cpu utilization monitoring - it is now monitored by worker utilization * tight integration of worker utilization to web server * monitoring statsd worker threads * code cleanup and renaming of variables * constrained worker and statistics conflict to just one variable * support for rendering jobs per type * better priorities and removed the total jobs chart * added busy time in ms per job type * added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads * isolated worker thread families * added cgroups.plugin workers * remove unneeded dimensions when the expected worker is just one * plugins.d and streaming monitoring * rebased; support worker_is_busy() to be called one after another * added diskspace plugin monitoring * added tc.plugin monitoring * added ML threads monitoring * don't create dimensions and charts that are not needed * fix crash when job types are added on the fly * added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to the POSIX * the right name is heartbeat for this chart * monitor streaming senders * added streaming senders to global stats * prevent division by zero * added clock_init() to external C plugins * added freebsd and macos plugins * added freebsd and macos to global 
statistics * don't use new as a variable; address compiler warnings on FreeBSD and MacOS * refactored contexts to be unique; added health threads monitoring Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
author: Costa Tsaousis <costa@netdata.cloud> 2022-05-09 16:34:31 +0300
committer: GitHub <noreply@github.com> 2022-05-09 16:34:31 +0300
commit: eb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch)
tree: 353938a0f71da7b04d4f9b67769d2a38ba6db2cb /collectors/cgroups.plugin
parent: 0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff)
1 files changed, 62 insertions, 40 deletions
diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/collectors/cgroups.plugin/sys_fs_cgroup.c
index 9453d1b712..bf78624f99 100644
--- a/collectors/cgroups.plugin/sys_fs_cgroup.c
+++ b/collectors/cgroups.plugin/sys_fs_cgroup.c
@@ -2646,11 +2646,26 @@ static inline void discovery_process_cgroup(struct cgroup *cg) {
     read_cgroup_network_interfaces(cg);
 }
 
+#define WORKER_DISCOVERY_INIT 0
+#define WORKER_DISCOVERY_FIND 1
+#define WORKER_DISCOVERY_PROCESS 2
+#define WORKER_DISCOVERY_UPDATE 3
+#define WORKER_DISCOVERY_CLEANUP 4
+#define WORKER_DISCOVERY_COPY 5
+#define WORKER_DISCOVERY_SHARE 6
+#define WORKER_DISCOVERY_LOCK 7
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
+#endif
+
 static inline void discovery_find_all_cgroups() {
     debug(D_CGROUP, "searching for cgroups");
 
+    worker_is_busy(WORKER_DISCOVERY_INIT);
     discovery_mark_all_cgroups_as_unavailable();
 
+    worker_is_busy(WORKER_DISCOVERY_FIND);
     if (!cgroup_use_unified_cgroups) {
         discovery_find_all_cgroups_v1();
     } else {
@@ -2659,16 +2674,25 @@ static inline void discovery_find_all_cgroups() {
 
     struct cgroup *cg;
     for (cg = discovered_cgroup_root; cg; cg = cg->discovered_next) {
+        worker_is_busy(WORKER_DISCOVERY_PROCESS);
         discovery_process_cgroup(cg);
     }
 
+    worker_is_busy(WORKER_DISCOVERY_UPDATE);
     discovery_update_filenames();
 
+    worker_is_busy(WORKER_DISCOVERY_LOCK);
     uv_mutex_lock(&cgroup_root_mutex);
+
+    worker_is_busy(WORKER_DISCOVERY_CLEANUP);
     discovery_cleanup_all_cgroups();
+
+    worker_is_busy(WORKER_DISCOVERY_COPY);
     discovery_copy_discovered_cgroups_to_reader();
+
     uv_mutex_unlock(&cgroup_root_mutex);
 
+    worker_is_busy(WORKER_DISCOVERY_SHARE);
     discovery_share_cgroups_with_ebpf();
 
     debug(D_CGROUP, "done searching for cgroups");
@@ -2678,7 +2702,19 @@ void cgroup_discovery_worker(void *ptr)
 {
     UNUSED(ptr);
 
+    worker_register("CGROUPSDISC");
+    worker_register_job_name(WORKER_DISCOVERY_INIT, "init");
+    worker_register_job_name(WORKER_DISCOVERY_FIND, "find");
+    worker_register_job_name(WORKER_DISCOVERY_PROCESS, "process");
+    worker_register_job_name(WORKER_DISCOVERY_UPDATE, "update");
+    worker_register_job_name(WORKER_DISCOVERY_CLEANUP, "cleanup");
+    worker_register_job_name(WORKER_DISCOVERY_COPY, "copy");
+    worker_register_job_name(WORKER_DISCOVERY_SHARE, "share");
+    worker_register_job_name(WORKER_DISCOVERY_LOCK, "lock");
+
     while (!netdata_exit) {
+        worker_is_idle();
+
         uv_mutex_lock(&discovery_thread.mutex);
         while (!discovery_thread.start_discovery)
             uv_cond_wait(&discovery_thread.cond_var, &discovery_thread.mutex);
@@ -2692,6 +2728,7 @@ void cgroup_discovery_worker(void *ptr)
     }
 
     discovery_thread.exited = 1;
+    worker_unregister();
 } 
 
 // ----------------------------------------------------------------------------
@@ -4650,6 +4687,8 @@ void update_cgroup_charts(int update_every) {
 // cgroups main
 
 static void cgroup_main_cleanup(void *ptr) {
+    worker_unregister();
+
     struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
     static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
 
@@ -4687,24 +4726,30 @@ static void cgroup_main_cleanup(void *ptr) {
     static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
 }
 
+#define WORKER_CGROUPS_LOCK 0
+#define WORKER_CGROUPS_READ 1
+#define WORKER_CGROUPS_CHART 2
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 3
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 3
+#endif
+
 void *cgroups_main(void *ptr) {
-    netdata_thread_cleanup_push(cgroup_main_cleanup, ptr);
+    worker_register("CGROUPS"); // one job name per WORKER_CGROUPS_* id used in the main loop below
+    worker_register_job_name(WORKER_CGROUPS_LOCK, "lock");
+    worker_register_job_name(WORKER_CGROUPS_READ, "read");
+    worker_register_job_name(WORKER_CGROUPS_CHART, "chart"); // was WORKER_CGROUPS_READ: "chart" must be bound to the CHART job id
 
-    struct rusage thread;
+    netdata_thread_cleanup_push(cgroup_main_cleanup, ptr);
 
     if (getenv("KUBERNETES_SERVICE_HOST") != NULL && getenv("KUBERNETES_SERVICE_PORT") != NULL) {
         is_inside_k8s = 1;
         cgroup_enable_cpuacct_cpu_shares = CONFIG_BOOLEAN_YES;
     }
 
-    // when ZERO, attempt to do it
-    int vdo_cpu_netdata = config_get_boolean("plugin:cgroups", "cgroups plugin resource charts", 1);
-
     read_cgroup_plugin_configuration();
     netdata_cgroup_ebpf_initialize_shm();
 
-    RRDSET *stcpu_thread = NULL;
-
     if (uv_mutex_init(&cgroup_root_mutex)) {
         error("CGROUP: cannot initialize mutex for the main cgroup list");
         goto exit;
@@ -4736,6 +4781,8 @@ void *cgroups_main(void *ptr) {
     usec_t find_every = cgroup_check_for_new_every * USEC_PER_SEC, find_dt = 0;
 
     while(!netdata_exit) {
+        worker_is_idle();
+
         usec_t hb_dt = heartbeat_next(&hb, step);
         if(unlikely(netdata_exit)) break;
 
@@ -4747,46 +4794,21 @@ void *cgroups_main(void *ptr) {
             cgroups_check = 0;
         }
 
+        worker_is_busy(WORKER_CGROUPS_LOCK);
         uv_mutex_lock(&cgroup_root_mutex);
-        read_all_discovered_cgroups(cgroup_root);
-        update_cgroup_charts(cgroup_update_every);
-        uv_mutex_unlock(&cgroup_root_mutex);
-
-        // --------------------------------------------------------------------
-
-        if(vdo_cpu_netdata) {
-            getrusage(RUSAGE_THREAD, &thread);
 
-            if(unlikely(!stcpu_thread)) {
-
-                stcpu_thread = rrdset_create_localhost(
-                        "netdata"
-                        , "plugin_cgroups_cpu"
-                        , NULL
-                        , "cgroups"
-                        , NULL
-                        , "Netdata CGroups Plugin CPU usage"
-                        , "milliseconds/s"
-                        , PLUGIN_CGROUPS_NAME
-                        , "stats"
-                        , 132000
-                        , cgroup_update_every
-                        , RRDSET_TYPE_STACKED
-                );
+        worker_is_busy(WORKER_CGROUPS_READ);
+        read_all_discovered_cgroups(cgroup_root);
 
-                rrddim_add(stcpu_thread, "user",  NULL,  1, 1000, RRD_ALGORITHM_INCREMENTAL);
-                rrddim_add(stcpu_thread, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
-            }
-            else
-                rrdset_next(stcpu_thread);
+        worker_is_busy(WORKER_CGROUPS_CHART);
+        update_cgroup_charts(cgroup_update_every);
 
-            rrddim_set(stcpu_thread, "user"  , thread.ru_utime.tv_sec * 1000000ULL + thread.ru_utime.tv_usec);
-            rrddim_set(stcpu_thread, "system", thread.ru_stime.tv_sec * 1000000ULL + thread.ru_stime.tv_usec);
-            rrdset_done(stcpu_thread);
-        }
+        worker_is_idle();
+        uv_mutex_unlock(&cgroup_root_mutex);
     }
 
 exit:
+    worker_unregister();
     netdata_thread_cleanup_pop(1);
     return NULL;
 }
author	Costa Tsaousis <costa@netdata.cloud>	2022-05-09 16:34:31 +0300
committer	GitHub <noreply@github.com>	2022-05-09 16:34:31 +0300
commit	eb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch)
tree	353938a0f71da7b04d4f9b67769d2a38ba6db2cb /collectors/cgroups.plugin
parent	0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff)