summaryrefslogtreecommitdiffstats
path: root/libnetdata
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-05-10 14:53:07 +0300
committerGitHub <noreply@github.com>2022-05-10 14:53:07 +0300
commit33e20ac4909596dae0ae718ba527e19d4a30ed70 (patch)
tree2a1be524b09607c8483ac6dd43dd65b369418468 /libnetdata
parentbd504c6c57ad88f376cd392c711f02cd8b58a16e (diff)
workers fixes and improvements (#12863)
Diffstat (limited to 'libnetdata')
-rw-r--r--libnetdata/clocks/clocks.c3
-rw-r--r--libnetdata/worker_utilization/README.md62
-rw-r--r--libnetdata/worker_utilization/worker_utilization.c40
3 files changed, 72 insertions, 33 deletions
diff --git a/libnetdata/clocks/clocks.c b/libnetdata/clocks/clocks.c
index 85f4eff418..9e7264d33f 100644
--- a/libnetdata/clocks/clocks.c
+++ b/libnetdata/clocks/clocks.c
@@ -259,7 +259,8 @@ void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr,
count++;
}
}
- average = total / count;
+ if(count)
+ average = total / count;
if(min_ptr) *min_ptr = min;
if(max_ptr) *max_ptr = max;
diff --git a/libnetdata/worker_utilization/README.md b/libnetdata/worker_utilization/README.md
index 85e532ed10..35f30b40b4 100644
--- a/libnetdata/worker_utilization/README.md
+++ b/libnetdata/worker_utilization/README.md
@@ -5,8 +5,16 @@ custom_edit_url: https://github.com/netdata/netdata/edit/master/libnetdata/onewa
# Worker Utilization
-This library is to be used when there are 1 or more worker threads accepting requests of some kind and servicing them.
-The goal is to provide a very simple way to monitor worker threads utilization, as a percentage of the time they are busy and the amount of requests served.
+This library is to be used when there are 1 or more worker threads accepting requests
+of some kind and servicing them. The goal is to provide a very simple way to monitor
+worker threads utilization, as a percentage of the time they are busy and the amount
+of requests served.
+
+## Design goals
+
+1. Minimal, if any, impact on the performance of the workers
+2. Easy to integrate into any kind of worker
+3. No state of any kind at the worker side
## How to use
@@ -19,40 +27,64 @@ void worker_register(const char *name);
This will create the necessary structures for the library to work.
No need to keep a pointer to them. They are allocated as `__thread` variables.
+Then job types need to be defined. A job type is anything a worker does that can be
+counted and whose execution time needs to be reported. The library is fast enough to
+be integrated even into workers that perform hundreds of thousands of actions per second.
+
+Job types are defined like this:
+
+```c
+void worker_register_job_type(size_t id, const char *name);
+```
+
+`id` is a number starting from zero. The library is compiled with a fixed size of 50
+ids (0 to 49). More can be allocated by setting `WORKER_UTILIZATION_MAX_JOB_TYPES` in
+`worker_utilization.h`. `name` can be any string up to 22 characters. This can be
+changed by setting `WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH` in `worker_utilization.h`.
+
+Each thread that calls `worker_register(name)` will allocate about 3kB for maintaining
+the information required.
+
When the thread stops, call:
```c
-void worker_unregister(void)
+void worker_unregister(void);
```
Again, no parameters, or return values.
+> IMPORTANT: cancellable threads need to add a call to `worker_unregister()` to the
+> `pop` function that cleans up the thread. Failure to do so will result in about
+> 3kB of leaked memory for every thread that is stopped.
+
When you are about to do some work in the working thread, call:
```c
-void worker_is_busy(void)
+void worker_is_busy(size_t id);
```
When you finish doing the job, call:
```c
-void worker_is_idle(void)
+void worker_is_idle(void);
```
-Calls to `worker_is_busy()` can be made one after another (without calling
+Calls to `worker_is_busy(id)` can be made one after another (without calling
`worker_is_idle()` between them) to switch jobs without losing any time between
them and eliminating one of the 2 clock calls involved.
## Implementation details
-Totally lockless, extremely fast, it should not introduce any kind of problems to the workers.
-Every time `worker_is_busy()` or `worker_is_idle()` are called, a call to `now_realtime_usec()`
-is done and a couple of variables are updated. That's it!
+Totally lockless, extremely fast, it should not introduce any kind of problems to the
+workers. Every time `worker_is_busy(id)` or `worker_is_idle()` is called, a call to
+`now_realtime_usec()` is made and a couple of variables are updated. That's it!
-The worker does not need to update the variables regularly. Based on the last status of the worker,
-the statistics collector of netdata will calculate if the thread is busy or idle all the time or
-part of the time. Works well for both thousands of jobs per second and unlimited working time
-(being totally busy with a single request for ages).
+The worker does not need to update the variables regularly. Based on the last status
+of the worker, the statistics collector of netdata will calculate if the thread is
+busy or idle all the time or part of the time. Works well for both thousands of jobs
+per second and unlimited working time (being totally busy with a single request for
+ages).
-The statistics collector is called by the global statistics thread of netdata. So, even if the workers
-are extremely busy with their jobs, netdata will be able to know how busy they are.
+The statistics collector is called by the global statistics thread of netdata. So,
+even if the workers are extremely busy with their jobs, netdata will be able to know
+how busy they are.
diff --git a/libnetdata/worker_utilization/worker_utilization.c b/libnetdata/worker_utilization/worker_utilization.c
index 459df2f265..6a86acc914 100644
--- a/libnetdata/worker_utilization/worker_utilization.c
+++ b/libnetdata/worker_utilization/worker_utilization.c
@@ -5,11 +5,14 @@
struct worker_job_type {
char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1];
- size_t worker_jobs_started;
- usec_t worker_busy_time;
- size_t statistics_jobs_started;
- usec_t statistics_busy_time;
+ // statistics controlled variables
+ size_t statistics_last_jobs_started;
+ usec_t statistics_last_busy_time;
+
+ // worker controlled variables
+ volatile size_t worker_jobs_started;
+ volatile usec_t worker_busy_time;
};
struct worker {
@@ -18,13 +21,13 @@ struct worker {
const char *workname;
uint32_t workname_hash;
- // only one variable is set by our statistics callers
- usec_t statistics_last_checkpoint;
+ // statistics controlled variables
+ volatile usec_t statistics_last_checkpoint;
size_t statistics_last_jobs_started;
usec_t statistics_last_busy_time;
// the worker controlled variables
- size_t job_id;
+ volatile size_t job_id;
volatile size_t jobs_started;
volatile usec_t busy_time;
volatile usec_t last_action_timestamp;
@@ -154,22 +157,26 @@ void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pi
per_job_type_name[i] = p->per_job_type[i].name;
size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started;
- per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_jobs_started;
- p->per_job_type[i].statistics_jobs_started = tmp_jobs_started;
+ per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_last_jobs_started;
+ p->per_job_type[i].statistics_last_jobs_started = tmp_jobs_started;
usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time;
- per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_busy_time;
- p->per_job_type[i].statistics_busy_time = tmp_busy_time;
+ per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_last_busy_time;
+ p->per_job_type[i].statistics_last_busy_time = tmp_busy_time;
}
// get a copy of the worker variables
+ size_t worker_job_id = p->job_id;
usec_t worker_busy_time = p->busy_time;
size_t worker_jobs_started = p->jobs_started;
char worker_last_action = p->last_action;
usec_t worker_last_action_timestamp = p->last_action_timestamp;
+ delta = now - p->statistics_last_checkpoint;
+ p->statistics_last_checkpoint = now;
+
// this is the only variable both the worker thread and the statistics thread are writing
- // we set this only when the worker is busy, so that worker will not
+ // we set this only when the worker is busy, so that the worker will not
// accumulate all the busy time, but only the time after the point we collected statistics
if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY)
p->last_action_timestamp = now;
@@ -186,14 +193,13 @@ void workers_foreach(const char *workname, void (*callback)(void *data, pid_t pi
if(worker_last_action == WORKER_BUSY) {
// the worker is still busy with something
// let's add that busy time to the reported one
- busy_time += now - worker_last_action_timestamp;
+ usec_t dt = now - worker_last_action_timestamp;
+ busy_time += dt;
+ per_job_type_busy_time[worker_job_id] += dt;
+ p->per_job_type[worker_job_id].statistics_last_busy_time += dt;
jobs_running = 1;
}
- delta = now - p->statistics_last_checkpoint;
-
- p->statistics_last_checkpoint = now;
-
callback(data, p->pid, p->tag, busy_time, delta, jobs_started, jobs_running, per_job_type_name, per_job_type_jobs_started, per_job_type_busy_time);
}