summaryrefslogtreecommitdiffstats
path: root/daemon
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-05-09 16:34:31 +0300
committerGitHub <noreply@github.com>2022-05-09 16:34:31 +0300
commiteb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch)
tree353938a0f71da7b04d4f9b67769d2a38ba6db2cb /daemon
parent0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff)
Workers utilization charts (#12807)
* initial version of worker utilization * working example * without mutexes * monitoring DBENGINE, ACLKSYNC, WEB workers * added charts to monitor worker usage * fixed charts units * updated contexts * updated priorities * added documentation * converted threads to stacked chart * One query per query thread * Revert "One query per query thread" This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3. * fixed priority for web charts * read worker cpu utilization from proc * read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency * disabled web server cpu utilization monitoring - it is now monitored by worker utilization * tight integration of worker utilization to web server * monitoring statsd worker threads * code cleanup and renaming of variables * contrained worker and statistics conflict to just one variable * support for rendering jobs per type * better priorities and removed the total jobs chart * added busy time in ms per job type * added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads * isolated worker thread families * added cgroups.plugin workers * remove unneeded dimensions when then expected worker is just one * plugins.d and streaming monitoring * rebased; support worker_is_busy() to be called one after another * added diskspace plugin monitoring * added tc.plugin monitoring * added ML threads monitoring * dont create dimensions and charts that are not needed * fix crash when job types are added on the fly * added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to the POSIX * the right name is heartbeat for this chart * monitor streaming senders * added streaming senders to global stats * prevent division by zero * added clock_init() to external C plugins * added freebsd and macos plugins * added freebsd and macos to global statistics * dont use new as a variable; address compiler warnings on FreeBSD and MacOS * refactored contexts to be unique; added health threads monitoring Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'daemon')
-rw-r--r--daemon/global_statistics.c1305
1 files changed, 943 insertions, 362 deletions
diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c
index 5c48ae5253..c4849ed6c3 100644
--- a/daemon/global_statistics.c
+++ b/daemon/global_statistics.c
@@ -6,6 +6,16 @@
#define CONFIG_SECTION_GLOBAL_STATISTICS "global statistics"
+#define WORKER_JOB_GLOBAL 0
+#define WORKER_JOB_REGISTRY 1
+#define WORKER_JOB_WORKERS 2
+#define WORKER_JOB_DBENGINE 3
+#define WORKER_JOB_HEARTBEAT 4
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5
+#endif
+
static struct global_statistics {
volatile uint16_t connected_clients;
@@ -436,435 +446,993 @@ static void global_statistics_charts(void) {
}
// ----------------------------------------------------------------
+}
+static void dbengine_statistics_charts(void) {
#ifdef ENABLE_DBENGINE
- RRDHOST *host;
- unsigned long long stats_array[RRDENG_NR_STATS] = {0};
- unsigned long long local_stats_array[RRDENG_NR_STATS];
- unsigned dbengine_contexts = 0, counted_multihost_db = 0, i;
-
- rrd_rdlock();
- rrdhost_foreach_read(host) {
- if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) {
- if (&multidb_ctx == host->rrdeng_ctx) {
- if (counted_multihost_db)
- continue; /* Only count multi-host DB once */
- counted_multihost_db = 1;
- }
- ++dbengine_contexts;
- /* get localhost's DB engine's statistics */
- rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array);
- for (i = 0 ; i < RRDENG_NR_STATS ; ++i) {
- /* aggregate statistics across hosts */
- stats_array[i] += local_stats_array[i];
+ if(netdata_rwlock_tryrdlock(&rrd_rwlock) == 0) {
+ RRDHOST *host;
+ unsigned long long stats_array[RRDENG_NR_STATS] = {0};
+ unsigned long long local_stats_array[RRDENG_NR_STATS];
+ unsigned dbengine_contexts = 0, counted_multihost_db = 0, i;
+
+ rrdhost_foreach_read(host) {
+ if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && !rrdhost_flag_check(host, RRDHOST_FLAG_ARCHIVED)) {
+ if (&multidb_ctx == host->rrdeng_ctx) {
+ if (counted_multihost_db)
+ continue; /* Only count multi-host DB once */
+ counted_multihost_db = 1;
+ }
+ ++dbengine_contexts;
+ /* get localhost's DB engine's statistics */
+ rrdeng_get_37_statistics(host->rrdeng_ctx, local_stats_array);
+ for (i = 0; i < RRDENG_NR_STATS; ++i) {
+ /* aggregate statistics across hosts */
+ stats_array[i] += local_stats_array[i];
+ }
}
}
- }
- rrd_unlock();
-
- if (dbengine_contexts) {
- /* deduplicate global statistics by getting the ones from the last context */
- stats_array[30] = local_stats_array[30];
- stats_array[31] = local_stats_array[31];
- stats_array[32] = local_stats_array[32];
- stats_array[34] = local_stats_array[34];
- stats_array[36] = local_stats_array[36];
-
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_compression = NULL;
- static RRDDIM *rd_savings = NULL;
-
- if (unlikely(!st_compression)) {
- st_compression = rrdset_create_localhost(
- "netdata"
- , "dbengine_compression_ratio"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine data extents' compression savings ratio"
- , "percentage"
- , "netdata"
- , "stats"
- , 130502
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ rrd_unlock();
+
+ if (dbengine_contexts) {
+ /* deduplicate global statistics by getting the ones from the last context */
+ stats_array[30] = local_stats_array[30];
+ stats_array[31] = local_stats_array[31];
+ stats_array[32] = local_stats_array[32];
+ stats_array[34] = local_stats_array[34];
+ stats_array[36] = local_stats_array[36];
+
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_compression = NULL;
+ static RRDDIM *rd_savings = NULL;
+
+ if (unlikely(!st_compression)) {
+ st_compression = rrdset_create_localhost(
+ "netdata",
+ "dbengine_compression_ratio",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine data extents' compression savings ratio",
+ "percentage",
+ "netdata",
+ "stats",
+ 130502,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_savings = rrddim_add(st_compression, "savings", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_compression);
+
+ unsigned long long ratio;
+ unsigned long long compressed_content_size = stats_array[12];
+ unsigned long long content_size = stats_array[11];
+
+ if (content_size) {
+ // allow negative savings
+ ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size;
+ } else {
+ ratio = 0;
+ }
+ rrddim_set_by_pointer(st_compression, rd_savings, ratio);
+
+ rrdset_done(st_compression);
}
- else
- rrdset_next(st_compression);
-
- unsigned long long ratio;
- unsigned long long compressed_content_size = stats_array[12];
- unsigned long long content_size = stats_array[11];
-
- if (content_size) {
- // allow negative savings
- ratio = ((content_size - compressed_content_size) * 100 * 1000) / content_size;
- } else {
- ratio = 0;
- }
- rrddim_set_by_pointer(st_compression, rd_savings, ratio);
- rrdset_done(st_compression);
- }
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_pg_cache_hit_ratio = NULL;
+ static RRDDIM *rd_hit_ratio = NULL;
+
+ if (unlikely(!st_pg_cache_hit_ratio)) {
+ st_pg_cache_hit_ratio = rrdset_create_localhost(
+ "netdata",
+ "page_cache_hit_ratio",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine page cache hit ratio",
+ "percentage",
+ "netdata",
+ "stats",
+ 130503,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_pg_cache_hit_ratio);
+
+ static unsigned long long old_hits = 0;
+ static unsigned long long old_misses = 0;
+ unsigned long long hits = stats_array[7];
+ unsigned long long misses = stats_array[8];
+ unsigned long long hits_delta;
+ unsigned long long misses_delta;
+ unsigned long long ratio;
+
+ hits_delta = hits - old_hits;
+ misses_delta = misses - old_misses;
+ old_hits = hits;
+ old_misses = misses;
+
+ if (hits_delta + misses_delta) {
+ ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta);
+ } else {
+ ratio = 0;
+ }
+ rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio);
+
+ rrdset_done(st_pg_cache_hit_ratio);
+ }
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_pg_cache_hit_ratio = NULL;
- static RRDDIM *rd_hit_ratio = NULL;
-
- if (unlikely(!st_pg_cache_hit_ratio)) {
- st_pg_cache_hit_ratio = rrdset_create_localhost(
- "netdata"
- , "page_cache_hit_ratio"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata DB engine page cache hit ratio"
- , "percentage"
- , "netdata"
- , "stats"
- , 130503
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_hit_ratio = rrddim_add(st_pg_cache_hit_ratio, "ratio", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_pg_cache_pages = NULL;
+ static RRDDIM *rd_descriptors = NULL;
+ static RRDDIM *rd_populated = NULL;
+ static RRDDIM *rd_dirty = NULL;
+ static RRDDIM *rd_backfills = NULL;
+ static RRDDIM *rd_evictions = NULL;
+ static RRDDIM *rd_used_by_collectors = NULL;
+
+ if (unlikely(!st_pg_cache_pages)) {
+ st_pg_cache_pages = rrdset_create_localhost(
+ "netdata",
+ "page_cache_stats",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata dbengine page cache statistics",
+ "pages",
+ "netdata",
+ "stats",
+ 130504,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_used_by_collectors =
+ rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_pg_cache_pages);
+
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]);
+ rrdset_done(st_pg_cache_pages);
}
- else
- rrdset_next(st_pg_cache_hit_ratio);
-
- static unsigned long long old_hits = 0;
- static unsigned long long old_misses = 0;
- unsigned long long hits = stats_array[7];
- unsigned long long misses = stats_array[8];
- unsigned long long hits_delta;
- unsigned long long misses_delta;
- unsigned long long ratio;
-
- hits_delta = hits - old_hits;
- misses_delta = misses - old_misses;
- old_hits = hits;
- old_misses = misses;
-
- if (hits_delta + misses_delta) {
- ratio = (hits_delta * 100 * 1000) / (hits_delta + misses_delta);
- } else {
- ratio = 0;
+
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_long_term_pages = NULL;
+ static RRDDIM *rd_total = NULL;
+ static RRDDIM *rd_insertions = NULL;
+ static RRDDIM *rd_deletions = NULL;
+ static RRDDIM *rd_flushing_pressure_deletions = NULL;
+
+ if (unlikely(!st_long_term_pages)) {
+ st_long_term_pages = rrdset_create_localhost(
+ "netdata",
+ "dbengine_long_term_page_stats",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata dbengine long-term page statistics",
+ "pages",
+ "netdata",
+ "stats",
+ 130505,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_flushing_pressure_deletions = rrddim_add(
+ st_long_term_pages, "flushing_pressure_deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_long_term_pages);
+
+ rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]);
+ rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]);
+ rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]);
+ rrddim_set_by_pointer(
+ st_long_term_pages, rd_flushing_pressure_deletions, (collected_number)stats_array[36]);
+ rrdset_done(st_long_term_pages);
}
- rrddim_set_by_pointer(st_pg_cache_hit_ratio, rd_hit_ratio, ratio);
- rrdset_done(st_pg_cache_hit_ratio);
- }
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_io_stats = NULL;
+ static RRDDIM *rd_reads = NULL;
+ static RRDDIM *rd_writes = NULL;
+
+ if (unlikely(!st_io_stats)) {
+ st_io_stats = rrdset_create_localhost(
+ "netdata",
+ "dbengine_io_throughput",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine I/O throughput",
+ "MiB/s",
+ "netdata",
+ "stats",
+ 130506,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
+ rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1024 * 1024, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_io_stats);
+
+ rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[17]);
+ rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[15]);
+ rrdset_done(st_io_stats);
+ }
- // ----------------------------------------------------------------
-
- {
- static RRDSET *st_pg_cache_pages = NULL;
- static RRDDIM *rd_descriptors = NULL;
- static RRDDIM *rd_populated = NULL;
- static RRDDIM *rd_dirty = NULL;
- static RRDDIM *rd_backfills = NULL;
- static RRDDIM *rd_evictions = NULL;
- static RRDDIM *rd_used_by_collectors = NULL;
-
- if (unlikely(!st_pg_cache_pages)) {
- st_pg_cache_pages = rrdset_create_localhost(
- "netdata"
- , "page_cache_stats"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata dbengine page cache statistics"
- , "pages"
- , "netdata"
- , "stats"
- , 130504
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_dirty = rrddim_add(st_pg_cache_pages, "dirty", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_evictions = rrddim_add(st_pg_cache_pages, "evictions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_used_by_collectors = rrddim_add(st_pg_cache_pages, "used_by_collectors", NULL, 1, 1,
- RRD_ALGORITHM_ABSOLUTE);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_io_stats = NULL;
+ static RRDDIM *rd_reads = NULL;
+ static RRDDIM *rd_writes = NULL;
+
+ if (unlikely(!st_io_stats)) {
+ st_io_stats = rrdset_create_localhost(
+ "netdata",
+ "dbengine_io_operations",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine I/O operations",
+ "operations/s",
+ "netdata",
+ "stats",
+ 130507,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_reads = rrddim_add(st_io_stats, "reads", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_writes = rrddim_add(st_io_stats, "writes", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_io_stats);
+
+ rrddim_set_by_pointer(st_io_stats, rd_reads, (collected_number)stats_array[18]);
+ rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]);
+ rrdset_done(st_io_stats);
}
- else
- rrdset_next(st_pg_cache_pages);
-
- rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_dirty, (collected_number)stats_array[0] + stats_array[4]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_evictions, (collected_number)stats_array[10]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_used_by_collectors, (collected_number)stats_array[0]);
- rrdset_done(st_pg_cache_pages);
- }
- // ----------------------------------------------------------------
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_errors = NULL;
+ static RRDDIM *rd_fs_errors = NULL;
+ static RRDDIM *rd_io_errors = NULL;
+ static RRDDIM *pg_cache_over_half_dirty_events = NULL;
+
+ if (unlikely(!st_errors)) {
+ st_errors = rrdset_create_localhost(
+ "netdata",
+ "dbengine_global_errors",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine errors",
+ "errors/s",
+ "netdata",
+ "stats",
+ 130508,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_io_errors = rrddim_add(st_errors, "io_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ rd_fs_errors = rrddim_add(st_errors, "fs_errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ pg_cache_over_half_dirty_events =
+ rrddim_add(st_errors, "pg_cache_over_half_dirty_events", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ } else
+ rrdset_next(st_errors);
+
+ rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]);
+ rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]);
+ rrddim_set_by_pointer(st_errors, pg_cache_over_half_dirty_events, (collected_number)stats_array[34]);
+ rrdset_done(st_errors);
+ }
- {
- static RRDSET *st_long_term_pages = NULL;
- static RRDDIM *rd_total = NULL;
- static RRDDIM *rd_insertions = NULL;
- static RRDDIM *rd_deletions = NULL;
- static RRDDIM *rd_flushing_pressure_deletions = NULL;
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_fd = NULL;
+ static RRDDIM *rd_fd_current = NULL;
+ static RRDDIM *rd_fd_max = NULL;
+
+ if (unlikely(!st_fd)) {
+ st_fd = rrdset_create_localhost(
+ "netdata",
+ "dbengine_global_file_descriptors",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine File Descriptors",
+ "descriptors",
+ "netdata",
+ "stats",
+ 130509,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_LINE);
+
+ rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_fd_max = rrddim_add(st_fd, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_fd);
+
+ rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]);
+ /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */
+ rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4);
+ rrdset_done(st_fd);
+ }
- if (unlikely(!st_long_term_pages)) {
- st_long_term_pages = rrdset_create_localhost(
- "netdata"
- , "dbengine_long_term_page_stats"
- , NULL
- , "dbengine"
- , NULL
- , "Netdata dbengine long-term page statistics"
- , "pages"
- , "netdata"
- , "stats"
- , 130505
- , localhost->rrd_update_every
- , RRDSET_TYPE_LINE
- );
-
- rd_total = rrddim_add(st_long_term_pages, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_insertions = rrddim_add(st_long_term_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_deletions = rrddim_add(st_long_term_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
- rd_flushing_pressure_deletions = rrddim_add(st_long_term_pages, "flushing_pressure_deletions", NULL, -1,
- 1, RRD_ALGORITHM_INCREMENTAL);
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_ram_usage = NULL;
+ static RRDDIM *rd_cached = NULL;
+ static RRDDIM *rd_pinned = NULL;
+ static RRDDIM *rd_metadata = NULL;
+
+ collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk,
+ page_cache_descriptors;
+
+ if (unlikely(!st_ram_usage)) {
+ st_ram_usage = rrdset_create_localhost(
+ "netdata",
+ "dbengine_ram",
+ NULL,
+ "dbengine",
+ NULL,
+ "Netdata DB engine RAM usage",
+ "MiB",
+ "netdata",
+ "stats",
+ 130510,
+ localhost->rrd_update_every,
+ RRDSET_TYPE_STACKED);
+
+ rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
+ rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
+ rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE);
+ } else
+ rrdset_next(st_ram_usage);
+
+ API_producers = (collected_number)stats_array[0];
+ pages_on_disk = (collected_number)stats_array[2];
+ populated_pages = (collected_number)stats_array[3];
+ page_cache_descriptors = (collected_number)stats_array[27];
+
+ if (API_producers * 2 > populated_pages) {
+ pinned_pages = API_producers;
+ } else {
+ pinned_pages = API_producers * 2;
+ }
+ cached_pages = populated_pages - pinned_pages;
+
+ metadata = page_cache_descriptors * sizeof(struct page_cache_descr);
+ metadata += pages_on_disk * sizeof(struct rrdeng_page_descr);
+ /* This is an empirical estimation for Judy array indexing and extent structures */
+ metadata += pages_on_disk * 58;
+
+ rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata);
+ rrdset_done(st_ram_usage);
}
- else
- rrdset_next(st_long_term_pages);
-
- rrddim_set_by_pointer(st_long_term_pages, rd_total, (collected_number)stats_array[2]);
- rrddim_set_by_pointer(st_long_term_pages, rd_insertions, (collected_number)stats_array[5]);
- rrddim_set_by_pointer(st_long_term_pages, rd_deletions, (collected_number)stats_array[6]);
- rrddim_set_by_pointer(st_long_term_pages, rd_flushing_pressure_deletions,
- (collected_number)stats_array[36]);
- rrdset_done(st_long_term_pages);
}
+ }
+#endif
+}
+
+static void update_heartbeat_charts() {
+ RRDSET *st = rrdset_create_localhost(
+ "netdata"
+ , "heartbeat"
+ , NULL
+ , "heartbeat"
+ , NULL
+ , "System clock jitter"
+ , "microseconds"
+ , "netdata"
+ , "stats"
+ , 900000
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_AREA
+ );
+
+ RRDDIM *rd_min = rrddim_add(st, "min", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ RRDDIM *rd_max = rrddim_add(st, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ RRDDIM *rd_avg = rrddim_add(st, "average", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+
+ rrdset_next(st);
+
+ usec_t min, max, average;
+ size_t count;
+
+ heartbeat_statistics(&min, &max, &average, &count);
+
+ rrddim_set_by_pointer(st, rd_min, (collected_number)min);
+ rrddim_set_by_pointer(st, rd_max, (collected_number)max);
+ rrddim_set_by_pointer(st, rd_avg, (collected_number)average);
+
+ rrdset_done(st);
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+// worker utilization
+
+struct worker_job_type {
+ char name[WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH + 1];
+ size_t jobs_started;
+ usec_t busy_time;
+
+ RRDDIM *rd_jobs_started;
+ RRDDIM *rd_busy_time;
+};
+
+struct worker_thread {
+ pid_t pid;
+ int enabled;
+
+ int cpu_enabled;
+
+ kernel_uint_t utime;
+ kernel_uint_t stime;
+
+ kernel_uint_t utime_old;
+ kernel_uint_t stime_old;
+
+ usec_t collected_time;
+ usec_t collected_time_old;
+
+ size_t jobs_started;
+ usec_t busy_time;
+
+ struct worker_thread *next;
+};
+
+struct worker_utilization {
+ const char *name;
+ const char *family;
+ size_t priority;
+ uint32_t flags;
+
+ char *name_lowercase;
+
+ struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
+
+ size_t workers_registered;
+ size_t workers_busy;
+ usec_t workers_total_busy_time;
+ usec_t workers_total_duration;
+ size_t workers_total_jobs_started;
+ double workers_min_busy_time;
+ double workers_max_busy_time;
- // ----------------------------------------------------------------
+ struct worker_thread *threads;
- {
- static RRDSET *st_io_stats = NULL;
- static RRDDIM *rd_reads = NULL;
- static RRDDIM *rd_writes = NULL;
+ RRDSET *st_workers_time;
+ RRDDIM *rd_workers_time_avg;
+ RRDDIM *rd_workers_time_min;
+ RRDDIM *rd_workers_time_max;
- if (unlikely(!st_io_stats)) {
- st_io_stats = rrdset_create_localhost(
+ size_t workers_cpu_enabled;
+ RRDSET *st_workers_cpu;
+ RRDDIM *rd_workers_cpu_avg;
+ RRDDIM *rd_workers_cpu_min;
+ RRDDIM *rd_workers_cpu_max;
+
+ RRDSET *st_workers_threads;
+ RRDDIM *rd_workers_threads_free;
+ RRDDIM *rd_workers_threads_busy;
+
+ RRDSET *st_workers_jobs_per_job_type;
+ RRDSET *st_workers_busy_per_job_type;
+};
+
+static void workers_utilization_update_chart(struct worker_utilization *wu) {
+ if(!wu->workers_registered) return;
+
+ //fprintf(stderr, "%-12s WORKER UTILIZATION: %-3.2f%%, %zu jobs done, %zu running, on %zu workers, min %-3.02f%%, max %-3.02f%%.\n",
+ // wu->name,
+ // (double)wu->workers_total_busy_time * 100.0 / (double)wu->workers_total_duration,
+ // wu->workers_total_jobs_started, wu->workers_busy, wu->workers_registered,
+ // wu->workers_min_busy_time, wu->workers_max_busy_time);
+
+ // ----------------------------------------------------------------------
+
+ if(unlikely(!wu->st_workers_time)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_time_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.time", wu->name_lowercase);
+
+ wu->st_workers_time = rrdset_create_localhost(
+ "netdata"
+ , name
+ , NULL
+ , wu->family
+ , context
+ , "Netdata Workers Busy Time (100% = all workers busy)"
+ , "%"
+ , "netdata"
+ , "stats"
+ , wu->priority
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_AREA
+ );
+ }
+
+ // we add the min and max dimensions only when we have multiple workers
+
+ if(unlikely(!wu->rd_workers_time_min && wu->workers_registered > 1))
+ wu->rd_workers_time_min = rrddim_add(wu->st_workers_time, "min", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ if(unlikely(!wu->rd_workers_time_max && wu->workers_registered > 1))
+ wu->rd_workers_time_max = rrddim_add(wu->st_workers_time, "max", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ if(unlikely(!wu->rd_workers_time_avg))
+ wu->rd_workers_time_avg = rrddim_add(wu->st_workers_time, "average", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
+
+ rrdset_next(wu->st_workers_time);
+
+ if(wu->rd_workers_time_min)
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_min, (collected_number)((double)wu->workers_min_busy_time * 10000.0));
+
+ if(wu->rd_workers_time_max)
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_max, (collected_number)((double)wu->workers_max_busy_time * 10000.0));
+
+ rrddim_set_by_pointer(wu->st_workers_time, wu->rd_workers_time_avg, (collected_number)((double)wu->workers_total_busy_time * 100.0 * 10000.0 / (double)wu->workers_total_duration));
+ rrdset_done(wu->st_workers_time);
+
+ // ----------------------------------------------------------------------
+
+#ifdef __linux__
+ if(wu->workers_cpu_enabled || wu->st_workers_cpu) {
+ if(unlikely(!wu->st_workers_cpu)) {
+ char name[RRD_ID_LENGTH_MAX + 1];
+ snprintfz(name, RRD_ID_LENGTH_MAX, "workers_cpu_%s", wu->name_lowercase);
+
+ char context[RRD_ID_LENGTH_MAX + 1];
+ snprintf(context, RRD_ID_LENGTH_MAX, "netdata.workers.%s.cpu", wu->name_lowercase);
+
+ wu->st_workers_cpu = rrdset_create_localhost(
"netdata"
- , "dbengine_io_throughput"
-