summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVladimir Kobal <vlad@prokk.net>2019-08-29 20:35:05 +0300
committerPaul Emm. Katsoulakis <34388743+paulkatsoulakis@users.noreply.github.com>2019-08-29 20:35:05 +0300
commitc79112e85317d80f46acb26296e47a7da8d9d8b0 (patch)
tree7c65b3df8a78e0ee0ac38bea4746fe6aa2bc7dff
parent1907c1486fba9ecb534058d1d3b7573777b67b82 (diff)
Display uptime for processes (#6654)
* Get process uptime * Calculate target uptime * Update charts * Show collected data * Fix chart names * Update the documentation * Fix a flag value * Add an explanation note for the 'carried over uptime' chart * Move the functions for getting uptime to libnetdata * Rename the function for getting uptime * Remove redundant code * Fix starttime calculation * More accurate definition for the carried over uptime * fix group starttime calculation * Fix typo
-rw-r--r--collectors/apps.plugin/README.md5
-rw-r--r--collectors/apps.plugin/apps_plugin.c106
-rw-r--r--collectors/proc.plugin/proc_uptime.c71
-rw-r--r--libnetdata/clocks/clocks.c65
-rw-r--r--libnetdata/clocks/clocks.h2
-rw-r--r--web/gui/dashboard_info.js14
6 files changed, 195 insertions, 68 deletions
diff --git a/collectors/apps.plugin/README.md b/collectors/apps.plugin/README.md
index 308fe657fa..967ebb2d8e 100644
--- a/collectors/apps.plugin/README.md
+++ b/collectors/apps.plugin/README.md
@@ -47,6 +47,11 @@ Each of these sections provides the same number of charts:
- Threads Running
- Processes Running
- Pipes Open
+ - Carried Over Uptime (since the Netdata restart)
+ - Minimum Uptime
+ - Average Uptime
+ - Maximum Uptime
+
- Swap Memory
- Swap Memory Used
- Major Page Faults (i.e. swap activity)
diff --git a/collectors/apps.plugin/apps_plugin.c b/collectors/apps.plugin/apps_plugin.c
index a757a5bddf..47f5d6ede7 100644
--- a/collectors/apps.plugin/apps_plugin.c
+++ b/collectors/apps.plugin/apps_plugin.c
@@ -260,6 +260,12 @@ struct target {
kernel_uint_t openeventpolls;
kernel_uint_t openother;
+ kernel_uint_t starttime;
+ kernel_uint_t collected_starttime;
+ kernel_uint_t uptime_min;
+ kernel_uint_t uptime_sum;
+ kernel_uint_t uptime_max;
+
unsigned int processes; // how many processes have been merged to this
int exposed; // if set, we have sent this to netdata
int hidden; // if set, we set the hidden flag on the dimension
@@ -345,7 +351,7 @@ struct pid_stat {
// int64_t nice;
int32_t num_threads;
// int64_t itrealvalue;
- // kernel_uint_t starttime;
+ kernel_uint_t collected_starttime;
// kernel_uint_t vsize;
// kernel_uint_t rss;
// kernel_uint_t rsslim;
@@ -419,6 +425,8 @@ struct pid_stat {
usec_t io_collected_usec;
usec_t last_io_collected_usec;
+ kernel_uint_t uptime;
+
char *fds_dirname; // the full directory name in /proc/PID/fd
char *stat_filename;
@@ -433,6 +441,8 @@ struct pid_stat {
size_t pagesize;
+kernel_uint_t global_uptime;
+
// log each problem once per process
// log flood protection flags (log_thrown)
#define PID_LOG_IO 0x00000001
@@ -1421,7 +1431,8 @@ static inline int read_proc_pid_stat(struct pid_stat *p, void *ptr) {
// p->nice = str2kernel_uint_t(procfile_lineword(ff, 0, 18));
p->num_threads = (int32_t)str2uint32_t(procfile_lineword(ff, 0, 19));
// p->itrealvalue = str2kernel_uint_t(procfile_lineword(ff, 0, 20));
- // p->starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21));
+ p->collected_starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21)) / system_hz;
+ p->uptime = (global_uptime > p->collected_starttime)?(global_uptime - p->collected_starttime):0;
// p->vsize = str2kernel_uint_t(procfile_lineword(ff, 0, 22));
// p->rss = str2kernel_uint_t(procfile_lineword(ff, 0, 23));
// p->rsslim = str2kernel_uint_t(procfile_lineword(ff, 0, 24));
@@ -1490,6 +1501,8 @@ cleanup:
return 0;
}
+// ----------------------------------------------------------------------------
+
static inline int read_proc_pid_io(struct pid_stat *p, void *ptr) {
(void)ptr;
#ifdef __FreeBSD__
@@ -2634,6 +2647,12 @@ static int collect_data_for_all_processes(void) {
collect_data_for_pid(pid, &procbase[i]);
}
#else
+ static char uptime_filename[FILENAME_MAX + 1] = "";
+ if(*uptime_filename == '\0')
+ snprintfz(uptime_filename, FILENAME_MAX, "%s/proc/uptime", netdata_configured_host_prefix);
+
+ global_uptime = (kernel_uint_t)(uptime_msec(uptime_filename) / MSEC_PER_SEC);
+
char dirname[FILENAME_MAX + 1];
snprintfz(dirname, FILENAME_MAX, "%s/proc", netdata_configured_host_prefix);
@@ -2879,6 +2898,11 @@ static size_t zero_all_targets(struct target *root) {
w->openother = 0;
}
+ w->collected_starttime = 0;
+ w->uptime_min = 0;
+ w->uptime_sum = 0;
+ w->uptime_max = 0;
+
if(unlikely(w->root_pid)) {
struct pid_on_target *pid_on_target_to_free, *pid_on_target = w->root_pid;
@@ -3032,6 +3056,11 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p,
w->processes++;
w->num_threads += p->num_threads;
+ if(!w->collected_starttime || p->collected_starttime < w->collected_starttime) w->collected_starttime = p->collected_starttime;
+ if(!w->uptime_min || p->uptime < w->uptime_min) w->uptime_min = p->uptime;
+ w->uptime_sum += p->uptime;
+ if(!w->uptime_max || w->uptime_max < p->uptime) w->uptime_max = p->uptime;
+
if(unlikely(debug_enabled || w->debug_enabled)) {
debug_log_int("aggregating '%s' pid %d on target '%s' utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", gtime=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", cgtime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT "", p->comm, p->pid, w->name, p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
@@ -3042,6 +3071,19 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p,
}
}
+// Fold this iteration's collected_starttime into each target's starttime: keep the smallest non-zero value seen, reset to 0 when none was collected.
+static inline void post_aggregate_targets(struct target *root) {
+    struct target *t;
+    for (t = root; t; t = t->next) {
+        if (!t->collected_starttime) {
+            t->starttime = 0; // no start time was collected this iteration
+            continue;
+        }
+        if (!t->starttime || t->collected_starttime < t->starttime)
+            t->starttime = t->collected_starttime;
+    }
+}
+
static void calculate_netdata_statistics(void) {
apply_apps_groups_targets_inheritance();
@@ -3102,6 +3144,10 @@ static void calculate_netdata_statistics(void) {
aggregate_pid_fds_on_targets(p);
}
+ post_aggregate_targets(apps_groups_root_target);
+ post_aggregate_targets(users_root_target);
+ post_aggregate_targets(groups_root_target);
+
cleanup_exited_pids();
}
@@ -3457,6 +3503,36 @@ static void send_collected_data_to_netdata(struct target *root, const char *type
}
send_END();
+#ifndef __FreeBSD__
+ send_BEGIN(type, "uptime", dt);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed && w->processes))
+ send_SET(w->name, (global_uptime > w->starttime)?(global_uptime - w->starttime):0);
+ }
+ send_END();
+
+ send_BEGIN(type, "uptime_min", dt);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed && w->processes))
+ send_SET(w->name, w->uptime_min);
+ }
+ send_END();
+
+ send_BEGIN(type, "uptime_avg", dt);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed && w->processes))
+ send_SET(w->name, w->processes?(w->uptime_sum / w->processes):0);
+ }
+ send_END();
+
+ send_BEGIN(type, "uptime_max", dt);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed && w->processes))
+ send_SET(w->name, w->uptime_max);
+ }
+ send_END();
+#endif
+
send_BEGIN(type, "mem", dt);
for (w = root; w ; w = w->next) {
if(unlikely(w->exposed && w->processes))
@@ -3615,6 +3691,32 @@ static void send_charts_updates_to_netdata(struct target *root, const char *type
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
}
+#ifndef __FreeBSD__
+ fprintf(stdout, "CHART %s.uptime '' '%s Carried Over Uptime' 'seconds' processes %s.uptime line 20008 %d\n", type, title, type, update_every);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed))
+ fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
+ }
+
+ fprintf(stdout, "CHART %s.uptime_min '' '%s Minimum Uptime' 'seconds' processes %s.uptime_min line 20009 %d\n", type, title, type, update_every);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed))
+ fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
+ }
+
+ fprintf(stdout, "CHART %s.uptime_avg '' '%s Average Uptime' 'seconds' processes %s.uptime_avg line 20010 %d\n", type, title, type, update_every);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed))
+ fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
+ }
+
+ fprintf(stdout, "CHART %s.uptime_max '' '%s Maximum Uptime' 'seconds' processes %s.uptime_max line 20011 %d\n", type, title, type, update_every);
+ for (w = root; w ; w = w->next) {
+ if(unlikely(w->exposed))
+ fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
+ }
+#endif
+
fprintf(stdout, "CHART %s.cpu_user '' '%s CPU User Time (%d%% = %d core%s)' 'percentage' cpu %s.cpu_user stacked 20020 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every);
for (w = root; w ; w = w->next) {
if(unlikely(w->exposed))
diff --git a/collectors/proc.plugin/proc_uptime.c b/collectors/proc.plugin/proc_uptime.c
index 142ae2d0c3..28b00e0da1 100644
--- a/collectors/proc.plugin/proc_uptime.c
+++ b/collectors/proc.plugin/proc_uptime.c
@@ -2,76 +2,17 @@
#include "plugin_proc.h"
-static inline collected_number uptime_from_boottime(void) {
-#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
- return now_boottime_usec() / 1000;
-#else
- error("uptime cannot be read from CLOCK_BOOTTIME on this system.");
- return 0;
-#endif
-}
-
-static procfile *read_proc_uptime_ff = NULL;
-static inline collected_number read_proc_uptime(void) {
- if(unlikely(!read_proc_uptime_ff)) {
- char filename[FILENAME_MAX + 1];
- snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime");
-
- read_proc_uptime_ff = procfile_open(config_get("plugin:proc:/proc/uptime", "filename to monitor", filename), " \t", PROCFILE_FLAG_DEFAULT);
- if(unlikely(!read_proc_uptime_ff)) return 0;
- }
-
- read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff);
- if(unlikely(!read_proc_uptime_ff)) return 0;
-
- if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) {
- error("/proc/uptime has no lines.");
- return 0;
- }
- if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) {
- error("/proc/uptime has less than 1 word in it.");
- return 0;
- }
-
- return (collected_number)(strtold(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0);
-}
-
int do_proc_uptime(int update_every, usec_t dt) {
(void)dt;
- static int use_boottime = -1;
-
- if(unlikely(use_boottime == -1)) {
- collected_number uptime_boottime = uptime_from_boottime();
- collected_number uptime_proc = read_proc_uptime();
-
- long long delta = (long long)uptime_boottime - (long long)uptime_proc;
- if(delta < 0) delta = -delta;
+ static char *uptime_filename = NULL;
+ if(!uptime_filename) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime");
- if(delta <= 1000 && uptime_boottime != 0) {
- procfile_close(read_proc_uptime_ff);
- info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta);
- use_boottime = 1;
- }
- else if(uptime_proc != 0) {
- info("Using /proc/uptime for uptime (dt is %lld ms)", delta);
- use_boottime = 0;
- }
- else {
- error("Cannot find any way to read uptime on this system.");
- return 1;
- }
+ uptime_filename = config_get("plugin:proc:/proc/uptime", "filename to monitor", filename);
}
- collected_number uptime;
- if(use_boottime)
- uptime = uptime_from_boottime();
- else
- uptime = read_proc_uptime();
-
-
- // --------------------------------------------------------------------
-
static RRDSET *st = NULL;
static RRDDIM *rd = NULL;
@@ -97,7 +38,7 @@ int do_proc_uptime(int update_every, usec_t dt) {
else
rrdset_next(st);
- rrddim_set_by_pointer(st, rd, uptime);
+ rrddim_set_by_pointer(st, rd, uptime_msec(uptime_filename));
rrdset_done(st);
diff --git a/libnetdata/clocks/clocks.c b/libnetdata/clocks/clocks.c
index f7d2171780..161225a9b6 100644
--- a/libnetdata/clocks/clocks.c
+++ b/libnetdata/clocks/clocks.c
@@ -210,3 +210,68 @@ int sleep_usec(usec_t usec) {
return ret;
#endif
}
+
+static inline collected_number uptime_from_boottime(void) { // uptime taken from the boot-time clock, 0 when that clock is unavailable
+#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
+ return now_boottime_usec() / 1000; // scale down by 1000 (usec -> msec, going by the helper's name)
+#else
+ error("uptime cannot be read from CLOCK_BOOTTIME on this system."); // logged on every call in this configuration
+ return 0; // uptime_msec() treats 0 from this helper as "not usable"
+#endif
+}
+
+static procfile *read_proc_uptime_ff = NULL; // handle kept open between calls; uptime_msec() may close it
+static inline collected_number read_proc_uptime(char *filename) { // first word of the file, times 1000; 0 on failure
+    procfile *ff = read_proc_uptime_ff;
+    // filename is consulted only while no handle is cached
+    if(unlikely(!ff)) ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
+    if(ff) ff = procfile_readall(ff);
+
+    read_proc_uptime_ff = ff;
+    if(unlikely(!ff)) return 0;
+
+    if(unlikely(procfile_lines(ff) < 1)) {
+        error("/proc/uptime has no lines.");
+        return 0;
+    }
+    if(unlikely(procfile_linewords(ff, 0) < 1)) {
+        error("/proc/uptime has less than 1 word in it.");
+        return 0;
+    }
+
+    return (collected_number)(strtold(procfile_lineword(ff, 0, 0), NULL) * 1000.0);
+}
+
+// Returns the system uptime in milliseconds, or 0 when it cannot be determined.
+// On the first call that succeeds, the source is fixed for the rest of the process:
+// CLOCK_BOOTTIME when it is within 1000 ms of the value read from 'filename',
+// otherwise the file itself. While both sources fail, every call retries detection.
+inline collected_number uptime_msec(char *filename){
+    static int use_boottime = -1; // -1: undecided, 1: CLOCK_BOOTTIME, 0: uptime file
+
+    if(unlikely(use_boottime == -1)) {
+        collected_number uptime_boottime = uptime_from_boottime();
+        collected_number uptime_proc = read_proc_uptime(filename);
+
+        long long delta = (long long)uptime_boottime - (long long)uptime_proc;
+        if(delta < 0) delta = -delta;
+
+        // prefer the clock when it agrees with the file to within one second
+        if(delta <= 1000 && uptime_boottime != 0) {
+            procfile_close(read_proc_uptime_ff);
+            read_proc_uptime_ff = NULL; // do not leave a dangling handle behind
+            info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta);
+            use_boottime = 1;
+        }
+        else if(uptime_proc != 0) {
+            info("Using /proc/uptime for uptime (dt is %lld ms)", delta);
+            use_boottime = 0;
+        }
+        else {
+            error("Cannot find any way to read uptime on this system.");
+            return 0; // the result is an uptime value, not a status code: 0 means "unavailable"
+        }
+    }
+
+    return use_boottime ? uptime_from_boottime() : read_proc_uptime(filename);
+}
diff --git a/libnetdata/clocks/clocks.h b/libnetdata/clocks/clocks.h
index 47aa148c12..4af451d60a 100644
--- a/libnetdata/clocks/clocks.h
+++ b/libnetdata/clocks/clocks.h
@@ -136,4 +136,6 @@ extern int sleep_usec(usec_t usec);
*/
void test_clock_boottime(void);
+extern collected_number uptime_msec(char *filename);
+
#endif /* NETDATA_CLOCKS_H */
diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js
index 07a8c3dbbc..130162be37 100644
--- a/web/gui/dashboard_info.js
+++ b/web/gui/dashboard_info.js
@@ -985,6 +985,10 @@ netdataDashboard.context = {
height: 2.0
},
+ 'apps.uptime': {
+ info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.'
+ },
+
// ------------------------------------------------------------------------
// USERS
@@ -1008,6 +1012,10 @@ netdataDashboard.context = {
height: 2.0
},
+ 'users.uptime': {
+ info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.'
+ },
+
// ------------------------------------------------------------------------
// GROUPS
@@ -1020,7 +1028,7 @@ netdataDashboard.context = {
},
'groups.vmem': {
- info: 'Virtual memory allocated per user group. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.'
+ info: 'Virtual memory allocated per user group since the Netdata restart. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.'
},
'groups.preads': {
@@ -1031,6 +1039,10 @@ netdataDashboard.context = {
height: 2.0
},
+ 'groups.uptime': {
+ info: 'Carried over process group uptime. The period of time within which at least one process in the group was running.'
+ },
+
// ------------------------------------------------------------------------
// NETWORK QoS