diff options
author | Vladimir Kobal <vlad@prokk.net> | 2019-08-29 20:35:05 +0300 |
---|---|---|
committer | Paul Emm. Katsoulakis <34388743+paulkatsoulakis@users.noreply.github.com> | 2019-08-29 20:35:05 +0300 |
commit | c79112e85317d80f46acb26296e47a7da8d9d8b0 (patch) | |
tree | 7c65b3df8a78e0ee0ac38bea4746fe6aa2bc7dff | |
parent | 1907c1486fba9ecb534058d1d3b7573777b67b82 (diff) |
Display uptime for processes (#6654)
* Get process uptime
* Calculate target uptime
* Update charts
* Show collected data
* Fix chart names
* Update the documentation
* Fix a flag value
* Add an explanation note for the 'carried over uptime' chart
* Move the functions for getting uptime to libnetdata
* Rename the function for getting uptime
* Remove redundant code
* Fix starttime calculation
* More accurate definition for the carried over uptime
* Fix group starttime calculation
* Fix typo
-rw-r--r-- | collectors/apps.plugin/README.md | 5 | ||||
-rw-r--r-- | collectors/apps.plugin/apps_plugin.c | 106 | ||||
-rw-r--r-- | collectors/proc.plugin/proc_uptime.c | 71 | ||||
-rw-r--r-- | libnetdata/clocks/clocks.c | 65 | ||||
-rw-r--r-- | libnetdata/clocks/clocks.h | 2 | ||||
-rw-r--r-- | web/gui/dashboard_info.js | 14 |
6 files changed, 195 insertions, 68 deletions
diff --git a/collectors/apps.plugin/README.md b/collectors/apps.plugin/README.md index 308fe657fa..967ebb2d8e 100644 --- a/collectors/apps.plugin/README.md +++ b/collectors/apps.plugin/README.md @@ -47,6 +47,11 @@ Each of these sections provides the same number of charts: - Threads Running - Processes Running - Pipes Open + - Carried Over Uptime (since the Netdata restart) + - Minimum Uptime + - Average Uptime + - Maximum Uptime + - Swap Memory - Swap Memory Used - Major Page Faults (i.e. swap activity) diff --git a/collectors/apps.plugin/apps_plugin.c b/collectors/apps.plugin/apps_plugin.c index a757a5bddf..47f5d6ede7 100644 --- a/collectors/apps.plugin/apps_plugin.c +++ b/collectors/apps.plugin/apps_plugin.c @@ -260,6 +260,12 @@ struct target { kernel_uint_t openeventpolls; kernel_uint_t openother; + kernel_uint_t starttime; + kernel_uint_t collected_starttime; + kernel_uint_t uptime_min; + kernel_uint_t uptime_sum; + kernel_uint_t uptime_max; + unsigned int processes; // how many processes have been merged to this int exposed; // if set, we have sent this to netdata int hidden; // if set, we set the hidden flag on the dimension @@ -345,7 +351,7 @@ struct pid_stat { // int64_t nice; int32_t num_threads; // int64_t itrealvalue; - // kernel_uint_t starttime; + kernel_uint_t collected_starttime; // kernel_uint_t vsize; // kernel_uint_t rss; // kernel_uint_t rsslim; @@ -419,6 +425,8 @@ struct pid_stat { usec_t io_collected_usec; usec_t last_io_collected_usec; + kernel_uint_t uptime; + char *fds_dirname; // the full directory name in /proc/PID/fd char *stat_filename; @@ -433,6 +441,8 @@ struct pid_stat { size_t pagesize; +kernel_uint_t global_uptime; + // log each problem once per process // log flood protection flags (log_thrown) #define PID_LOG_IO 0x00000001 @@ -1421,7 +1431,8 @@ static inline int read_proc_pid_stat(struct pid_stat *p, void *ptr) { // p->nice = str2kernel_uint_t(procfile_lineword(ff, 0, 18)); p->num_threads = 
(int32_t)str2uint32_t(procfile_lineword(ff, 0, 19)); // p->itrealvalue = str2kernel_uint_t(procfile_lineword(ff, 0, 20)); - // p->starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21)); + p->collected_starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21)) / system_hz; + p->uptime = (global_uptime > p->collected_starttime)?(global_uptime - p->collected_starttime):0; // p->vsize = str2kernel_uint_t(procfile_lineword(ff, 0, 22)); // p->rss = str2kernel_uint_t(procfile_lineword(ff, 0, 23)); // p->rsslim = str2kernel_uint_t(procfile_lineword(ff, 0, 24)); @@ -1490,6 +1501,8 @@ cleanup: return 0; } +// ---------------------------------------------------------------------------- + static inline int read_proc_pid_io(struct pid_stat *p, void *ptr) { (void)ptr; #ifdef __FreeBSD__ @@ -2634,6 +2647,12 @@ static int collect_data_for_all_processes(void) { collect_data_for_pid(pid, &procbase[i]); } #else + static char uptime_filename[FILENAME_MAX + 1] = ""; + if(*uptime_filename == '\0') + snprintfz(uptime_filename, FILENAME_MAX, "%s/proc/uptime", netdata_configured_host_prefix); + + global_uptime = (kernel_uint_t)(uptime_msec(uptime_filename) / MSEC_PER_SEC); + char dirname[FILENAME_MAX + 1]; snprintfz(dirname, FILENAME_MAX, "%s/proc", netdata_configured_host_prefix); @@ -2879,6 +2898,11 @@ static size_t zero_all_targets(struct target *root) { w->openother = 0; } + w->collected_starttime = 0; + w->uptime_min = 0; + w->uptime_sum = 0; + w->uptime_max = 0; + if(unlikely(w->root_pid)) { struct pid_on_target *pid_on_target_to_free, *pid_on_target = w->root_pid; @@ -3032,6 +3056,11 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p, w->processes++; w->num_threads += p->num_threads; + if(!w->collected_starttime || p->collected_starttime < w->collected_starttime) w->collected_starttime = p->collected_starttime; + if(!w->uptime_min || p->uptime < w->uptime_min) w->uptime_min = p->uptime; + w->uptime_sum += p->uptime; + if(!w->uptime_max || 
w->uptime_max < p->uptime) w->uptime_max = p->uptime; + if(unlikely(debug_enabled || w->debug_enabled)) { debug_log_int("aggregating '%s' pid %d on target '%s' utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", gtime=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", cgtime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT "", p->comm, p->pid, w->name, p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt); @@ -3042,6 +3071,19 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p, } } +static inline void post_aggregate_targets(struct target *root) { + struct target *w; + for (w = root; w ; w = w->next) { + if(w->collected_starttime) { + if (!w->starttime || w->collected_starttime < w->starttime) { + w->starttime = w->collected_starttime; + } + } else { + w->starttime = 0; + } + } +} + static void calculate_netdata_statistics(void) { apply_apps_groups_targets_inheritance(); @@ -3102,6 +3144,10 @@ static void calculate_netdata_statistics(void) { aggregate_pid_fds_on_targets(p); } + post_aggregate_targets(apps_groups_root_target); + post_aggregate_targets(users_root_target); + post_aggregate_targets(groups_root_target); + cleanup_exited_pids(); } @@ -3457,6 +3503,36 @@ static void send_collected_data_to_netdata(struct target *root, const char *type } send_END(); +#ifndef __FreeBSD__ + send_BEGIN(type, "uptime", dt); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed && w->processes)) + send_SET(w->name, (global_uptime > w->starttime)?(global_uptime - w->starttime):0); + } + send_END(); + + send_BEGIN(type, "uptime_min", dt); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed && w->processes)) + send_SET(w->name, w->uptime_min); + } + send_END(); + + send_BEGIN(type, "uptime_avg", dt); + for (w = root; w ; w = w->next) { + 
if(unlikely(w->exposed && w->processes)) + send_SET(w->name, w->processes?(w->uptime_sum / w->processes):0); + } + send_END(); + + send_BEGIN(type, "uptime_max", dt); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed && w->processes)) + send_SET(w->name, w->uptime_max); + } + send_END(); +#endif + send_BEGIN(type, "mem", dt); for (w = root; w ; w = w->next) { if(unlikely(w->exposed && w->processes)) @@ -3615,6 +3691,32 @@ static void send_charts_updates_to_netdata(struct target *root, const char *type fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name); } +#ifndef __FreeBSD__ + fprintf(stdout, "CHART %s.uptime '' '%s Carried Over Uptime' 'seconds' processes %s.uptime line 20008 %d\n", type, title, type, update_every); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed)) + fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name); + } + + fprintf(stdout, "CHART %s.uptime_min '' '%s Minimum Uptime' 'seconds' processes %s.uptime_min line 20009 %d\n", type, title, type, update_every); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed)) + fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name); + } + + fprintf(stdout, "CHART %s.uptime_avg '' '%s Average Uptime' 'seconds' processes %s.uptime_avg line 20010 %d\n", type, title, type, update_every); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed)) + fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name); + } + + fprintf(stdout, "CHART %s.uptime_max '' '%s Maximum Uptime' 'seconds' processes %s.uptime_max line 20011 %d\n", type, title, type, update_every); + for (w = root; w ; w = w->next) { + if(unlikely(w->exposed)) + fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name); + } +#endif + fprintf(stdout, "CHART %s.cpu_user '' '%s CPU User Time (%d%% = %d core%s)' 'percentage' cpu %s.cpu_user stacked 20020 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every); for (w = root; w ; w = w->next) { if(unlikely(w->exposed)) 
diff --git a/collectors/proc.plugin/proc_uptime.c b/collectors/proc.plugin/proc_uptime.c index 142ae2d0c3..28b00e0da1 100644 --- a/collectors/proc.plugin/proc_uptime.c +++ b/collectors/proc.plugin/proc_uptime.c @@ -2,76 +2,17 @@ #include "plugin_proc.h" -static inline collected_number uptime_from_boottime(void) { -#ifdef CLOCK_BOOTTIME_IS_AVAILABLE - return now_boottime_usec() / 1000; -#else - error("uptime cannot be read from CLOCK_BOOTTIME on this system."); - return 0; -#endif -} - -static procfile *read_proc_uptime_ff = NULL; -static inline collected_number read_proc_uptime(void) { - if(unlikely(!read_proc_uptime_ff)) { - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime"); - - read_proc_uptime_ff = procfile_open(config_get("plugin:proc:/proc/uptime", "filename to monitor", filename), " \t", PROCFILE_FLAG_DEFAULT); - if(unlikely(!read_proc_uptime_ff)) return 0; - } - - read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff); - if(unlikely(!read_proc_uptime_ff)) return 0; - - if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) { - error("/proc/uptime has no lines."); - return 0; - } - if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) { - error("/proc/uptime has less than 1 word in it."); - return 0; - } - - return (collected_number)(strtold(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0); -} - int do_proc_uptime(int update_every, usec_t dt) { (void)dt; - static int use_boottime = -1; - - if(unlikely(use_boottime == -1)) { - collected_number uptime_boottime = uptime_from_boottime(); - collected_number uptime_proc = read_proc_uptime(); - - long long delta = (long long)uptime_boottime - (long long)uptime_proc; - if(delta < 0) delta = -delta; + static char *uptime_filename = NULL; + if(!uptime_filename) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime"); - if(delta <= 1000 && 
uptime_boottime != 0) { - procfile_close(read_proc_uptime_ff); - info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta); - use_boottime = 1; - } - else if(uptime_proc != 0) { - info("Using /proc/uptime for uptime (dt is %lld ms)", delta); - use_boottime = 0; - } - else { - error("Cannot find any way to read uptime on this system."); - return 1; - } + uptime_filename = config_get("plugin:proc:/proc/uptime", "filename to monitor", filename); } - collected_number uptime; - if(use_boottime) - uptime = uptime_from_boottime(); - else - uptime = read_proc_uptime(); - - - // -------------------------------------------------------------------- - static RRDSET *st = NULL; static RRDDIM *rd = NULL; @@ -97,7 +38,7 @@ int do_proc_uptime(int update_every, usec_t dt) { else rrdset_next(st); - rrddim_set_by_pointer(st, rd, uptime); + rrddim_set_by_pointer(st, rd, uptime_msec(uptime_filename)); rrdset_done(st); diff --git a/libnetdata/clocks/clocks.c b/libnetdata/clocks/clocks.c index f7d2171780..161225a9b6 100644 --- a/libnetdata/clocks/clocks.c +++ b/libnetdata/clocks/clocks.c @@ -210,3 +210,68 @@ int sleep_usec(usec_t usec) { return ret; #endif } + +static inline collected_number uptime_from_boottime(void) { +#ifdef CLOCK_BOOTTIME_IS_AVAILABLE + return now_boottime_usec() / 1000; +#else + error("uptime cannot be read from CLOCK_BOOTTIME on this system."); + return 0; +#endif +} + +static procfile *read_proc_uptime_ff = NULL; +static inline collected_number read_proc_uptime(char *filename) { + if(unlikely(!read_proc_uptime_ff)) { + read_proc_uptime_ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!read_proc_uptime_ff)) return 0; + } + + read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff); + if(unlikely(!read_proc_uptime_ff)) return 0; + + if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) { + error("/proc/uptime has no lines."); + return 0; + } + if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) { + error("/proc/uptime 
has less than 1 word in it."); + return 0; + } + + return (collected_number)(strtold(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0); +} + +inline collected_number uptime_msec(char *filename){ + static int use_boottime = -1; + + if(unlikely(use_boottime == -1)) { + collected_number uptime_boottime = uptime_from_boottime(); + collected_number uptime_proc = read_proc_uptime(filename); + + long long delta = (long long)uptime_boottime - (long long)uptime_proc; + if(delta < 0) delta = -delta; + + if(delta <= 1000 && uptime_boottime != 0) { + procfile_close(read_proc_uptime_ff); + info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta); + use_boottime = 1; + } + else if(uptime_proc != 0) { + info("Using /proc/uptime for uptime (dt is %lld ms)", delta); + use_boottime = 0; + } + else { + error("Cannot find any way to read uptime on this system."); + return 1; + } + } + + collected_number uptime; + if(use_boottime) + uptime = uptime_from_boottime(); + else + uptime = read_proc_uptime(filename); + + return uptime; +} diff --git a/libnetdata/clocks/clocks.h b/libnetdata/clocks/clocks.h index 47aa148c12..4af451d60a 100644 --- a/libnetdata/clocks/clocks.h +++ b/libnetdata/clocks/clocks.h @@ -136,4 +136,6 @@ extern int sleep_usec(usec_t usec); */ void test_clock_boottime(void); +extern collected_number uptime_msec(char *filename); + #endif /* NETDATA_CLOCKS_H */ diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js index 07a8c3dbbc..130162be37 100644 --- a/web/gui/dashboard_info.js +++ b/web/gui/dashboard_info.js @@ -985,6 +985,10 @@ netdataDashboard.context = { height: 2.0 }, + 'apps.uptime': { + info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.' 
+ }, + // ------------------------------------------------------------------------ // USERS @@ -1008,6 +1012,10 @@ netdataDashboard.context = { height: 2.0 }, + 'users.uptime': { + info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.' + }, + // ------------------------------------------------------------------------ // GROUPS @@ -1020,7 +1028,7 @@ netdataDashboard.context = { }, 'groups.vmem': { - info: 'Virtual memory allocated per user group. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.' + info: 'Virtual memory allocated per user group since the Netdata restart. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.' }, 'groups.preads': { @@ -1031,6 +1039,10 @@ netdataDashboard.context = { height: 2.0 }, + 'groups.uptime': { + info: 'Carried over process group uptime. The period of time within which at least one process in the group was running.' + }, + // ------------------------------------------------------------------------ // NETWORK QoS |