summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Costa Tsaousis <costa@tsaousis.gr> 2016-07-08 03:56:10 +0300
committer GitHub <noreply@github.com> 2016-07-08 03:56:10 +0300
commit c2f188f6db995d19c85823b829585d804a8ab026 (patch)
tree 9115116aeb9f076323a6aa39e7f3b2982e9bd7fb
parent 509c50df3bce82fc6bae3974347619fa97591bad (diff)
parent fdc28d5054fc44bb096357f5c114dc0f0edd31ba (diff)
Merge pull request #667 from ktsaou/master
yet another rewrite of apps.plugin to fix cpu spikes on slow systems
-rw-r--r-- src/apps_plugin.c 1184
1 files changed, 670 insertions, 514 deletions
diff --git a/src/apps_plugin.c b/src/apps_plugin.c
index ba497b7b9c..93e9501c2b 100644
--- a/src/apps_plugin.c
+++ b/src/apps_plugin.c
@@ -48,12 +48,14 @@ pid_t pid_max = 32768;
int debug = 0;
int update_every = 1;
+unsigned long long global_iterations_counter = 1;
unsigned long long file_counter = 0;
int proc_pid_cmdline_is_needed = 0;
int include_exited_childs = 1;
char *host_prefix = "";
char *config_dir = CONFIG_DIR;
+pid_t *all_pids_sortlist = NULL;
// ----------------------------------------------------------------------------
@@ -146,15 +148,6 @@ struct target {
unsigned long long num_threads;
unsigned long long rss;
- long long fix_minflt;
- long long fix_cminflt;
- long long fix_majflt;
- long long fix_cmajflt;
- long long fix_utime;
- long long fix_stime;
- long long fix_cutime;
- long long fix_cstime;
-
unsigned long long statm_size;
unsigned long long statm_resident;
unsigned long long statm_share;
@@ -171,14 +164,6 @@ struct target {
unsigned long long io_storage_bytes_written;
unsigned long long io_cancelled_write_bytes;
- unsigned long long fix_io_logical_bytes_read;
- unsigned long long fix_io_logical_bytes_written;
- unsigned long long fix_io_read_calls;
- unsigned long long fix_io_write_calls;
- unsigned long long fix_io_storage_bytes_read;
- unsigned long long fix_io_storage_bytes_written;
- unsigned long long fix_io_cancelled_write_bytes;
-
int *fds;
unsigned long long openfiles;
unsigned long long openpipes;
@@ -459,6 +444,18 @@ struct pid_stat {
// int32_t tty_nr;
// int32_t tpgid;
// uint64_t flags;
+
+ // these are raw values collected
+ unsigned long long minflt_raw;
+ unsigned long long cminflt_raw;
+ unsigned long long majflt_raw;
+ unsigned long long cmajflt_raw;
+ unsigned long long utime_raw;
+ unsigned long long stime_raw;
+ unsigned long long cutime_raw;
+ unsigned long long cstime_raw;
+
+ // these are rates
unsigned long long minflt;
unsigned long long cminflt;
unsigned long long majflt;
@@ -467,6 +464,7 @@ struct pid_stat {
unsigned long long stime;
unsigned long long cutime;
unsigned long long cstime;
+
// int64_t priority;
// int64_t nice;
int32_t num_threads;
@@ -506,6 +504,14 @@ struct pid_stat {
unsigned long long statm_data;
unsigned long long statm_dirty;
+ unsigned long long io_logical_bytes_read_raw;
+ unsigned long long io_logical_bytes_written_raw;
+ unsigned long long io_read_calls_raw;
+ unsigned long long io_write_calls_raw;
+ unsigned long long io_storage_bytes_read_raw;
+ unsigned long long io_storage_bytes_written_raw;
+ unsigned long long io_cancelled_write_bytes_raw;
+
unsigned long long io_logical_bytes_read;
unsigned long long io_logical_bytes_written;
unsigned long long io_read_calls;
@@ -514,61 +520,42 @@ struct pid_stat {
unsigned long long io_storage_bytes_written;
unsigned long long io_cancelled_write_bytes;
- // we need the last values
- // for all incremental counters
- // so that when a process switches users/groups
- // we will subtract these values from the old
- // target
- unsigned long long last_minflt;
- unsigned long long last_majflt;
- unsigned long long last_utime;
- unsigned long long last_stime;
-
- unsigned long long last_cminflt;
- unsigned long long last_cmajflt;
- unsigned long long last_cutime;
- unsigned long long last_cstime;
-
- unsigned long long last_fix_cminflt;
- unsigned long long last_fix_cmajflt;
- unsigned long long last_fix_cutime;
- unsigned long long last_fix_cstime;
-
- unsigned long long last_io_logical_bytes_read;
- unsigned long long last_io_logical_bytes_written;
- unsigned long long last_io_read_calls;
- unsigned long long last_io_write_calls;
- unsigned long long last_io_storage_bytes_read;
- unsigned long long last_io_storage_bytes_written;
- unsigned long long last_io_cancelled_write_bytes;
-
- unsigned long long fix_cminflt;
- unsigned long long fix_cmajflt;
- unsigned long long fix_cutime;
- unsigned long long fix_cstime;
-
int *fds; // array of fds it uses
int fds_size; // the size of the fds array
int children_count; // number of processes directly referencing this
- int updated; // 1 when update
+ int keep; // 1 when we need to keep this process in memory even after it exited
+ int keeploops; // increases by 1 every time keep is 1 and updated 0
+ int updated; // 1 when the process is currently running
int merged; // 1 when it has been merged to its parent
- int new_entry;
+ int new_entry; // 1 when this is a new process, just saw for the first time
+ int read; // 1 when we have already read this process for this iteration
+ int sortlist; // higher numbers = top on the process tree
+ // each process gets a unique number
struct target *target; // app_groups.conf targets
struct target *user_target; // uid based targets
struct target *group_target; // gid based targets
+ unsigned long long stat_collected_usec;
+ unsigned long long last_stat_collected_usec;
+
+ unsigned long long io_collected_usec;
+ unsigned long long last_io_collected_usec;
+
+ char *stat_filename;
+ char *statm_filename;
+ char *io_filename;
+ char *cmdline_filename;
+
struct pid_stat *parent;
struct pid_stat *prev;
struct pid_stat *next;
-
} *root_of_pids = NULL, **all_pids;
long all_pids_count = 0;
-struct pid_stat *get_pid_entry(pid_t pid)
-{
+struct pid_stat *get_pid_entry(pid_t pid) {
if(all_pids[pid]) {
all_pids[pid]->new_entry = 0;
return all_pids[pid];
@@ -592,12 +579,16 @@ struct pid_stat *get_pid_entry(pid_t pid)
all_pids[pid]->pid = pid;
all_pids[pid]->new_entry = 1;
+ all_pids_count++;
+
return all_pids[pid];
}
-void del_pid_entry(pid_t pid)
-{
- if(!all_pids[pid]) return;
+void del_pid_entry(pid_t pid) {
+ if(!all_pids[pid]) {
+ error("attempted to free pid %d that is not allocated.", pid);
+ return;
+ }
if(unlikely(debug))
fprintf(stderr, "apps.plugin: process %d %s exited, deleting it.\n", pid, all_pids[pid]->comm);
@@ -607,8 +598,14 @@ void del_pid_entry(pid_t pid)
if(all_pids[pid]->prev) all_pids[pid]->prev->next = all_pids[pid]->next;
if(all_pids[pid]->fds) free(all_pids[pid]->fds);
+ if(all_pids[pid]->stat_filename) free(all_pids[pid]->stat_filename);
+ if(all_pids[pid]->statm_filename) free(all_pids[pid]->statm_filename);
+ if(all_pids[pid]->io_filename) free(all_pids[pid]->io_filename);
+ if(all_pids[pid]->cmdline_filename) free(all_pids[pid]->cmdline_filename);
free(all_pids[pid]);
+
all_pids[pid] = NULL;
+ all_pids_count--;
}
@@ -616,42 +613,51 @@ void del_pid_entry(pid_t pid)
// update pids from proc
int read_proc_pid_cmdline(struct pid_stat *p) {
- char filename[FILENAME_MAX + 1];
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/cmdline", host_prefix, p->pid);
+
+ if(unlikely(!p->cmdline_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/cmdline", host_prefix, p->pid);
+ if(!(p->cmdline_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- int fd = open(filename, O_RDONLY, 0666);
- if(unlikely(fd == -1)) return 1;
+ int fd = open(p->cmdline_filename, O_RDONLY, 0666);
+ if(unlikely(fd == -1)) goto cleanup;
int i, bytes = read(fd, p->cmdline, MAX_CMDLINE);
close(fd);
- if(bytes <= 0) {
- // copy the command to the command line
- strncpyz(p->cmdline, p->comm, MAX_CMDLINE);
- return 0;
- }
+ if(unlikely(bytes <= 0)) goto cleanup;
p->cmdline[bytes] = '\0';
for(i = 0; i < bytes ; i++)
- if(!p->cmdline[i]) p->cmdline[i] = ' ';
+ if(unlikely(!p->cmdline[i])) p->cmdline[i] = ' ';
if(unlikely(debug))
- fprintf(stderr, "Read file '%s' contents: %s\n", filename, p->cmdline);
+ fprintf(stderr, "Read file '%s' contents: %s\n", p->cmdline_filename, p->cmdline);
+
+ return 0;
+cleanup:
+ // copy the command to the command line
+ strncpyz(p->cmdline, p->comm, MAX_CMDLINE);
return 0;
}
int read_proc_pid_ownership(struct pid_stat *p) {
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d", host_prefix, p->pid);
+ if(unlikely(!p->stat_filename)) {
+ error("pid %d does not have a stat_filename", p->pid);
+ return 1;
+ }
// ----------------------------------------
// read uid and gid
struct stat st;
- if(stat(filename, &st) != 0)
+ if(stat(p->stat_filename, &st) != 0) {
+ error("Cannot stat file '%s'", p->stat_filename);
return 1;
+ }
p->uid = st.st_uid;
p->gid = st.st_gid;
@@ -662,26 +668,26 @@ int read_proc_pid_ownership(struct pid_stat *p) {
int read_proc_pid_stat(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/stat", host_prefix, p->pid);
-
- // ----------------------------------------
+ if(unlikely(!p->stat_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/stat", host_prefix, p->pid);
+ if(!(p->stat_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
int set_quotes = (!ff)?1:0;
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ ff = procfile_reopen(ff, p->stat_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
// if(set_quotes) procfile_set_quotes(ff, "()");
if(set_quotes) procfile_set_open_close(ff, "(", ")");
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
+ p->last_stat_collected_usec = p->stat_collected_usec;
+ p->stat_collected_usec = timems();
file_counter++;
// parse the process name
@@ -697,14 +703,41 @@ int read_proc_pid_stat(struct pid_stat *p) {
// p->tty_nr = atol(procfile_lineword(ff, 0, 6+i));
// p->tpgid = atol(procfile_lineword(ff, 0, 7+i));
// p->flags = strtoull(procfile_lineword(ff, 0, 8+i), NULL, 10);
- p->minflt = strtoull(procfile_lineword(ff, 0, 9+i), NULL, 10);
- p->cminflt = strtoull(procfile_lineword(ff, 0, 10+i), NULL, 10);
- p->majflt = strtoull(procfile_lineword(ff, 0, 11+i), NULL, 10);
- p->cmajflt = strtoull(procfile_lineword(ff, 0, 12+i), NULL, 10);
- p->utime = strtoull(procfile_lineword(ff, 0, 13+i), NULL, 10);
- p->stime = strtoull(procfile_lineword(ff, 0, 14+i), NULL, 10);
- p->cutime = strtoull(procfile_lineword(ff, 0, 15+i), NULL, 10);
- p->cstime = strtoull(procfile_lineword(ff, 0, 16+i), NULL, 10);
+
+ unsigned long long last;
+
+ last = p->minflt_raw;
+ p->minflt_raw = strtoull(procfile_lineword(ff, 0, 9+i), NULL, 10);
+ p->minflt = (p->minflt_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cminflt_raw;
+ p->cminflt_raw = strtoull(procfile_lineword(ff, 0, 10+i), NULL, 10);
+ p->cminflt = (p->cminflt_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->majflt_raw;
+ p->majflt_raw = strtoull(procfile_lineword(ff, 0, 11+i), NULL, 10);
+ p->majflt = (p->majflt_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cmajflt_raw;
+ p->cmajflt_raw = strtoull(procfile_lineword(ff, 0, 12+i), NULL, 10);
+ p->cmajflt = (p->cmajflt_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->utime_raw;
+ p->utime_raw = strtoull(procfile_lineword(ff, 0, 13+i), NULL, 10);
+ p->utime = (p->utime_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->stime_raw;
+ p->stime_raw = strtoull(procfile_lineword(ff, 0, 14+i), NULL, 10);
+ p->stime = (p->stime_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cutime_raw;
+ p->cutime_raw = strtoull(procfile_lineword(ff, 0, 15+i), NULL, 10);
+ p->cutime = (p->cutime_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
+ last = p->cstime_raw;
+ p->cstime_raw = strtoull(procfile_lineword(ff, 0, 16+i), NULL, 10);
+ p->cstime = (p->cstime_raw - last) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+
// p->priority = strtoull(procfile_lineword(ff, 0, 17+i), NULL, 10);
// p->nice = strtoull(procfile_lineword(ff, 0, 18+i), NULL, 10);
p->num_threads = (int32_t) atol(procfile_lineword(ff, 0, 19 + i));
@@ -734,27 +767,50 @@ int read_proc_pid_stat(struct pid_stat *p) {
// p->cguest_time = strtoull(procfile_lineword(ff, 0, 43), NULL, 10);
if(unlikely(debug || (p->target && p->target->debug)))
- fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' VALUES: utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, threads=%d\n", host_prefix, p->pid, p->comm, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
+ fprintf(stderr, "apps.plugin: READ PROC/PID/STAT: %s/proc/%d/stat, process: '%s' on target '%s' (dt=%llu) VALUES: utime=%llu, stime=%llu, cutime=%llu, cstime=%llu, minflt=%llu, majflt=%llu, cminflt=%llu, cmajflt=%llu, threads=%d\n", host_prefix, p->pid, p->comm, (p->target)?p->target->name:"UNSET", p->stat_collected_usec - p->last_stat_collected_usec, p->utime, p->stime, p->cutime, p->cstime, p->minflt, p->majflt, p->cminflt, p->cmajflt, p->num_threads);
+
+ if(unlikely(global_iterations_counter == 1)) {
+ p->minflt = 0;
+ p->cminflt = 0;
+ p->majflt = 0;
+ p->cmajflt = 0;
+ p->utime = 0;
+ p->stime = 0;
+ p->cutime = 0;
+ p->cstime = 0;
+ }
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->minflt = 0;
+ p->cminflt = 0;
+ p->majflt = 0;
+ p->cmajflt = 0;
+ p->utime = 0;
+ p->stime = 0;
+ p->cutime = 0;
+ p->cstime = 0;
+ p->num_threads = 0;
+ p->rss = 0;
+ return 1;
}
int read_proc_pid_statm(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/statm", host_prefix, p->pid);
+ if(unlikely(!p->statm_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/statm", host_prefix, p->pid);
+ if(!(p->statm_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ ff = procfile_reopen(ff, p->statm_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
file_counter++;
@@ -766,38 +822,92 @@ int read_proc_pid_statm(struct pid_stat *p) {
p->statm_data = strtoull(procfile_lineword(ff, 0, 5), NULL, 10);
p->statm_dirty = strtoull(procfile_lineword(ff, 0, 6), NULL, 10);
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->statm_size = 0;
+ p->statm_resident = 0;
+ p->statm_share = 0;
+ p->statm_text = 0;
+ p->statm_lib = 0;
+ p->statm_data = 0;
+ p->statm_dirty = 0;
+ return 1;
}
int read_proc_pid_io(struct pid_stat *p) {
static procfile *ff = NULL;
- char filename[FILENAME_MAX + 1];
-
- snprintfz(filename, FILENAME_MAX, "%s/proc/%d/io", host_prefix, p->pid);
+ if(unlikely(!p->io_filename)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s/proc/%d/io", host_prefix, p->pid);
+ if(!(p->io_filename = strdup(filename)))
+ fatal("Cannot allocate memory for filename '%s'", filename);
+ }
- ff = procfile_reopen(ff, filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
- if(!ff) return 1;
+ // open the file
+ ff = procfile_reopen(ff, p->io_filename, NULL, PROCFILE_FLAG_NO_ERROR_ON_FILE_IO);
+ if(unlikely(!ff)) goto cleanup;
ff = procfile_readall(ff);
- if(!ff) {
- // procfile_close(ff);
- return 1;
- }
+ if(unlikely(!ff)) goto cleanup;
file_counter++;
- p->io_logical_bytes_read = strtoull(procfile_lineword(ff, 0, 1), NULL, 10);
- p->io_logical_bytes_written = strtoull(procfile_lineword(ff, 1, 1), NULL, 10);
- p->io_read_calls = strtoull(procfile_lineword(ff, 2, 1), NULL, 10);
- p->io_write_calls = strtoull(procfile_lineword(ff, 3, 1), NULL, 10);
- p->io_storage_bytes_read = strtoull(procfile_lineword(ff, 4, 1), NULL, 10);
- p->io_storage_bytes_written = strtoull(procfile_lineword(ff, 5, 1), NULL, 10);
- p->io_cancelled_write_bytes = strtoull(procfile_lineword(ff, 6, 1), NULL, 10);
+ p->last_io_collected_usec = p->io_collected_usec;
+ p->io_collected_usec = timems();
+
+ unsigned long long last;
+
+ last = p->io_logical_bytes_read_raw;
+ p->io_logical_bytes_read_raw = strtoull(procfile_lineword(ff, 0, 1), NULL, 10);
+ p->io_logical_bytes_read = (p->io_logical_bytes_read_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_logical_bytes_written_raw;
+ p->io_logical_bytes_written_raw = strtoull(procfile_lineword(ff, 1, 1), NULL, 10);
+ p->io_logical_bytes_written = (p->io_logical_bytes_written_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_read_calls_raw;
+ p->io_read_calls_raw = strtoull(procfile_lineword(ff, 2, 1), NULL, 10);
+ p->io_read_calls = (p->io_read_calls_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_write_calls_raw;
+ p->io_write_calls_raw = strtoull(procfile_lineword(ff, 3, 1), NULL, 10);
+ p->io_write_calls = (p->io_write_calls_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_storage_bytes_read_raw;
+ p->io_storage_bytes_read_raw = strtoull(procfile_lineword(ff, 4, 1), NULL, 10);
+ p->io_storage_bytes_read = (p->io_storage_bytes_read_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_storage_bytes_written_raw;
+ p->io_storage_bytes_written_raw = strtoull(procfile_lineword(ff, 5, 1), NULL, 10);
+ p->io_storage_bytes_written = (p->io_storage_bytes_written_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ last = p->io_cancelled_write_bytes_raw;
+ p->io_cancelled_write_bytes_raw = strtoull(procfile_lineword(ff, 6, 1), NULL, 10);
+ p->io_cancelled_write_bytes = (p->io_cancelled_write_bytes_raw - last) * (1000000 * 100) / (p->io_collected_usec - p->last_io_collected_usec);
+
+ if(unlikely(global_iterations_counter == 1)) {
+ p->io_logical_bytes_read = 0;
+ p->io_logical_bytes_written = 0;
+ p->io_read_calls = 0;
+ p->io_write_calls = 0;
+ p->io_storage_bytes_read = 0;
+ p->io_storage_bytes_written = 0;
+ p->io_cancelled_write_bytes = 0;
+ }
- // procfile_close(ff);
return 0;
+
+cleanup:
+ p->io_logical_bytes_read = 0;
+ p->io_logical_bytes_written = 0;
+ p->io_read_calls = 0;
+ p->io_write_calls = 0;
+ p->io_storage_bytes_read = 0;
+ p->io_storage_bytes_written = 0;
+ p->io_cancelled_write_bytes = 0;
+ return 1;
}
@@ -1120,6 +1230,175 @@ int read_pid_file_descriptors(struct pid_stat *p) {
// ----------------------------------------------------------------------------
+#ifdef NETDATA_INTERNAL_CHECKS
+void find_lost_child_debug(struct pid_stat *pe, struct pid_stat *ppe, unsigned long long lost, int type) {
+ int found = 0;
+ struct pid_stat *p = NULL, *pp = pe->parent;
+
+ log_date(stderr);
+ fprintf(stderr, "Searching for candidate of lost resources of process %d (%s, %s) which is aggregated on %d (%s, %s)\n", pe->pid, pe->comm, pe->updated?"running":"exited", ppe->pid, ppe->comm, ppe->updated?"running":"exited");
+ while(pp) {
+ fprintf(stderr, " >> parent %d (%s, %s)\n", pp->pid, pp->comm, pp->updated?"running":"exited");
+ pp = pp->parent;
+ }
+
+ for(p = root_of_pids; p ; p = p->next) {
+ if(p == pe) continue;
+
+ switch(type) {
+ case 1:
+ if(p->cminflt > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child minflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 2:
+ if(p->cmajflt > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child majflt %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 3:
+ if(p->cutime > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child utime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+
+ case 4:
+ if(p->cstime > lost) {
+ fprintf(stderr, " > process %d (%s) could use the lost exited child stime %llu of process %d (%s)\n", p->pid, p->comm, lost, pe->pid, pe->comm);
+ found++;
+ }
+ break;
+ }
+ }
+
+ if(!found) {
+ switch(type) {
+ case 1:
+ fprintf(stderr, " > cannot find any process to use the lost exited child minflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 2:
+ fprintf(stderr, " > cannot find any process to use the lost exited child majflt %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 3:
+ fprintf(stderr, " > cannot find any process to use the lost exited child utime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+
+ case 4:
+ fprintf(stderr, " > cannot find any process to use the lost exited child stime %llu of process %d (%s)\n", lost, pe->pid, pe->comm);
+ break;
+ }
+ }
+}
+#endif /* NETDATA_INTERNAL_CHECKS */
+
+void remove_exited_child_from_parent(unsigned long long *field, unsigned long long *pfield, unsigned long long *ifield, struct pid_stat *pe, struct pid_stat *ppe, int type) {
+ if(pfield) {
+ if(*field > *pfield) {
+ *field -= *pfield;
+ *pfield = 0;
+ }
+ else {
+ *pfield -= *field;
+ *field = 0;
+ }
+ }
+
+ if(*field) {
+ if(ifield && ifield != pfield) {
+ if(*field > *ifield) {
+ *field -= *ifield;
+ *ifield = 0;
+ }
+ else {
+ *ifield -= *field;
+ *field = 0;
+ }
+ }
+ }
+
+ if(*field) {
+#ifdef NETDATA_INTERNAL_CHECKS
+ find_lost_child_debug(pe, ppe, *field, type);
+#endif
+ while(pe && !pe->updated) {
+ pe->keep = 1;
+ pe = pe->parent;
+ }
+ }
+}
+
+void process_exited_processes() {
+ struct pid_stat *init = all_pids[1];
+ struct pid_stat *p;
+
+ for(p = root_of_pids; p ; p = p->next) {
+ if(p->updated || !p->stat_collected_usec) continue;
+
+ struct pid_stat *pp = p->parent;
+
+ // find the first parent that is running
+ while(pp && !pp->updated)
+ pp = pp->parent;
+
+ unsigned long long rate;
+
+ rate = (p->utime_raw + p->cutime_raw) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cutime:NULL, (init)?&init->cutime:NULL, p, pp, 3);
+ p->cutime_raw = 0;
+ p->utime_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (1000000 * 100);
+
+ rate = (p->stime_raw + p->cstime_raw) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cstime:NULL, (init)?&init->cstime:NULL, p, pp, 4);
+ p->cstime_raw = 0;
+ p->stime_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (1000000 * 100);
+
+ rate = (p->minflt_raw + p->cminflt_raw) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cminflt:NULL, (init)?&init->cminflt:NULL, p, pp, 1);
+ p->cminflt_raw = 0;
+ p->minflt_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (1000000 * 100);
+
+ rate = (p->majflt_raw + p->cmajflt_raw) * (1000000 * 100) / (p->stat_collected_usec - p->last_stat_collected_usec);
+ remove_exited_child_from_parent(&rate, (pp)?&pp->cmajflt:NULL, (init)?&init->cmajflt:NULL, p, pp, 2);
+ p->cmajflt_raw = 0;
+ p->majflt_raw = rate * (p->stat_collected_usec - p->last_stat_collected_usec) / (1000000 * 100);
+ }
+}
+
+void link_all_processes_to_their_parents(void) {
+ struct pid_stat *p = NULL;
+
+ // link all children to their parents
+ // and update children count on parents
+ for(p = root_of_pids; p ; p = p->next) {
+ // for each process found running
+
+ if(likely(p->ppid > 0 && all_pids[p->ppid])) {
+ // valid parent processes
+
+ struct pid_stat *pp;
+
+ p->parent = pp = all_pids[p->ppid];
+ p->parent->children_count++;
+
+ if(unlikely(debug || (p->target && p->target->debug)))
+ fprintf(stderr, "apps.plugin: \tchild %d (%s, %s) on target '%s' has parent %d (%s, %s). Parent: utime=%llu, stime=%llu, minflt=%llu, majflt=%llu, cutime=%llu, cstime=%llu, cminflt=%llu, cmajflt=%llu\n", p->pid, p->comm, p->updated?"running":"exited", (p->target)?p->target->name:"UNSET", pp->pid, pp->comm, pp->updated?"running":"exited", pp->utime, pp->stime, pp->minflt, pp->majflt, pp->cutime, pp->cstime, pp->cminflt, pp->cmajflt);
+ }
+ else if(unlikely(p->ppid != 0))
+ error("pid %d %s states parent %d, but the later does not exist.", p->pid, p->comm, p->ppid);
+
+ p->sortlist = 0;
+ }
+}
+
+// ----------------------------------------------------------------------------
+
// 1. read all files in /proc
// 2. for each numeric directory:
// i. read /proc/pid/stat
@@ -1136,173 +1415,186 @@ int read_pid_file_descriptors(struct pid_stat *p) {
// to avoid filling up all disk space
// if debug is enabled, all errors are printed
-int collect_data_for_all_processes_from_proc(void)
-{
- char dirname[FILENAME_MAX + 1];
+static int compar_pid(const void *pid1, const void *pid2) {
- snprintfz(dirname, FILENAME_MAX, "%s/proc", host_prefix);
- DIR *dir = opendir(dirname);
- if(!dir) return 0;
+ struct pid_stat *p1 = all_pids[*((pid_t *)pid1)];
+ struct pid_stat *p2 = all_pids[*((pid_t *)pid2)];
- struct dirent *file = NULL;
- struct pid_stat *p = NULL;
+ if(p1->sortlist > p2->sortlist)
+ return -1;
+ else
+ return 1;
+}
- // mark them all as un-updated
- all_pids_count = 0;
- for(p = root_of_pids; p ; p = p->next) {
- all_pids_count++;
+void collect_data_for_pid(pid_t pid) {
+ if(unlikely(pid <= 0 || pid > pid_max)) {
+ error("Invalid pid %d read (expected 1 to %d). Ignoring process.", pid, pid_max);
+ return;
+ }
- p->parent = NULL;
+ struct pid_stat *p = get_pid_entry(pid);
+ if(unlikely(!p || p->read)) return;
+ p->read = 1;
- p->updated = 0;
- p->children_count = 0;
- p->merged = 0;
- p->new_entry = 0;
+ // fprintf(stderr, "Reading process %d (%s), sortlist %d\n", p->pid, p->comm, p->sortlist);
- p->last_minflt = p->minflt;
- p->last_majflt = p->majflt;
- p->last_utime = p->utime;
- p->last_stime = p->stime;
+ // --------------------------------------------------------------------
+ // /proc/<pid>/stat
- p->last_cminflt = p->cminflt;
- p->last_cmajflt = p->cmajflt;
- p->last_cutime = p->cutime;
- p->last_cstime = p->cstime;
+ if(unlikely(read_proc_pid_stat(p))) {
+ error("Cannot process %s/proc/%d/stat", host_prefix, pid);
+ // there is no reason to proceed if we cannot get its status
+ return;
+ }
- p->last_fix_cminflt = p->fix_cminflt;
- p->last_fix_cmajflt = p->fix_cmajflt;
- p->last_fix_cutime = p->fix_cutime;
- p->last_fix_cstime = p->fix_cstime;
+ read_proc_pid_ownership(p);
- p->last_io_logical_bytes_read = p->io_logical_bytes_read;
- p->last_io_logical_bytes_written = p->io_logical_bytes_written;
- p->last_io_read_calls = p->io_read_calls;
- p->last_io_write_calls = p->io_write_calls;
- p->last_io_storage_bytes_read = p->io_storage_bytes_read;
- p->last_io_storage_bytes_written = p->io_storage_bytes_written;
- p->last_io_cancelled_write_bytes = p->io_cancelled_write_bytes;
+ // check its parent pid
+ if(unlikely(p->ppid < 0 || p->ppid > pid_max)) {
+ error("Pid %d states invalid parent pid %d. Using 0.", pid, p->ppid);
+ p->ppid = 0;
}
- while((file = readdir(dir))) {
- char *endptr = file->d_name;
- pid_t pid = (pid_t) strtoul(file->d_name, &endptr, 10);
-
- // make sure we read a valid number
- if(unlikely(endptr == file->d_name || *endptr != '\0'))
- continue;
+ // --------------------------------------------------------------------
+ // /proc/<pid>/io
- if(unlikely(pid <= 0 || pid > pid_max)) {
- error("Invalid pid %d read (expected 1 to %d). Ignoring process.", pid, pid_max);
- continue;
- }
+ if(unlikely(read_proc_pid_io(p)))
+ error("Cannot process %s/proc/%d/io", host_prefix, pid);
- p = get_pid_entry(pid);
- if(unlikely(!p)) continue;
+ // --------------------------------------------------------------------
+ // /proc/<pid>/statm
+ if(unlikely(read_proc_pid_statm(p))) {
+ error("Cannot process %s/proc/%d/statm", host_prefix, pid);
+ // there is no reason to proceed if we cannot get its memory status
+ return;
+ }
- // --------------------------------------------------------------------
- // /proc/<pid>/stat
+ // --------------------------------------------------------------------
+ // link it
- if(unlikely(read_proc_pid_stat(p))) {
- error("Cannot process %s/proc/%d/stat", host_prefix, pid);
- // there is no reason to proceed if we cannot get its status
- continue;
+ // check if it is target
+ // we do this only once, the first time this pid is loaded
+ if(unlikely(p->new_entry)) {
+ // /proc/<pid>/cmdline
+ if(likely(proc_pid_cmdline_is_needed)) {
+ if(unlikely(read_proc_pid_cmdline(p)))
+ error("Cannot process %s/proc/%d/cmdline", host_prefix, pid);
}
- // check its parent pid
- if(unlikely(p->ppid < 0 || p->ppid > pid_max)) {
- error("Pid %d states invalid parent pid %d. Using 0.", pid, p->ppid);
- p->ppid = 0;
- }
+ if(unlikely(debug))
+ fprintf(stderr, "apps.plugin: \tJust added %d (%s)\n", pid, p->comm);
+
+ uint32_t hash = simple_hash(p->comm);
+ size_t pclen = strlen(p->comm);
+
+ struct target *w;
+ for(w = apps_groups_root_target; w ; w = w->next) {
+ // if(debug || (p->target && p->target->debug)) fprintf(stderr, "apps.plugin: \t\tcomparing '%s' with '%s'\n", w->compare, p->comm);
+
+ // find it - 4 cases:
+ // 1. the target is not a pattern
+ // 2. the target has the prefix
+ // 3. the target has the suffix
+ // 4. the target is something inside cmdline
+ if( (!w->starts_with && !w->ends_with && w->comparehash == hash && !strcmp(w->compare, p->comm))
+ || (w->starts_with && !w->ends_with && !strncmp(w->compare, p->comm, w->comparelen))
+ || (!w->starts_with && w->ends_with && pclen >= w->comparelen && !strcmp(w->compare, &p->comm[pclen - w->comparelen]))
+ || (proc_pid_cmdline_is_needed && w->starts_with && w->ends_with && strstr(p->cmdline, w->compare))
+ ) {
+ if(w->target) p->target = w->target;
+ else p->target = w;
- // --------------------------------------------------------------------
- // /proc/<pid>/statm
+ if(debug || (p->target && p->target->debug))
+ fprintf(stderr, "apps.plugin: \t\t%s linked to target %s\n", p->comm, p->target->name);
- if(unlikely(read_proc_pid_statm(p))) {
- error("Cannot process %s/proc/%d/statm", host_prefix, pid);
- // there is no reason to proceed if we cannot get its memory status
- continue;
+ break;
+ }
}
+ }
+ // --------------------------------------------------------------------
+ // /proc/<pid>/fd
- // --------------------------------------------------------------------
- // /proc/<pid>/io
+ if(unlikely(read_pid_file_descriptors(p))) {
+ error("Cannot process entries in %s/proc/%d/fd", host_prefix, pid);
+ }
- if(unlikely(read_proc_pid_io(p))) {
- error("Cannot process %s/proc/%d/io", host_prefix, pid);
+ // --------------------------------------------------------------------
+ // done!
- // on systems without /proc/X/io
- // allow proceeding without I/O information
- // continue;
- }
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(unlikely(all_pids_count && p->ppid && all_pids[p->ppid] && !all_pids[p->ppid]->read))
+ fprintf(stderr, "Read process %d (%s) sortlisted %d, but its parent %d (%s) sortlisted %d, is not read\n", p->pid, p->comm, p->sortlist, all_pids[p->ppid]->pid, all_pids[p->ppid]->comm, all_pids[p->ppid]->sortlist);
+#endif
- // --------------------------------------------------------------------
- // <pid> ownership
+ // mark it as updated
+ p->updated = 1;
+ p->keep = 0;
+ p->keeploops = 0;
+}
+
+int collect_data_for_all_processes_from_proc(void) {
+ struct pid_stat *p = NULL;
+
+ if(all_pids_count) {
+ // read parents before childs
+ // this is needed to prevent a situation where
+ // a child is found running, but until we read
+ // its parent, it has exited and its parent
+ // has accumulated its resources
+
+ long slc = 0;
+ for(p = root_of_pids; p ; p = p->next) {
+ p->read = 0;
+ p->updated = 0;
+ p->new_entry = 0;
+ p->merged = 0;
+ p->children_count = 0;
+ p->parent = NULL;
- if(unlikely(read_proc_pid_ownership(p))) {
- error("Cannot stat %s/proc/%d", host_prefix, pid);
+#ifdef NETDATA_INTERNAL_CHECKS
+ if(unlikely(slc >= all_pids_count))
+ error("Internal error: I was thinking I had %ld processes in my arrays, but it seems there a