summaryrefslogtreecommitdiffstats
path: root/collectors/plugins.d
diff options
context:
space:
mode:
authoremmrk <37795797+emmrk@users.noreply.github.com>2019-07-09 12:03:04 +0300
committerChris Akritidis <43294513+cakrit@users.noreply.github.com>2019-07-09 11:03:04 +0200
commit34642e336532a711cfa163d1ce055f006a6f83a8 (patch)
tree906dd07964d88900aa9c5ab8609317b2248c11a0 /collectors/plugins.d
parentc7ab028f787f1c3f1325f6195ea0cb2afc95ab95 (diff)
Fix duplicate kills of collection threads on shutdown (#6387)
Disables collection threads killed with SIGTERM to stop the cleanup function from attempting to kill them again.
Diffstat (limited to 'collectors/plugins.d')
-rw-r--r--collectors/plugins.d/plugins_d.c110
1 files changed, 69 insertions, 41 deletions
diff --git a/collectors/plugins.d/plugins_d.c b/collectors/plugins.d/plugins_d.c
index 66ec5d0ea6..d333ae2413 100644
--- a/collectors/plugins.d/plugins_d.c
+++ b/collectors/plugins.d/plugins_d.c
@@ -526,6 +526,70 @@ static void pluginsd_worker_thread_cleanup(void *arg) {
}
}
+#define SERIAL_FAILURES_THRESHOLD 10
+static void pluginsd_worker_thread_handle_success(struct plugind *cd) {
+ if (likely(cd->successful_collections)) {
+ sleep((unsigned int) cd->update_every);
+ return;
+ }
+
+ if(likely(cd->serial_failures <= SERIAL_FAILURES_THRESHOLD)) {
+ info("'%s' (pid %d) does not generate useful output but it reports success (exits with 0). %s.",
+ cd->fullfilename, cd->pid,
+ cd->enabled ?
+ "Waiting a bit before starting it again." :
+ "Will not start it again - it is now disabled.");
+ sleep((unsigned int) (cd->update_every * 10));
+ return;
+ }
+
+ if (cd->serial_failures > SERIAL_FAILURES_THRESHOLD) {
+ error("'%s' (pid %d) does not generate useful output, although it reports success (exits with 0)."
+ "We have tried to collect something %zu times - unsuccessfully. Disabling it.",
+ cd->fullfilename, cd->pid, cd->serial_failures);
+ cd->enabled = 0;
+ return;
+ }
+
+ return;
+}
+
+static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_ret_code) {
+ if (worker_ret_code == -1) {
+ info("'%s' (pid %d) was killed with SIGTERM. Disabling it.", cd->fullfilename, cd->pid);
+ cd->enabled = 0;
+ return;
+ }
+
+ if (!cd->successful_collections) {
+ error("'%s' (pid %d) exited with error code %d and haven't collected any data. Disabling it.",
+ cd->fullfilename, cd->pid, worker_ret_code);
+ cd->enabled = 0;
+ return;
+ }
+
+ if (cd->serial_failures <= SERIAL_FAILURES_THRESHOLD) {
+ error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). %s",
+ cd->fullfilename, cd->pid, worker_ret_code, cd->successful_collections,
+ cd->enabled ?
+ "Waiting a bit before starting it again." :
+ "Will not start it again - it is disabled.");
+ sleep((unsigned int) (cd->update_every * 10));
+ return;
+ }
+
+ if (cd->serial_failures > SERIAL_FAILURES_THRESHOLD) {
+ error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times)."
+ "We tried to restart it %zu times, but it failed to generate data. Disabling it.",
+ cd->fullfilename, cd->pid, worker_ret_code, cd->successful_collections, cd->serial_failures);
+ cd->enabled = 0;
+ return;
+ }
+
+ return;
+}
+#undef SERIAL_FAILURES_THRESHOLD
+
void *pluginsd_worker_thread(void *arg) {
netdata_thread_cleanup_push(pluginsd_worker_thread_cleanup, arg);
@@ -546,50 +610,14 @@ void *pluginsd_worker_thread(void *arg) {
error("'%s' (pid %d) disconnected after %zu successful data collections (ENDs).", cd->fullfilename, cd->pid, count);
killpid(cd->pid, SIGTERM);
- // get the return code
- int code = mypclose(fp, cd->pid);
+ int worker_ret_code = mypclose(fp, cd->pid);
- if(code != 0) {
- // the plugin reports failure
-
- if(likely(!cd->successful_collections)) {
- // nothing collected - disable it
- error("'%s' (pid %d) exited with error code %d. Disabling it.", cd->fullfilename, cd->pid, code);
- cd->enabled = 0;
- }
- else {
- // we have collected something
+ if (likely(worker_ret_code == 0))
+ pluginsd_worker_thread_handle_success(cd);
+ else
+ pluginsd_worker_thread_handle_error(cd, worker_ret_code);
- if(likely(cd->serial_failures <= 10)) {
- error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). %s", cd->fullfilename, cd->pid, code, cd->successful_collections, cd->enabled?"Waiting a bit before starting it again.":"Will not start it again - it is disabled.");
- sleep((unsigned int) (cd->update_every * 10));
- }
- else {
- error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). We tried %zu times to restart it, but it failed to generate data. Disabling it.", cd->fullfilename, cd->pid, code, cd->successful_collections, cd->serial_failures);
- cd->enabled = 0;
- }
- }
- }
- else {
- // the plugin reports success
-
- if(unlikely(!cd->successful_collections)) {
- // we have collected nothing so far
-
- if(likely(cd->serial_failures <= 10)) {
- error("'%s' (pid %d) does not generate useful output but it reports success (exits with 0). %s.", cd->fullfilename, cd->pid, cd->enabled?"Waiting a bit before starting it again.":"Will not start it again - it is now disabled.");
- sleep((unsigned int) (cd->update_every * 10));
- }
- else {
- error("'%s' (pid %d) does not generate useful output, although it reports success (exits with 0), but we have tried %zu times to collect something. Disabling it.", cd->fullfilename, cd->pid, cd->serial_failures);
- cd->enabled = 0;
- }
- }
- else
- sleep((unsigned int) cd->update_every);
- }
cd->pid = 0;
-
if(unlikely(!cd->enabled)) break;
}