From 34642e336532a711cfa163d1ce055f006a6f83a8 Mon Sep 17 00:00:00 2001 From: emmrk <37795797+emmrk@users.noreply.github.com> Date: Tue, 9 Jul 2019 12:03:04 +0300 Subject: Fix duplicate kills of collection threads on shutdown (#6387) Disables collection threads killed with SIGTERM to stop the cleanup function from attempting to kill them again. --- collectors/plugins.d/plugins_d.c | 110 ++++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 41 deletions(-) (limited to 'collectors/plugins.d') diff --git a/collectors/plugins.d/plugins_d.c b/collectors/plugins.d/plugins_d.c index 66ec5d0ea6..d333ae2413 100644 --- a/collectors/plugins.d/plugins_d.c +++ b/collectors/plugins.d/plugins_d.c @@ -526,6 +526,70 @@ static void pluginsd_worker_thread_cleanup(void *arg) { } } +#define SERIAL_FAILURES_THRESHOLD 10 +static void pluginsd_worker_thread_handle_success(struct plugind *cd) { + if (likely(cd->successful_collections)) { + sleep((unsigned int) cd->update_every); + return; + } + + if(likely(cd->serial_failures <= SERIAL_FAILURES_THRESHOLD)) { + info("'%s' (pid %d) does not generate useful output but it reports success (exits with 0). %s.", + cd->fullfilename, cd->pid, + cd->enabled ? + "Waiting a bit before starting it again." : + "Will not start it again - it is now disabled."); + sleep((unsigned int) (cd->update_every * 10)); + return; + } + + if (cd->serial_failures > SERIAL_FAILURES_THRESHOLD) { + error("'%s' (pid %d) does not generate useful output, although it reports success (exits with 0)." + "We have tried to collect something %zu times - unsuccessfully. Disabling it.", + cd->fullfilename, cd->pid, cd->serial_failures); + cd->enabled = 0; + return; + } + + return; +} + +static void pluginsd_worker_thread_handle_error(struct plugind *cd, int worker_ret_code) { + if (worker_ret_code == -1) { + info("'%s' (pid %d) was killed with SIGTERM. Disabling it.", cd->fullfilename, cd->pid); + cd->enabled = 0; + return; + } + + if (!cd->successful_collections) { + error("'%s' (pid %d) exited with error code %d and haven't collected any data. Disabling it.", + cd->fullfilename, cd->pid, worker_ret_code); + cd->enabled = 0; + return; + } + + if (cd->serial_failures <= SERIAL_FAILURES_THRESHOLD) { + error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). %s", + cd->fullfilename, cd->pid, worker_ret_code, cd->successful_collections, + cd->enabled ? + "Waiting a bit before starting it again." : + "Will not start it again - it is disabled."); + sleep((unsigned int) (cd->update_every * 10)); + return; + } + + if (cd->serial_failures > SERIAL_FAILURES_THRESHOLD) { + error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times)." + "We tried to restart it %zu times, but it failed to generate data. Disabling it.", + cd->fullfilename, cd->pid, worker_ret_code, cd->successful_collections, cd->serial_failures); + cd->enabled = 0; + return; + } + + return; +} +#undef SERIAL_FAILURES_THRESHOLD + void *pluginsd_worker_thread(void *arg) { netdata_thread_cleanup_push(pluginsd_worker_thread_cleanup, arg); @@ -546,50 +610,14 @@ void *pluginsd_worker_thread(void *arg) { error("'%s' (pid %d) disconnected after %zu successful data collections (ENDs).", cd->fullfilename, cd->pid, count); killpid(cd->pid, SIGTERM); - // get the return code - int code = mypclose(fp, cd->pid); + int worker_ret_code = mypclose(fp, cd->pid); - if(code != 0) { - // the plugin reports failure - - if(likely(!cd->successful_collections)) { - // nothing collected - disable it - error("'%s' (pid %d) exited with error code %d. Disabling it.", cd->fullfilename, cd->pid, code); - cd->enabled = 0; - } - else { - // we have collected something + if (likely(worker_ret_code == 0)) + pluginsd_worker_thread_handle_success(cd); + else + pluginsd_worker_thread_handle_error(cd, worker_ret_code); - if(likely(cd->serial_failures <= 10)) { - error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). %s", cd->fullfilename, cd->pid, code, cd->successful_collections, cd->enabled?"Waiting a bit before starting it again.":"Will not start it again - it is disabled."); - sleep((unsigned int) (cd->update_every * 10)); - } - else { - error("'%s' (pid %d) exited with error code %d, but has given useful output in the past (%zu times). We tried %zu times to restart it, but it failed to generate data. Disabling it.", cd->fullfilename, cd->pid, code, cd->successful_collections, cd->serial_failures); - cd->enabled = 0; - } - } - } - else { - // the plugin reports success - - if(unlikely(!cd->successful_collections)) { - // we have collected nothing so far - - if(likely(cd->serial_failures <= 10)) { - error("'%s' (pid %d) does not generate useful output but it reports success (exits with 0). %s.", cd->fullfilename, cd->pid, cd->enabled?"Waiting a bit before starting it again.":"Will not start it again - it is now disabled."); - sleep((unsigned int) (cd->update_every * 10)); - } - else { - error("'%s' (pid %d) does not generate useful output, although it reports success (exits with 0), but we have tried %zu times to collect something. Disabling it.", cd->fullfilename, cd->pid, cd->serial_failures); - cd->enabled = 0; - } - } - else - sleep((unsigned int) cd->update_every); - } cd->pid = 0; - if(unlikely(!cd->enabled)) break; } -- cgit v1.2.3