summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorvkalintiris <vasilis@netdata.cloud>2024-02-27 20:10:54 +0200
committerGitHub <noreply@github.com>2024-02-27 20:10:54 +0200
commit700d77b5b9939671e77b789d07ed35419f7db369 (patch)
treebfc2727f3350740f301a65f3d201ecd8a0c4ad0e
parenta9ca70c20e12046d9af6fa0bff916dd616a90e53 (diff)
Abort the agent if a single shutdown step takes more than 60 seconds. (#17060)
* Add timed-wait for completion. * Abort if any shutdown step takes more than 60 seconds to complete. * Timeout only on sentry builds.
-rw-r--r--src/daemon/watcher.c42
-rw-r--r--src/libnetdata/completion/completion.c35
-rw-r--r--src/libnetdata/completion/completion.h4
3 files changed, 67 insertions, 14 deletions
diff --git a/src/daemon/watcher.c b/src/daemon/watcher.c
index 2a3bcf544b..70f71f8e43 100644
--- a/src/daemon/watcher.c
+++ b/src/daemon/watcher.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-3.0-or-later
-#include "daemon/common.h"
#include "watcher.h"
watcher_step_t *watcher_steps;
@@ -30,24 +29,39 @@ void *watcher_main(void *arg)
completion_wait_for(&shutdown_begin_completion);
usec_t shutdown_start_time = now_monotonic_usec();
- // TODO:
- // - add a version of completion_wait_for with timeout
- // - check the step's duration and abort when the timeout has expired.
+ netdata_log_error("Shutdown process started");
+
+ unsigned timeout = 60;
+
for (int step_id = 0; step_id != WATCHER_STEP_ID_MAX; step_id++) {
usec_t step_start_time = now_monotonic_usec();
+
+#ifdef ENABLE_SENTRY
+ // Wait with a timeout
+ bool ok = completion_timedwait_for(&watcher_steps[step_id].p, timeout);
+#else
+ // Wait indefinitely
+ bool ok = true;
completion_wait_for(&watcher_steps[step_id].p);
- usec_t step_end_time = now_monotonic_usec();
-
- usec_t step_duration = step_end_time - step_start_time;
- netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds",
- step_id + 1,
- WATCHER_STEP_ID_MAX,
- watcher_steps[step_id].msg,
- step_duration / USEC_PER_MS);
+#endif
+
+ usec_t step_duration = now_monotonic_usec() - step_start_time;
+
+ if (ok) {
+ netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds",
+ step_id + 1, WATCHER_STEP_ID_MAX,
+ watcher_steps[step_id].msg, step_duration / USEC_PER_MS);
+ } else {
+ // Do not call fatal() because it will try to execute the exit
+ // sequence twice.
+ netdata_log_error("shutdown step: [%d/%d] - '%s' took more than %u seconds (ie. %llu milliseconds)",
+ step_id + 1, WATCHER_STEP_ID_MAX, watcher_steps[step_id].msg,
+ timeout, step_duration / USEC_PER_MS);
+
+ abort();
+ }
}
- netdata_log_error("Shutdown process started");
-
completion_wait_for(&shutdown_end_completion);
usec_t shutdown_end_time = now_monotonic_usec();
diff --git a/src/libnetdata/completion/completion.c b/src/libnetdata/completion/completion.c
index 6257e02998..113423835a 100644
--- a/src/libnetdata/completion/completion.c
+++ b/src/libnetdata/completion/completion.c
@@ -26,6 +26,41 @@ void completion_wait_for(struct completion *p)
uv_mutex_unlock(&p->mutex);
}
+// Block until `p` is marked complete, giving up after `timeout` seconds.
+//
+// Returns true if the completion was observed as completed while holding
+// its mutex, false if the timeout expired first.
+bool completion_timedwait_for(struct completion *p, uint64_t timeout)
+{
+    // uv_cond_timedwait() takes a relative timeout in nanoseconds.
+    const uint64_t timeout_ns = timeout * NSEC_PER_SEC;
+    const uint64_t start_time = uv_hrtime();
+
+    uv_mutex_lock(&p->mutex);
+    while (!p->completed) {
+        // Always measure against the original start time so the total
+        // wait never exceeds the requested timeout, no matter how many
+        // times we are woken up.
+        uint64_t elapsed = uv_hrtime() - start_time;
+        if (elapsed >= timeout_ns)
+            break;
+
+        // The return value is intentionally ignored: a return of 0 may be
+        // a spurious wakeup, and a timeout may race with the completion
+        // being marked. In both cases the loop condition and the elapsed
+        // time check above decide what happens next.
+        (void) uv_cond_timedwait(&p->cond, &p->mutex, timeout_ns - elapsed);
+    }
+
+    // The predicate, not the wait's return code, is the source of truth.
+    bool result = (p->completed != 0);
+    uv_mutex_unlock(&p->mutex);
+
+    return result;
+}
+
void completion_mark_complete(struct completion *p)
{
uv_mutex_lock(&p->mutex);
diff --git a/src/libnetdata/completion/completion.h b/src/libnetdata/completion/completion.h
index 723f736889..908ccfaf64 100644
--- a/src/libnetdata/completion/completion.h
+++ b/src/libnetdata/completion/completion.h
@@ -18,6 +18,10 @@ void completion_destroy(struct completion *p);
void completion_wait_for(struct completion *p);
+// Wait for at most `timeout` seconds. Return true on success, false on
+// error or timeout.
+bool completion_timedwait_for(struct completion *p, uint64_t timeout);
+
void completion_mark_complete(struct completion *p);
unsigned completion_wait_for_a_job(struct completion *p, unsigned completed_jobs);