summary refs log tree commit diff stats
path: root/health
diff options
context:
space:
mode:
author Costa Tsaousis <costa@netdata.cloud> 2023-01-23 22:18:44 +0200
committer GitHub <noreply@github.com> 2023-01-23 22:18:44 +0200
commit dd0f7ae992a8de282c77dc7745c5090e5d65cc28 (patch)
tree fecf5514eda33c0a96f4d359f30fd07229d12cf7 /health
parent c2c3876c519fbc22a60a5d8b753dc6d8e81e0fed (diff)
DBENGINE v2 - improvements part 7 (#14307)
* run cleanup in workers * when there is a discrepancy between update every, fix it * fix the other occurrences of metric update every mismatch * allow resetting the same timestamp * validate flushed pages before committing them to disk * initialize collection with the latest time in mrg * these should be static functions * acquire metrics for writing to detect multiple data collections of the same metric * print the uuid of the metric that is collected twice * log the discrepancies of completed pages * 1 second tolerance * unify validation of pages and related logging across dbengine * make do_flush_pages() thread safe * flush pages runs on libuv workers * added uv events to tp workers * don't cross datafile spinlock and rwlock * should be unlock * prevent the creation of multiple datafiles * break an infinite replication loop * do not log the expansion of the replication window due to start streaming * log all invalid pages with internal checks * do not shutdown event loop threads * add information about collected page events, to find the root cause of invalid collected pages * rewrite of the gap filling to fix the invalid collected pages problem * handle multiple collections of the same metric gracefully * added log about main cache page conflicts; fix gap filling once again... 
* keep track of the first metric writer * it should be an internal fatal - it does not harm users * do not check for future timestamps on collected pages, since we inherit the clock of the children; do not check collected pages validity without internal checks * prevent negative replication completion percentage * internal error for the discrepancy of mrg * better logging of dbengine new metrics collection * without internal checks it is unused * prevent pluginsd crash on exit due to calling pthread_cancel() on an exited thread * renames and atomics everywhere * if a datafile cannot be acquired for deletion during shutdown, continue - this can happen when there are hot pages in open cache referencing it * Debug for context load * rrdcontext uuid debug * rrddim uuid debug * rrdeng uuid debug * Revert "rrdeng uuid debug" This reverts commit 393da190826a582e7e6cc90771bf91b175826d8b. * Revert "rrddim uuid debug" This reverts commit 72150b30408294f141b19afcfb35abd7c34777d8. * Revert "rrdcontext uuid debug" This reverts commit 2c3b940dc23f460226e9b2a6861c214e840044d0. * Revert "Debug for context load" This reverts commit 0d880fc1589f128524e0b47abd9ff0714283ce3b. * do not use legacy uuids on multihost dbs * thread safety for journalfile size * handle other cases of inconsistent collected pages * make health thread check if it should be running in key loops * do not log uuids Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'health')
-rw-r--r-- health/health.c 22
1 files changed, 22 insertions, 0 deletions
diff --git a/health/health.c b/health/health.c
index 947ef8644d..d7368028f5 100644
--- a/health/health.c
+++ b/health/health.c
@@ -1058,6 +1058,9 @@ void *health_main(void *ptr) {
rrdhost_foreach_read(host) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
if (unlikely(!host->health.health_enabled))
continue;
@@ -1107,6 +1110,9 @@ void *health_main(void *ptr) {
// the first loop is to lookup values from the db
foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
rrdcalc_update_info_using_rrdset_labels(rc);
if (update_disabled_silenced(host, rc))
@@ -1251,6 +1257,9 @@ void *health_main(void *ptr) {
if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
continue;
@@ -1431,6 +1440,9 @@ void *health_main(void *ptr) {
// process repeating alarms
foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
@@ -1514,6 +1526,9 @@ void *health_main(void *ptr) {
// wait for all notifications to finish before allowing health to be cleaned up
ALARM_ENTRY *ae;
while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
health_alarm_wait_for_execution(ae);
}
break;
@@ -1525,14 +1540,21 @@ void *health_main(void *ptr) {
// wait for all notifications to finish before allowing health to be cleaned up
ALARM_ENTRY *ae;
while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
health_alarm_wait_for_execution(ae);
}
#ifdef ENABLE_ACLK
if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
rrdhost_foreach_read(host) {
+ if(unlikely(!service_running(SERVICE_HEALTH)))
+ break;
+
if (unlikely(!host->health.health_enabled))
continue;
+
sql_queue_removed_alerts_to_aclk(host);
}
aclk_alert_reloaded = 0;