From dd0f7ae992a8de282c77dc7745c5090e5d65cc28 Mon Sep 17 00:00:00 2001
From: Costa Tsaousis
Date: Mon, 23 Jan 2023 22:18:44 +0200
Subject: DBENGINE v2 - improvements part 7 (#14307)

* run cleanup in workers
* when there is a discrepancy between update every, fix it
* fix the other occurrences of metric update every mismatch
* allow resetting the same timestamp
* validate flushed pages before committing them to disk
* initialize collection with the latest time in mrg
* these should be static functions
* acquire metrics for writing to detect multiple data collections of the same metric
* print the uuid of the metric that is collected twice
* log the discrepancies of completed pages
* 1 second tolerance
* unify validation of pages and related logging across dbengine
* make do_flush_pages() thread safe
* flush pages runs on libuv workers
* added uv events to tp workers
* don't cross datafile spinlock and rwlock
* should be unlock
* prevent the creation of multiple datafiles
* break an infinite replication loop
* do not log the expansion of the replication window due to start streaming
* log all invalid pages with internal checks
* do not shut down event loop threads
* add information about collected page events, to find the root cause of invalid collected pages
* rewrite of the gap filling to fix the invalid collected pages problem
* handle multiple collections of the same metric gracefully
* added log about main cache page conflicts; fix gap filling once again...
* keep track of the first metric writer
* it should be an internal fatal - it does not harm users
* do not check for future timestamps on collected pages, since we inherit the clock of the children; do not check collected pages validity without internal checks
* prevent negative replication completion percentage
* internal error for the discrepancy of mrg
* better logging of dbengine new metrics collection
* without internal checks it is unused
* prevent pluginsd crash on exit due to calling pthread_cancel() on an exited thread
* renames and atomics everywhere
* if a datafile cannot be acquired for deletion during shutdown, continue - this can happen when there are hot pages in open cache referencing it
* Debug for context load
* rrdcontext uuid debug
* rrddim uuid debug
* rrdeng uuid debug
* Revert "rrdeng uuid debug"
  This reverts commit 393da190826a582e7e6cc90771bf91b175826d8b.
* Revert "rrddim uuid debug"
  This reverts commit 72150b30408294f141b19afcfb35abd7c34777d8.
* Revert "rrdcontext uuid debug"
  This reverts commit 2c3b940dc23f460226e9b2a6861c214e840044d0.
* Revert "Debug for context load"
  This reverts commit 0d880fc1589f128524e0b47abd9ff0714283ce3b.
* do not use legacy uuids on multihost dbs
* thread safety for journalfile size
* handle other cases of inconsistent collected pages
* make health thread check if it should be running in key loops
* do not log uuids

Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
---
 health/health.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'health')

diff --git a/health/health.c b/health/health.c
index 947ef8644d..d7368028f5 100644
--- a/health/health.c
+++ b/health/health.c
@@ -1058,6 +1058,9 @@ void *health_main(void *ptr) {
 
         rrdhost_foreach_read(host) {
 
+            if(unlikely(!service_running(SERVICE_HEALTH)))
+                break;
+
             if (unlikely(!host->health.health_enabled))
                 continue;
 
@@ -1107,6 +1110,9 @@ void *health_main(void *ptr) {
 
             // the first loop is to lookup values from the db
             foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 rrdcalc_update_info_using_rrdset_labels(rc);
 
                 if (update_disabled_silenced(host, rc))
@@ -1251,6 +1257,9 @@ void *health_main(void *ptr) {
 
             if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
                 foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
                         continue;
 
@@ -1431,6 +1440,9 @@ void *health_main(void *ptr) {
 
             // process repeating alarms
             foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 int repeat_every = 0;
                 if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
                     if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
@@ -1514,6 +1526,9 @@ void *health_main(void *ptr) {
                 // wait for all notifications to finish before allowing health to be cleaned up
                 ALARM_ENTRY *ae;
                 while (NULL != (ae = alarm_notifications_in_progress.head)) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     health_alarm_wait_for_execution(ae);
                 }
                 break;
@@ -1525,14 +1540,21 @@ void *health_main(void *ptr) {
             // wait for all notifications to finish before allowing health to be cleaned up
             ALARM_ENTRY *ae;
             while (NULL != (ae = alarm_notifications_in_progress.head)) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 health_alarm_wait_for_execution(ae);
             }
 
 #ifdef ENABLE_ACLK
             if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
                 rrdhost_foreach_read(host) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     if (unlikely(!host->health.health_enabled))
                         continue;
+
                     sql_queue_removed_alerts_to_aclk(host);
                 }
                 aclk_alert_reloaded = 0;
--
cgit v1.2.3
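
A note on the health part of this commit: the whole health/health.c diff applies one pattern. Every long-running loop in health_main() re-checks service_running(SERVICE_HEALTH) and breaks out as soon as the health service is asked to stop, so agent shutdown no longer has to wait for a full pass over all hosts, alert calculations, or pending notifications. Below is a minimal standalone sketch of that early-exit pattern; the atomic service_should_run flag and health_like_worker() are hypothetical stand-ins for Netdata's service_running(SERVICE_HEALTH) machinery, not the actual agent code.

// Standalone sketch of the early-exit pattern (hypothetical names, not Netdata code).
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool service_should_run = true;   // cleared when shutdown is requested

static void *health_like_worker(void *arg) {
    (void)arg;

    while (atomic_load(&service_should_run)) {
        // one evaluation cycle, e.g. walking all hosts/alerts
        for (int host = 0; host < 100000; host++) {
            // re-check the flag inside the long inner loop, so shutdown
            // does not have to wait for the whole pass to complete
            if (!atomic_load(&service_should_run))
                break;

            // ... per-host work would go here ...
        }

        usleep(100 * 1000);                      // sleep until the next cycle
    }

    printf("worker: stopping promptly on shutdown request\n");
    return NULL;
}

int main(void) {
    pthread_t th;
    pthread_create(&th, NULL, health_like_worker, NULL);

    sleep(1);                                    // let the worker run for a while
    atomic_store(&service_should_run, false);    // request shutdown
    pthread_join(th, NULL);                      // joins quickly thanks to the in-loop checks
    return 0;
}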