From dd0f7ae992a8de282c77dc7745c5090e5d65cc28 Mon Sep 17 00:00:00 2001
From: Costa Tsaousis
Date: Mon, 23 Jan 2023 22:18:44 +0200
Subject: DBENGINE v2 - improvements part 7 (#14307)

* run cleanup in workers
* when there is a discrepancy between update every, fix it
* fix the other occurrences of metric update every mismatch
* allow resetting the same timestamp
* validate flushed pages before committing them to disk
* initialize collection with the latest time in mrg
* these should be static functions
* acquire metrics for writing to detect multiple data collections of the same metric
* print the uuid of the metric that is collected twice
* log the discrepancies of completed pages
* 1 second tolerance
* unify validation of pages and related logging across dbengine
* make do_flush_pages() thread safe
* flush pages runs on libuv workers
* added uv events to tp workers
* don't cross datafile spinlock and rwlock
* should be unlock
* prevent the creation of multiple datafiles
* break an infinite replication loop
* do not log the expansion of the replication window due to start streaming
* log all invalid pages with internal checks
* do not shut down event loop threads
* add information about collected page events, to find the root cause of invalid collected pages
* rewrite of the gap filling to fix the invalid collected pages problem
* handle multiple collections of the same metric gracefully
* added log about main cache page conflicts; fix gap filling once again...
* keep track of the first metric writer
* it should be an internal fatal - it does not harm users
* do not check for future timestamps on collected pages, since we inherit the clock of the children; do not check collected pages validity without internal checks
* prevent negative replication completion percentage
* internal error for the discrepancy of mrg
* better logging of dbengine new metrics collection
* without internal checks it is unused
* prevent pluginsd crash on exit due to calling pthread_cancel() on an exited thread
* renames and atomics everywhere
* if a datafile cannot be acquired for deletion during shutdown, continue - this can happen when there are hot pages in open cache referencing it
* Debug for context load
* rrdcontext uuid debug
* rrddim uuid debug
* rrdeng uuid debug
* Revert "rrdeng uuid debug"
  This reverts commit 393da190826a582e7e6cc90771bf91b175826d8b.
* Revert "rrddim uuid debug"
  This reverts commit 72150b30408294f141b19afcfb35abd7c34777d8.
* Revert "rrdcontext uuid debug"
  This reverts commit 2c3b940dc23f460226e9b2a6861c214e840044d0.
* Revert "Debug for context load"
  This reverts commit 0d880fc1589f128524e0b47abd9ff0714283ce3b.
* do not use legacy uuids on multihost dbs
* thread safety for journalfile size
* handle other cases of inconsistent collected pages
* make health thread check if it should be running in key loops
* do not log uuids

Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
---
 health/health.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'health')

diff --git a/health/health.c b/health/health.c
index 947ef8644d..d7368028f5 100644
--- a/health/health.c
+++ b/health/health.c
@@ -1058,6 +1058,9 @@ void *health_main(void *ptr) {
 
         rrdhost_foreach_read(host) {
 
+            if(unlikely(!service_running(SERVICE_HEALTH)))
+                break;
+
             if (unlikely(!host->health.health_enabled))
                 continue;
 
@@ -1107,6 +1110,9 @@ void *health_main(void *ptr) {
 
             // the first loop is to lookup values from the db
             foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 rrdcalc_update_info_using_rrdset_labels(rc);
 
                 if (update_disabled_silenced(host, rc))
@@ -1251,6 +1257,9 @@ void *health_main(void *ptr) {
 
             if (unlikely(runnable && service_running(SERVICE_HEALTH))) {
                 foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
                         continue;
 
@@ -1431,6 +1440,9 @@ void *health_main(void *ptr) {
 
             // process repeating alarms
             foreach_rrdcalc_in_rrdhost_read(host, rc) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 int repeat_every = 0;
                 if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
                     if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
@@ -1514,6 +1526,9 @@ void *health_main(void *ptr) {
                 // wait for all notifications to finish before allowing health to be cleaned up
                 ALARM_ENTRY *ae;
                 while (NULL != (ae = alarm_notifications_in_progress.head)) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     health_alarm_wait_for_execution(ae);
                 }
                 break;
@@ -1525,14 +1540,21 @@ void *health_main(void *ptr) {
             // wait for all notifications to finish before allowing health to be cleaned up
             ALARM_ENTRY *ae;
             while (NULL != (ae = alarm_notifications_in_progress.head)) {
+                if(unlikely(!service_running(SERVICE_HEALTH)))
+                    break;
+
                 health_alarm_wait_for_execution(ae);
             }
 
 #ifdef ENABLE_ACLK
             if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
                 rrdhost_foreach_read(host) {
+                    if(unlikely(!service_running(SERVICE_HEALTH)))
+                        break;
+
                     if (unlikely(!host->health.health_enabled))
                         continue;
+
                     sql_queue_removed_alerts_to_aclk(host);
                 }
                 aclk_alert_reloaded = 0;
--
cgit v1.2.3
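
A note on the health part of this commit: the whole health/health.c diff applies one pattern. Every long-running loop in health_main() re-checks service_running(SERVICE_HEALTH) and breaks out as soon as the health service is asked to stop, so agent shutdown no longer has to wait for a full pass over all hosts, alert calculations, or pending notifications. Below is a minimal standalone sketch of that early-exit pattern; the atomic service_should_run flag and health_like_worker() are hypothetical stand-ins for Netdata's service_running(SERVICE_HEALTH) machinery, not the actual agent code.

// Standalone sketch of the early-exit pattern (hypothetical names, not Netdata code).
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool service_should_run = true;   // cleared when shutdown is requested

static void *health_like_worker(void *arg) {
    (void)arg;

    while (atomic_load(&service_should_run)) {
        // one evaluation cycle, e.g. walking all hosts/alerts
        for (int host = 0; host < 100000; host++) {
            // re-check the flag inside the long inner loop, so shutdown
            // does not have to wait for the whole pass to complete
            if (!atomic_load(&service_should_run))
                break;

            // ... per-host work would go here ...
        }

        usleep(100 * 1000);                      // sleep until the next cycle
    }

    printf("worker: stopping promptly on shutdown request\n");
    return NULL;
}

int main(void) {
    pthread_t th;
    pthread_create(&th, NULL, health_like_worker, NULL);

    sleep(1);                                    // let the worker run for a while
    atomic_store(&service_should_run, false);    // request shutdown
    pthread_join(th, NULL);                      // joins quickly thanks to the in-loop checks
    return 0;
}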