DBENGINE v2 - improvements part 10 (#14332)

* replication cancels pending queries on exit * log when waiting for inflight queries * when there are collected and not-collected metrics, use the context priority from the collected only * Write metadata with a faster pace * Remove journal file size limit and sync mode to 0 / Drop wal checkpoint for now * Wrap in a big transaction remaining metadata writes (test 1) * fix higher tiers when tiering iterations = 2 * dbengine always returns db-aligned points; query engine expands the queries by 2 points in every direction to have enough data for interpolation * Wrap in a big transaction metadata writes (test 2) * replication cancelling fix * do not first and last entry in replication when the db has no retention * fix internal check condition * Increase metadata write batch size * always apply error limit to dbengine logs * Remove code that processes the obsolete health.db files * cleanup in query.c * do not allow queries to go beyond db boundaries * prevent internal log for +1 delta in timestamp * detect gap pages in conflicts * double protection for gap injection in main cache * Add checkpoint to prevent large WAL while running * Remove unused and duplicate functions * do not allocate chart cache dir if not needed * add more info to unittests * revert query expansion to satisfy unittests Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
author: Costa Tsaousis <costa@netdata.cloud> 2023-01-27 01:32:20 +0200
committer: GitHub <noreply@github.com> 2023-01-27 01:32:20 +0200
commit: 57eab742c88093c89d5d46deb495558ad726e6f0 (patch)
tree: e8a01519a8f9df7beba4d0be7be53a9be3f1fdfd
parent: c4f5524ea8279be492eb527a67242b408543382e (diff)
21 files changed, 305 insertions, 703 deletions
diff --git a/daemon/unit_test.c b/daemon/unit_test.c
index 9845ed430f..52b55c4e58 100644
--- a/daemon/unit_test.c
+++ b/daemon/unit_test.c
@@ -1858,7 +1858,7 @@ static void test_dbengine_create_charts(RRDHOST *host, RRDSET *st[CHARTS], RRDDI
         now_realtime_timeval(&now);
         rrdset_timed_done(st[i], now, false);
     }
-    // Fluh pages for subsequent real values
+    // Flush pages for subsequent real values
     for (i = 0 ; i < CHARTS ; ++i) {
         for (j = 0; j < DIMS; ++j) {
             rrdeng_store_metric_flush_current_page((rd[i][j])->tiers[0].db_collection_handle);
@@ -1978,10 +1978,11 @@ static int test_dbengine_check_rrdr(RRDSET *st[CHARTS], RRDDIM *rd[CHARTS][DIMS]
                                     int current_region, time_t time_start, time_t time_end)
 {
     int update_every = REGION_UPDATE_EVERY[current_region];
-    fprintf(stderr, "%s() running on region %d, start time %lld, end time %lld, update every %d...\n", __FUNCTION__, current_region, (long long)time_start, (long long)time_end, update_every);
+    fprintf(stderr, "%s() running on region %d, start time %lld, end time %lld, update every %d, on %d dimensions...\n",
+            __FUNCTION__, current_region, (long long)time_start, (long long)time_end, update_every, CHARTS * DIMS);
     uint8_t same;
     time_t time_now, time_retrieved;
-    int i, j, errors, value_errors = 0, time_errors = 0;
+    int i, j, errors, value_errors = 0, time_errors = 0, value_right = 0, time_right = 0;
     long c;
     collected_number last;
     NETDATA_DOUBLE value, expected;
@@ -2020,17 +2021,22 @@ static int test_dbengine_check_rrdr(RRDSET *st[CHARTS], RRDDIM *rd[CHARTS][DIMS]
                     same = (roundndd(value) == roundndd(expected)) ? 1 : 0;
                     if(!same) {
                         if(value_errors < 20)
-                            fprintf(stderr, "    DB-engine unittest %s/%s: at %lu secs, expecting value " NETDATA_DOUBLE_FORMAT
+                            fprintf(stderr, "    DB-engine unittest %s/%s: point #%ld, at %lu secs, expecting value " NETDATA_DOUBLE_FORMAT
                                 ", RRDR found " NETDATA_DOUBLE_FORMAT ", ### E R R O R ###\n",
-                                    rrdset_name(st[i]), rrddim_name(rd[i][j]), (unsigned long)time_now, expected, value);
+                                    rrdset_name(st[i]), rrddim_name(rd[i][j]), (long) c+1, (unsigned long)time_now, expected, value);
                         value_errors++;
                     }
+                    else
+                        value_right++;
+
                     if(time_retrieved != time_now) {
                         if(time_errors < 20)
-                            fprintf(stderr, "    DB-engine unittest %s/%s: at %lu secs, found RRDR timestamp %lu ### E R R O R ###\n",
-                                    rrdset_name(st[i]), rrddim_name(rd[i][j]), (unsigned long)time_now, (unsigned long)time_retrieved);
+                            fprintf(stderr, "    DB-engine unittest %s/%s: point #%ld at %lu secs, found RRDR timestamp %lu ### E R R O R ###\n",
+                                    rrdset_name(st[i]), rrddim_name(rd[i][j]), (long)c+1, (unsigned long)time_now, (unsigned long)time_retrieved);
                         time_errors++;
                     }
+                    else
+                        time_right++;
                 }
                 rrddim_foreach_done(d);
             }
@@ -2040,10 +2046,10 @@ static int test_dbengine_check_rrdr(RRDSET *st[CHARTS], RRDDIM *rd[CHARTS][DIMS]
     }
 
     if(value_errors)
-        fprintf(stderr, "%d value errors encountered\n", value_errors);
+        fprintf(stderr, "%d value errors encountered (%d were ok)\n", value_errors, value_right);
 
     if(time_errors)
-        fprintf(stderr, "%d time errors encountered\n", time_errors);
+        fprintf(stderr, "%d time errors encountered (%d were ok)\n", time_errors, time_right);
 
     return errors + value_errors + time_errors;
 }
@@ -2051,7 +2057,7 @@ static int test_dbengine_check_rrdr(RRDSET *st[CHARTS], RRDDIM *rd[CHARTS][DIMS]
 int test_dbengine(void)
 {
     fprintf(stderr, "%s() running...\n", __FUNCTION__ );
-    int i, j, errors, value_errors = 0, time_errors = 0, update_every, current_region;
+    int i, j, errors = 0, value_errors = 0, time_errors = 0, update_every, current_region;
     RRDHOST *host = NULL;
     RRDSET *st[CHARTS];
     RRDDIM *rd[CHARTS][DIMS];
@@ -2074,9 +2080,7 @@ int test_dbengine(void)
     time_start[current_region] = 2 * API_RELATIVE_TIME_MAX;
     time_end[current_region] = test_dbengine_create_metrics(st,rd, current_region, time_start[current_region]);
 
-    errors = test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
-    if (errors)
-        goto error_out;
+    errors += test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
 
     current_region = 1; //this is the second region of data
     update_every = REGION_UPDATE_EVERY[current_region]; // set data collection frequency to 3 seconds
@@ -2093,9 +2097,7 @@ int test_dbengine(void)
         time_start[current_region] += update_every - time_start[current_region] % update_every;
     time_end[current_region] = test_dbengine_create_metrics(st,rd, current_region, time_start[current_region]);
 
-    errors = test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
-    if (errors)
-        goto error_out;
+    errors += test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
 
     current_region = 2; //this is the third region of data
     update_every = REGION_UPDATE_EVERY[current_region]; // set data collection frequency to 1 seconds
@@ -2112,19 +2114,14 @@ int test_dbengine(void)
         time_start[current_region] += update_every - time_start[current_region] % update_every;
     time_end[current_region] = test_dbengine_create_metrics(st,rd, current_region, time_start[current_region]);
 
-    errors = test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
-    if (errors)
-        goto error_out;
+    errors += test_dbengine_check_metrics(st, rd, current_region, time_start[current_region]);
 
     for (current_region = 0 ; current_region < REGIONS ; ++current_region) {
-        errors = test_dbengine_check_rrdr(st, rd, current_region, time_start[current_region], time_end[current_region]);
-        if (errors)
-            goto error_out;
+        errors += test_dbengine_check_rrdr(st, rd, current_region, time_start[current_region], time_end[current_region]);
     }
 
     current_region = 1;
     update_every = REGION_UPDATE_EVERY[current_region]; // use the maximum update_every = 3
-    errors = 0;
     long points = (time_end[REGIONS - 1] - time_start[0]) / update_every; // cover all time regions with RRDR
     long point_offset = (time_start[current_region] - time_start[0]) / update_every;
     for (i = 0 ; i < CHARTS ; ++i) {
@@ -2181,7 +2178,7 @@ int test_dbengine(void)
         }
         onewayalloc_destroy(owa);
     }
-error_out:
+
     rrd_wrlock();
     rrdeng_prepare_exit((struct rrdengine_instance *)host->db[0].instance);
     rrdhost_delete_charts(host);
diff --git a/database/engine/metric.c b/database/engine/metric.c
index af769fda95..d16bc063d9 100644
--- a/database/engine/metric.c
+++ b/database/engine/metric.c
@@ -327,33 +327,42 @@ bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg __maybe_unused, METRIC *metr
     return ret;
 }
 
-bool mrg_metric_set_first_time_s_if_zero(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s) {
-    bool ret = false;
+time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) {
+    time_t first_time_s;
 
     netdata_spinlock_lock(&metric->spinlock);
-    if(!metric->first_time_s) {
-        metric->first_time_s = first_time_s;
-        ret = true;
+
+    if(unlikely(!metric->first_time_s)) {
+        if(metric->latest_time_s_clean)
+            metric->first_time_s = metric->latest_time_s_clean;
+
+        else if(metric->latest_time_s_hot)
+            metric->first_time_s = metric->latest_time_s_hot;
     }
+
+    first_time_s = metric->first_time_s;
+
     netdata_spinlock_unlock(&metric->spinlock);
 
-    return ret;
+    return first_time_s;
 }
 
-time_t mrg_metric_get_first_time_s(MRG *mrg __maybe_unused, METRIC *metric) {
-    time_t first_time_s;
+void mrg_metric_get_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s) {
     netdata_spinlock_lock(&metric->spinlock);
-    first_time_s = metric->first_time_s;
-    if(!first_time_s) {
+
+    if(unlikely(!metric->first_time_s)) {
         if(metric->latest_time_s_clean)
-            first_time_s = metric->latest_time_s_clean;
+            metric->first_time_s = metric->latest_time_s_clean;
 
-        if(!first_time_s || metric->latest_time_s_hot < metric->latest_time_s_clean)
-            first_time_s = metric->latest_time_s_hot;
+        else if(metric->latest_time_s_hot)
+            metric->first_time_s = metric->latest_time_s_hot;
     }
-    netdata_spinlock_unlock(&metric->spinlock);
 
-    return first_time_s;
+    *first_time_s = metric->first_time_s;
+    *last_time_s = MAX(metric->latest_time_s_clean, metric->latest_time_s_hot);
+    *update_every_s = metric->latest_update_every_s;
+
+    netdata_spinlock_unlock(&metric->spinlock);
 }
 
 bool mrg_metric_set_clean_latest_time_s(MRG *mrg __maybe_unused, METRIC *metric, time_t latest_time_s) {
diff --git a/database/engine/metric.h b/database/engine/metric.h
index 3eb2c97469..fe0481a1b1 100644
--- a/database/engine/metric.h
+++ b/database/engine/metric.h
@@ -46,18 +46,18 @@ Word_t mrg_metric_section(MRG *mrg, METRIC *metric);
 
 bool mrg_metric_set_first_time_s(MRG *mrg, METRIC *metric, time_t first_time_s);
 bool mrg_metric_set_first_time_s_if_bigger(MRG *mrg, METRIC *metric, time_t first_time_s);
-bool mrg_metric_set_first_time_s_if_zero(MRG *mrg, METRIC *metric, time_t first_time_s);
 time_t mrg_metric_get_first_time_s(MRG *mrg, METRIC *metric);
-void mrg_metric_expand_retention(MRG *mrg __maybe_unused, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s);
 
 bool mrg_metric_set_clean_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s);
 bool mrg_metric_set_hot_latest_time_s(MRG *mrg, METRIC *metric, time_t latest_time_s);
 time_t mrg_metric_get_latest_time_s(MRG *mrg, METRIC *metric);
 
 bool mrg_metric_set_update_every(MRG *mrg, METRIC *metric, time_t update_every_s);
+bool mrg_metric_set_update_every_s_if_zero(MRG *mrg, METRIC *metric, time_t update_every_s);
 time_t mrg_metric_get_update_every_s(MRG *mrg, METRIC *metric);
 
-bool mrg_metric_set_update_every_s_if_zero(MRG *mrg, METRIC *metric, time_t update_every_s);
+void mrg_metric_expand_retention(MRG *mrg, METRIC *metric, time_t first_time_s, time_t last_time_s, time_t update_every_s);
+void mrg_metric_get_retention(MRG *mrg, METRIC *metric, time_t *first_time_s, time_t *last_time_s, time_t *update_every_s);
 
 bool mrg_metric_writer_acquire(MRG *mrg, METRIC *metric);
 bool mrg_metric_writer_release(MRG *mrg, METRIC *metric);
diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c
index 961c2b2886..11f63751b8 100644
--- a/database/engine/pagecache.c
+++ b/database/engine/pagecache.c
@@ -356,17 +356,27 @@ static size_t get_page_list_from_pgc(PGC *cache, METRIC *metric, struct rrdengin
 }
 
 static void pgc_inject_gap(struct rrdengine_instance *ctx, METRIC *metric, time_t start_time_s, time_t end_time_s) {
+
+    time_t db_first_time_s, db_last_time_s, db_update_every_s;
+    mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
+
+    if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) != PAGE_IS_IN_RANGE)
+        return;
+
     PGC_ENTRY page_entry = {
             .hot = false,
             .section = (Word_t)ctx,
             .metric_id = (Word_t)metric,
-            .start_time_s = start_time_s,
-            .end_time_s = end_time_s,
+            .start_time_s = MAX(start_time_s, db_first_time_s),
+            .end_time_s = MIN(end_time_s, db_last_time_s),
             .update_every_s = 0,
             .size = 0,
             .data = DBENGINE_EMPTY_PAGE,
     };
 
+    if(page_entry.start_time_s >= page_entry.end_time_s)
+        return;
+
     PGC_PAGE *page = pgc_page_add_and_acquire(main_cache, page_entry, NULL);
     pgc_page_release(main_cache, page);
 }
diff --git a/database/engine/pdc.c b/database/engine/pdc.c
index d0daaa5c12..0563133719 100644
--- a/database/engine/pdc.c
+++ b/database/engine/pdc.c
@@ -606,6 +606,9 @@ void pdc_acquire(PDC *pdc) {
 }
 
 bool pdc_release_and_destroy_if_unreferenced(PDC *pdc, bool worker, bool router __maybe_unused) {
+    if(unlikely(!pdc))
+        return true;
+
     netdata_spinlock_lock(&pdc->refcount_spinlock);
 
     if(pdc->refcount <= 0)
diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c
index ed1d851b1f..dbc017aafb 100644
--- a/database/engine/rrdengine.c
+++ b/database/engine/rrdengine.c
@@ -1620,9 +1620,17 @@ static void *ctx_shutdown_tp_worker(struct rrdengine_instance *ctx __maybe_unuse
     completion_wait_for(&ctx->quiesce.completion);
     completion_destroy(&ctx->quiesce.completion);
 
+    bool logged = false;
     while(__atomic_load_n(&ctx->atomic.extents_currently_being_flushed, __ATOMIC_RELAXED) ||
-            __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED))
+            __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED)) {
+        if(!logged) {
+            logged = true;
+            info("DBENGINE: waiting for %zu inflight queries to finish to shutdown tier %d...",
+                 __atomic_load_n(&ctx->atomic.inflight_queries, __ATOMIC_RELAXED),
+                 (ctx->config.legacy) ? -1 : ctx->config.tier);
+        }
         sleep_usec(1 * USEC_PER_MS);
+    }
 
     completion_mark_complete(completion);
 
diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c
index 22fe30e4fd..b7d2eae3b3 100755
--- a/database/engine/rrdengineapi.c
+++ b/database/engine/rrdengineapi.c
@@ -266,16 +266,19 @@ STORAGE_COLLECT_HANDLE *rrdeng_store_metric_init(STORAGE_METRIC_HANDLE *db_metri
     if(!is_1st_metric_writer)
         __atomic_add_fetch(&ctx->atomic.collectors_running_duplicate, 1, __ATOMIC_RELAXED);
 
+    mrg_metric_set_update_every(main_mrg, metric, update_every);
+
+    handle->alignment = (struct pg_alignment *)smg;
+    rrdeng_page_alignment_acquire(handle->alignment);
+
     // this is important!
     // if we don't set the page_end_time_ut during the first collection
     // data collection may be able to go back in time and during the addition of new pages
     // clean pages may be found matching ours!
-    handle->page_end_time_ut = (usec_t)mrg_metric_get_latest_time_s(main_mrg, metric) * USEC_PER_SEC;
 
-    mrg_metric_set_update_every(main_mrg, metric, update_every);
-
-    handle->alignment = (struct pg_alignment *)smg;
-    rrdeng_page_alignment_acquire(handle->alignment);
+    time_t db_first_time_s, db_last_time_s, db_update_every_s;
+    mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
+    handle->page_end_time_ut = (usec_t)db_last_time_s * USEC_PER_SEC;
 
     return (STORAGE_COLLECT_HANDLE *)handle;
 }
@@ -382,11 +385,12 @@ static void rrdeng_store_metric_create_new_page(struct rrdeng_collect_handle *ha
         error_limit(&erl,
 #endif
               "DBENGINE: metric '%s' new page from %ld to %ld, update every %ld, has a conflict in main cache "
-              "with existing %s page from %ld to %ld, update every %ld - "
+              "with existing %s%s page from %ld to %ld, update every %ld - "
               "is it collected more than once?",
               uuid,
               page_entry.start_time_s, page_entry.end_time_s, (time_t)page_entry.update_every_s,
               pgc_is_page_hot(page) ? "hot" : "not-hot",
+              pgc_page_data(page) == DBENGINE_EMPTY_PAGE ? " gap" : "",
               pgc_page_start_time_s(page), pgc_page_end_time_s(page), pgc_page_update_every_s(page)
               );
 
@@ -580,12 +584,8 @@ static void store_metric_next_error_log(struct rrdeng_collect_handle *handle, us
         collect_page_flags_to_buffer(wb, handle->page_flags);
     }
 
-#ifdef NETDATA_INTERNAL_CHECKS
-    internal_error(true,
-#else
     error_limit_static_global_var(erl, 1, 0);
     error_limit(&erl,
-#endif
                 "DBENGINE: metric '%s' collected point at %ld, %s last collection at %ld, "
                 "update every %ld, %s page from %ld to %ld, position %u (of %u), flags: %s",
                 uuid,
@@ -699,8 +699,8 @@ int rrdeng_store_metric_finalize(STORAGE_COLLECT_HANDLE *collection_handle) {
     if((handle->options & RRDENG_1ST_METRIC_WRITER) && !mrg_metric_writer_release(main_mrg, handle->metric))
         internal_fatal(true, "DBENGINE: metric is already released");
 
-    time_t first_time_s = mrg_metric_get_first_time_s(main_mrg, handle->metric);
-    time_t last_time_s = mrg_metric_get_latest_time_s(main_mrg, handle->metric);
+    time_t first_time_s, last_time_s, update_every_s;
+    mrg_metric_get_retention(main_mrg, handle->metric, &first_time_s, &last_time_s, &update_every_s);
 
     mrg_metric_release(main_mrg, handle->metric);
     freez(handle);
@@ -759,7 +759,11 @@ static void unregister_query_handle(struct rrdeng_query_handle *handle __maybe_u
  * Gets a handle for loading metrics from the database.
  * The handle must be released with rrdeng_load_metric_final().
  */
-void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct storage_engine_query_handle *rrddim_handle, time_t start_time_s, time_t end_time_s, STORAGE_PRIORITY priority)
+void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle,
+                             struct storage_engine_query_handle *rrddim_handle,
+                             time_t start_time_s,
+                             time_t end_time_s,
+                             STORAGE_PRIORITY priority)
 {
     usec_t started_ut = now_monotonic_usec();
 
@@ -769,8 +773,6 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct sto
     struct rrdengine_instance *ctx = mrg_metric_ctx(metric);
     struct rrdeng_query_handle *handle;
 
-    mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every);
-
     handle = rrdeng_query_handle_get();
     register_query_handle(handle);
 
@@ -781,23 +783,48 @@ void rrdeng_load_metric_init(STORAGE_METRIC_HANDLE *db_metric_handle, struct sto
 
     handle->ctx = ctx;
     handle->metric = metric;
-    handle->start_time_s = start_time_s;
-    handle->end_time_s = end_time_s;
     handle->priority = priority;
-    handle->now_s = start_time_s;
 
-    handle->dt_s = mrg_metric_get_update_every_s(main_mrg, metric);
-    if(!handle->dt_s)
-        handle->dt_s = default_rrd_update_every;
+    // IMPORTANT!
+    // It is crucial not to exceed the db boundaries, because dbengine
+    // now has gap caching, so when a gap is detected a negative page
+    // is inserted into the main cache, to avoid scanning the journals
+    // again for pages matching the gap.
 
-    rrddim_handle->handle = (STORAGE_QUERY_HANDLE *)handle;
-    rrddim_handle->start_time_s = start_time_s;
-    rrddim_handle->end_time_s = end_time_s;
-    rrddim_handle->priority = priority;
+    time_t db_first_time_s, db_last_time_s, db_update_every_s;
+    mrg_metric_get_retention(main_mrg, metric, &db_first_time_s, &db_last_time_s, &db_update_every_s);
 
-    pg_cache_preload(handle);
+    if(is_page_in_time_range(start_time_s, end_time_s, db_first_time_s, db_last_time_s) == PAGE_IS_IN_RANGE) {
+        handle->start_time_s = MAX(start_time_s, db_first_time_s);
+        handle->end_time_s = MIN(end_time_s, db_last_time_s);
+        handle->now_s = handle->start_time_s;
 
-    __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED);
+        handle->dt_s = db_update_every_s;
+        if (!handle->dt_s) {
+            handle->dt_s = default_rrd_update_every;
+            mrg_metric_set_update_every_s_if_zero(main_mrg, metric, default_rrd_update_every);
+        }
+
+        rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle;
+        rrddim_handle->start_time_s = handle->start_time_s;
+        rrddim_handle->end_time_s = handle->end_time_s;
+        rrddim_handle->priority = priority;
+
+        pg_cache_preload(handle);
+
+        __atomic_add_fetch(&rrdeng_cache_efficiency_stats.query_time_init, now_monotonic_usec() - started_ut, __ATOMIC_RELAXED);
+    }
+    else {
+        handle->start_time_s = start_time_s;
+        handle->end_time_s = end_time_s;
+        handle->now_s = start_time_s;
+        handle->dt_s = db_update_every_s;
+
+        rrddim_handle->handle = (STORAGE_QUERY_HANDLE *) handle;
+        rrddim_handle->start_time_s = handle->start_time_s;
+        rrddim_handle->end_time_s = 0;
+        rrddim_handle->priority = priority;
+    }
 }
 
 static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_handle, bool debug_this __maybe_unused) {
@@ -827,10 +854,19 @@ static bool rrdeng_load_page_next(struct storage_engine_query_handle *rrddim_han
     unsigned position;
     if(likely(handle->now_s >= page_start_time_s && handle->now_s <= page_end_time_s)) {
 
-        if(unlikely(entries == 1 || page_start_time_s == page_end_time_s))
+        if(unlikely(entries == 1 || page_start_time_s == page_end_time_s || !page_update_every_s)) {
             position = 0;
-        else
+            handle->now_s = page_start_time_s;
+        }
+        else {
             position = (handle->now_s - page_start_time_s) * (entries - 1) / (page_end_time_s - page_start_time_s);
+            time_t point_end_time_s = page_start_time_s + position * page_update_every_s;
+            if(point_end_time_s < handle->now_s && position + 1 < entries) {
+                position++;
+                point_end_time_s = page_start_time_s + position * page_update_every_s;
+            }
+            handle->now_s = point_end_time_s;
+        }
 
         internal_fatal(position >= entries, "DBENGINE: wrong page position calculation");
     }
@@ -986,8 +1022,8 @@ bool rrdeng_metric_retention_by_uuid(STORAGE_INSTANCE *db_instance, uuid_t *dim_
     if (unlikely(!metric))
         return false;
 
-    *first_entry_s = mrg_metric_get_first_time_s(main_mrg, metric);
-    *last_entry_s = mrg_metric_get_latest_time_s(main_mrg, metric);
+    time_t update_every_s;
+    mrg_metric_get_retention(main_mrg, metric, first_entry_s, last_entry_s, &update_every_s);
 
     mrg_metric_release(main_mrg, metric);
 
diff --git a/database/rrd.c b/database/rrd.c
index df364419ea..d489ddb8b1 100644
--- a/database/rrd.c
+++ b/database/rrd.c
@@ -135,7 +135,7 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) {
 // ----------------------------------------------------------------------------
 // RRD - cache directory
 
-char *rrdset_cache_dir(RRDHOST *host, const char *id) {
+char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id) {
     char *ret = NULL;
 
     char b[FILENAME_MAX + 1];
diff --git a/database/rrd.h b/database/rrd.h
index 1ae53b47e5..1128f7c6ab 100644
--- a/database/rrd.h
+++ b/database/rrd.h
@@ -314,7 +314,7 @@ typedef struct storage_collect_handle STORAGE_COLLECT_HANDLE;
 struct rrddim_tier {
     STORAGE_POINT virtual_point;
     size_t tier_grouping;
-    time_t next_point_time_s;
+    time_t next_point_end_time_s;
     STORAGE_METRIC_HANDLE *db_metric_handle;        // the metric handle inside the database
     STORAGE_COLLECT_HANDLE *db_collection_handle;   // the data collection handle
     struct storage_engine_collect_ops *collect_ops;
@@ -905,9 +905,7 @@ typedef struct health {
     time_t health_delay_up_to;                     // a timestamp to delay alarms processing up to
     STRING *health_default_exec;                   // the full path of the alarms notifications program
     STRING *health_default_recipient;              // the default recipient for all alarms
-    char *health_log_filename;                     // the alarms event log filename
     size_t health_log_entries_written;             // the number of alarm events written to the alarms event log
-    FILE *health_log_fp;                           // the FILE pointer to the open alarms event log file
     uint32_t health_default_warn_repeat_every;     // the default value for the interval between repeating warning notifications
     uint32_t health_default_crit_repeat_every;     // the default value for the interval between repeating critical notifications
 } HEALTH;
@@ -1340,7 +1338,8 @@ void rrdset_free(RRDSET *st);
 
 #ifdef NETDATA_RRD_INTERNALS
 
-char *rrdset_cache_dir(RRDHOST *host, const char *id);
+char *rrdhost_cache_dir_for_rrdset_alloc(RRDHOST *host, const char *id);
+const char *rrdset_cache_dir(RRDSET *st);
 
 void rrddim_free(RRDSET *st, RRDDIM *rd);
 
diff --git a/database/rrdcontext.c b/database/rrdcontext.c
index c261c832f3..8d019dafba 100644
--- a/database/rrdcontext.c
+++ b/database/rrdcontext.c
@@ -3446,6 +3446,8 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG
     if(worker_jobs)
         worker_is_busy(WORKER_JOB_PP_CONTEXT);
 
+    size_t min_priority_collected = LONG_MAX;
+    size_t min_priority_not_collected = LONG_MAX;
     size_t min_priority = LONG_MAX;
     time_t min_first_time_t = LONG_MAX, max_last_time_t = 0;
     size_t instances_active = 0, instances_deleted = 0;
@@ -3482,8 +3484,16 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG
 
             instances_active++;
 
-            if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY && ri->priority < min_priority)
-                min_priority = ri->priority;
+            if (ri->priority >= RRDCONTEXT_MINIMUM_ALLOWED_PRIORITY) {
+                if(rrd_flag_check(ri, RRD_FLAG_COLLECTED)) {
+                    if(ri->priority < min_priority_collected)
+                        min_priority_collected = ri->priority;
+                }
+                else {
+                    if(ri->priority < min_priority_not_collected)
+                        min_priority_not_collected = ri->priority;
+                }
+            }
 
             if (ri->first_time_s && ri->first_time_s < min_first_time_t)
                 min_first_time_t = ri->first_time_s;
@@ -3492,6 +3502,13 @@ static void rrdcontext_post_process_updates(RRDCONTEXT *rc, bool force, RRD_FLAG
                 max_last_time_t = ri->last_time_s;
         }
         dfe_done(ri);
+
+        if(min_priority_collected != LONG_MAX)
+            // use the collected priority
+            min_priority = min_priority_collected;
+        else
+            // use the non-collected priority
+            min_priority = min_priority_not_collected;
     }
 
     {
diff --git a/database/rrddim.c b/database/rrddim.c
index b520f21d3d..6846b0d42c 100644
--- a/database/rrddim.c
+++ b/database/rrddim.c
@@ -686,7 +686,7 @@ bool rrddim_memory_load_or_create_map_save(RRDSET *st, RRDDIM *rd, RRD_MEMORY_MO
     char filename[FILENAME_MAX + 1];
     char fullfilename[FILENAME_MAX + 1];
     rrdset_strncpyz_name(filename, rrddim_id(rd), FILENAME_MAX);
-    snprintfz(fullfilename, FILENAME_MAX, "%s/%s.db", st->cache_dir, filename);
+    snprintfz(fullfilename, FILENAME_MAX, "%s/%s.db", rrdset_cache_dir(st), filename);
 
     rd_on_file = (struct rrddim_map_save_v019 *)netdata_mmap(
         fullfilename, size, ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), 1, false, NULL);
diff --git a/database/rrdhost.c b/database/rrdhost.c
index b25fc72d21..454fd6b809 100644
--- a/database/rrdhost.c
+++ b/database/rrdhost.c
@@ -499,7 +499,6 @@ int is_legacy = 1;
                  ", health %s"
                  ", cache_dir '%s'"
                  ", varlib_dir '%s'"
-                 ", health_log '%s'"
                  ", alarms default handler '%s'"
                  ", alarms default recipient '%s'"
          , rrdhost_hostname(host)
@@ -519,7 +518,6 @@ int is_legacy = 1;
          , host->health.health_enabled?"enabled":"disabled"
          , host->cache_dir
          , host->varlib_dir
-         , host->health.health_log_filename
          , string2str(host->health.health_default_exec)
          , string2str(host->health.health_default_recipient)
     );
@@ -1085,7 +1083,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
     if(!host) return;
 
     if (netdata_exit || force) {
-        info("Freeing all memory for host '%s'...", rrdhost_hostname(host));
+        info("RRD: 'host:%s' freeing memory...", rrdhost_hostname(host));
 
         // ------------------------------------------------------------------------
         // first remove it from the indexes, so that it will not be discoverable
@@ -1146,7 +1144,7 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
 #endif
 
     if (!netdata_exit && !force) {
-        info("Setting archive mode for host '%s'...", rrdhost_hostname(host));
+        info("RRD: 'host:%s' is now in archive mode...", rrdhost_hostname(host));
         rrdhost_flag_set(host, RRDHOST_FLAG_ARCHIVED | RRDHOST_FLAG_ORPHAN);
         return;
     }
@@ -1187,7 +1185,6 @@ void rrdhost_free___while_having_rrd_wrlock(RRDHOST *host, bool force) {
     rrdpush_destinations_free(host);
     string_freez(host->health.health_default_exec);
     string_freez(host->health.health_default_recipient);
-    freez(host->health.health_log_filename);
     string_freez(host->registry_hostname);
     simple_pattern_free(host->rrdpush_send_charts_matching);
     netdata_rwlock_destroy(&host->health_log.alarm_log_rwlock);
@@ -1236,7 +1233,7 @@ void rrd_finalize_collection_for_all_hosts(void) {
 void rrdhost_save_charts(RRDHOST *host) {
     if(!host) return;
 
-    info("Saving/Closing database of host '%s'...", rrdhost_hostname(host));
+    info("RRD: 'host:%s' saving / closing database...", rrdhost_hostname(host));
author	Costa Tsaousis <costa@netdata.cloud>	2023-01-27 01:32:20 +0200
committer	GitHub <noreply@github.com>	2023-01-27 01:32:20 +0200
commit	57eab742c88093c89d5d46deb495558ad726e6f0 (patch)
tree	e8a01519a8f9df7beba4d0be7be53a9be3f1fdfd
parent	c4f5524ea8279be492eb527a67242b408543382e (diff)