summary | refs | log | tree | commit | diff | stats
path: root/database/engine
diff options
context:
space:
mode:
author: Markos Fountoulakis <44345837+mfundul@users.noreply.github.com> 2019-10-07 11:08:21 +0300
committer: GitHub <noreply@github.com> 2019-10-07 11:08:21 +0300
commit: 852c97d412080205a7880d2cda2561c8fa40b203 (patch)
tree: 158dc0b05b0e2c2ef083e3a6f804ea8962414635 /database/engine
parent: ddaef9edba00faa83654f1dc8b696a1647b32229 (diff)
Remove hard cap from page cache size to eliminate deadlocks. (#7006)
* Remove page cache error detection and deadlock resolution
* Change page cache logic to disallow deadlocks due to too many API users
* Updated documentation
* Changed default and minimum page cache size values to 32 and 8 MiB respectively
Diffstat (limited to 'database/engine')
-rw-r--r-- database/engine/README.md 6
-rw-r--r-- database/engine/pagecache.c 36
-rw-r--r-- database/engine/rrdengine.h 13
-rw-r--r-- database/engine/rrdengineapi.c 27
-rw-r--r-- database/engine/rrdengineapi.h 6
5 files changed, 39 insertions, 49 deletions
diff --git a/database/engine/README.md b/database/engine/README.md
index e824aa3a27..78f3b15ec8 100644
--- a/database/engine/README.md
+++ b/database/engine/README.md
@@ -57,7 +57,8 @@ The above values are the default and minimum values for Page Cache size and DB e
in **MiB**. All DB engine instances will allocate the configured resources separately.
The `page cache size` option determines the amount of RAM in **MiB** that is dedicated to caching Netdata metric values
-themselves.
+themselves as far as queries are concerned. The total page cache size will be greater since data collection itself will
+consume additional memory as is described in the [Memory requirements](#memory-requirements) section.
The `dbengine disk space` option determines the amount of disk space in **MiB** that is dedicated to storing Netdata
metric values and all related metadata describing them.
@@ -88,7 +89,8 @@ available memory.
There are explicit memory requirements **per** DB engine **instance**, meaning **per** Netdata **node** (e.g. localhost
and streaming recipient nodes):
-- `page cache size` must be at least `#dimensions-being-collected x 4096 x 2` bytes.
+- The total page cache memory footprint will be an additional `#dimensions-being-collected x 4096 x 2` bytes over what
+ the user configured with `page cache size`.
- an additional `#pages-on-disk x 4096 x 0.03` bytes of RAM are allocated for metadata.
diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c
index 457bcb2185..a419ba9818 100644
--- a/database/engine/pagecache.c
+++ b/database/engine/pagecache.c
@@ -209,9 +209,31 @@ static void pg_cache_release_pages(struct rrdengine_instance *ctx, unsigned numb
pg_cache_release_pages_unsafe(ctx, number);
uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock);
}
+
+/*
+ * This function returns the maximum number of pages allowed in the page cache.
+ * The caller must hold the page cache lock.
+ */
+static inline unsigned long pg_cache_hard_limit(struct rrdengine_instance *ctx)
+{
+ /* it's twice the number of producers since we pin 2 pages per producer */
+ return ctx->max_cache_pages + 2 * (unsigned long)ctx->stats.metric_API_producers;
+}
+
+/*
+ * This function returns the low watermark number of pages in the page cache. The page cache should strive to keep the
+ * number of pages below that number.
+ * The caller must hold the page cache lock.
+ */
+static inline unsigned long pg_cache_soft_limit(struct rrdengine_instance *ctx)
+{
+ /* it's twice the number of producers since we pin 2 pages per producer */
+ return ctx->cache_pages_low_watermark + 2 * (unsigned long)ctx->stats.metric_API_producers;
+}
+
/*
* This function will block until it reserves #number populated pages.
- * It will trigger evictions or dirty page flushing if the ctx->max_cache_pages limit is hit.
+ * It will trigger evictions or dirty page flushing if the pg_cache_hard_limit() limit is hit.
*/
static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned number)
{
@@ -223,10 +245,10 @@ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned numb
assert(number < ctx->max_cache_pages);
uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock);
- if (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1)
+ if (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1)
debug(D_RRDENGINE, "==Page cache full. Reserving %u pages.==",
number);
- while (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1) {
+ while (pg_cache->populated_pages + number >= pg_cache_hard_limit(ctx) + 1) {
if (!pg_cache_try_evict_one_page_unsafe(ctx)) {
/* failed to evict */
@@ -260,7 +282,7 @@ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned numb
/*
* This function will attempt to reserve #number populated pages.
- * It may trigger evictions if the ctx->cache_pages_low_watermark limit is hit.
+ * It may trigger evictions if the pg_cache_soft_limit() limit is hit.
* Returns 0 on failure and 1 on success.
*/
static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned number)
@@ -272,7 +294,7 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n
assert(number < ctx->max_cache_pages);
uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock);
- if (pg_cache->populated_pages + number >= ctx->cache_pages_low_watermark + 1) {
+ if (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1) {
debug(D_RRDENGINE,
"==Page cache full. Trying to reserve %u pages.==",
number);
@@ -280,11 +302,11 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n
if (!pg_cache_try_evict_one_page_unsafe(ctx))
break;
++count;
- } while (pg_cache->populated_pages + number >= ctx->cache_pages_low_watermark + 1);
+ } while (pg_cache->populated_pages + number >= pg_cache_soft_limit(ctx) + 1);
debug(D_RRDENGINE, "Evicted %u pages.", count);
}
- if (pg_cache->populated_pages + number < ctx->max_cache_pages + 1) {
+ if (pg_cache->populated_pages + number < pg_cache_hard_limit(ctx) + 1) {
pg_cache->populated_pages += number;
ret = 1; /* success */
}
diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h
index 6447a685bd..6f6a6f8ffd 100644
--- a/database/engine/rrdengine.h
+++ b/database/engine/rrdengine.h
@@ -148,25 +148,12 @@ struct rrdengine_statistics {
rrdeng_stats_t page_cache_descriptors;
rrdeng_stats_t io_errors;
rrdeng_stats_t fs_errors;
- rrdeng_stats_t pg_cache_warnings;
- rrdeng_stats_t pg_cache_errors;
};
/* I/O errors global counter */
extern rrdeng_stats_t global_io_errors;
/* File-System errors global counter */
extern rrdeng_stats_t global_fs_errors;
-/*
- * Page cache warnings global counter.
- * Some page cache instance is near critical utilization where metrics will fail to be stored.
- */
-extern rrdeng_stats_t global_pg_cache_warnings;
-/*
- * Page cache errors global counter.
- * Some page cache instance has hit critical utilization where metrics failed to be stored as a deadlock resolution
- * measure.
- */
-extern rrdeng_stats_t global_pg_cache_errors;
/* number of File-Descriptors that have been reserved by dbengine */
extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c
index 7f54439254..79fc02e93e 100644
--- a/database/engine/rrdengineapi.c
+++ b/database/engine/rrdengineapi.c
@@ -4,7 +4,7 @@
/* Default global database instance */
static struct rrdengine_instance default_global_ctx;
-int default_rrdeng_page_cache_mb = 128;
+int default_rrdeng_page_cache_mb = 32;
int default_rrdeng_disk_quota_mb = RRDENG_MIN_DISK_SPACE_MB;
/*
@@ -192,25 +192,6 @@ void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number n
descr->start_time = point_in_time;
rrd_stat_atomic_add(&ctx->stats.metric_API_producers, 1);
-
- if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->max_cache_pages)) {
- if (0 == (unsigned long)ctx->stats.pg_cache_errors) {
- /* only print the first time */
- error("Deadlock detected in dbengine instance \"%s\", metric data will not be stored in the database"
- ", please increase page cache size.", ctx->dbfiles_path);
- }
- rrd_stat_atomic_add(&ctx->stats.pg_cache_errors, 1);
- rrd_stat_atomic_add(&global_pg_cache_errors, 1);
- /* Resolve deadlock */
- descr->page_length = 0; /* make sure the page descriptor is deconstructed */
- rrdeng_store_metric_flush_current_page(rd);
- rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1);
- return;
- } else if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->cache_pages_low_watermark)) {
- rrd_stat_atomic_add(&ctx->stats.pg_cache_warnings, 1);
- rrd_stat_atomic_add(&global_pg_cache_warnings, 1);
- }
-
pg_cache_insert(ctx, handle->page_index, descr);
} else {
pg_cache_add_new_metric_time(handle->page_index, descr);
@@ -692,7 +673,7 @@ void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_i
* You must not change the indices of the statistics or user code will break.
* You must not exceed RRDENG_NR_STATS or it will crash.
*/
-void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array)
+void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array)
{
struct page_cache *pg_cache = &ctx->pg_cache;
@@ -729,9 +710,7 @@ void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long
array[30] = (uint64_t)global_io_errors;
array[31] = (uint64_t)global_fs_errors;
array[32] = (uint64_t)rrdeng_reserved_file_descriptors;
- array[33] = (uint64_t)global_pg_cache_warnings;
- array[34] = (uint64_t)global_pg_cache_errors;
- assert(RRDENG_NR_STATS == 35);
+ assert(RRDENG_NR_STATS == 33);
}
/* Releases reference to page */
diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h
index 97e358199e..c876705e4f 100644
--- a/database/engine/rrdengineapi.h
+++ b/database/engine/rrdengineapi.h
@@ -5,10 +5,10 @@
#include "rrdengine.h"
-#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (32)
+#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (8)
#define RRDENG_MIN_DISK_SPACE_MB (256)
-#define RRDENG_NR_STATS (35)
+#define RRDENG_NR_STATS (33)
#define RRDENG_FD_BUDGET_PER_INSTANCE (50)
@@ -41,7 +41,7 @@ extern int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_han
extern void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle);
extern time_t rrdeng_metric_latest_time(RRDDIM *rd);
extern time_t rrdeng_metric_oldest_time(RRDDIM *rd);
-extern void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array);
+extern void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array);
/* must call once before using anything */
extern int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb,