From 2728be8b0614fd58d4b530a32fee164509e88fd8 Mon Sep 17 00:00:00 2001 From: Markos Fountoulakis <44345837+mfundul@users.noreply.github.com> Date: Tue, 24 Sep 2019 11:59:15 +0300 Subject: Detect deadlock in dbengine page cache (#6911) * Detect deadlock in dbengine page cache when there are too many metrics and print error message * Resolve dbengine deadlock by dropping metrics when page cache is too small and define relevant alarms * Changed printing deadlock errors to only happen once per dbengine instance --- configs.signatures | 2 +- daemon/global_statistics.c | 9 +++++- database/engine/rrdengine.c | 2 ++ database/engine/rrdengine.h | 13 ++++++++ database/engine/rrdengineapi.c | 30 ++++++++++++++---- database/engine/rrdengineapi.h | 4 +-- health/health.d/dbengine.conf | 69 ++++++++++++++++++++++++++++-------------- 7 files changed, 97 insertions(+), 32 deletions(-) diff --git a/configs.signatures b/configs.signatures index e29883ac56..1d81064e46 100644 --- a/configs.signatures +++ b/configs.signatures @@ -381,7 +381,7 @@ declare -A configs_signatures=( ['7deb236ec68a512b9bdd18e6a51d76f7']='python.d/mysql.conf' ['7e5fc1644aa7a54f9dbb1bd102521b09']='health.d/memcached.conf' ['7f13631183fbdf79c21c8e5a171e9b34']='health.d/zfs.conf' - ['93674f3206872ae9c43ecbc54988413b']='health.d/dbengine.conf' + ['0fca55fc770c243ebfd8387c89059dd2']='health.d/dbengine.conf' ['7fb8184d56a27040e73261ed9c6fc76f']='health_alarm_notify.conf' ['80266bddd3df374923c750a6de91d120']='health.d/apache.conf' ['803a7f9dcb942eeac0fd764b9e3e38ca']='fping.conf' diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c index 53b7546f26..777ec2816a 100644 --- a/daemon/global_statistics.c +++ b/daemon/global_statistics.c @@ -538,7 +538,7 @@ void global_statistics_charts(void) { unsigned long long stats_array[RRDENG_NR_STATS]; /* get localhost's DB engine's statistics */ - rrdeng_get_33_statistics(localhost->rrdeng_ctx, stats_array); + rrdeng_get_35_statistics(localhost->rrdeng_ctx, stats_array); // ---------------------------------------------------------------- @@ -756,6 +756,8 @@ void global_statistics_charts(void) { static RRDSET *st_errors = NULL; static RRDDIM *rd_fs_errors = NULL; static RRDDIM *rd_io_errors = NULL; + static RRDDIM *rd_pg_cache_warnings = NULL; + static RRDDIM *rd_pg_cache_errors = NULL; if (unlikely(!st_errors)) { st_errors = rrdset_create_localhost( @@ -775,12 +777,17 @@ void global_statistics_charts(void) { rd_io_errors = rrddim_add(st_errors, "I/O errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); rd_fs_errors = rrddim_add(st_errors, "FS errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_pg_cache_warnings = rrddim_add(st_errors, "Page-Cache warnings", NULL, 1, 1, + RRD_ALGORITHM_INCREMENTAL); + rd_pg_cache_errors = rrddim_add(st_errors, "Page-Cache errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } else rrdset_next(st_errors); rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]); rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]); + rrddim_set_by_pointer(st_errors, rd_pg_cache_warnings, (collected_number)stats_array[33]); + rrddim_set_by_pointer(st_errors, rd_pg_cache_errors, (collected_number)stats_array[34]); rrdset_done(st_errors); } diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index 36d917541d..42f59b59f3 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -5,6 +5,8 @@ rrdeng_stats_t global_io_errors = 0; rrdeng_stats_t global_fs_errors = 0; +rrdeng_stats_t global_pg_cache_warnings = 0; +rrdeng_stats_t global_pg_cache_errors = 0; rrdeng_stats_t rrdeng_reserved_file_descriptors = 0; void sanity_check(void) diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 6f6a6f8ffd..6447a685bd 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -148,12 +148,25 @@ struct rrdengine_statistics { rrdeng_stats_t page_cache_descriptors; rrdeng_stats_t io_errors; rrdeng_stats_t fs_errors; + rrdeng_stats_t pg_cache_warnings; + rrdeng_stats_t pg_cache_errors; }; /* I/O errors global counter */ extern rrdeng_stats_t global_io_errors; /* File-System errors global counter */ extern rrdeng_stats_t global_fs_errors; +/* + * Page cache warnings global counter. + * Some page cache instance is near critical utilization where metrics will fail to be stored. + */ +extern rrdeng_stats_t global_pg_cache_warnings; +/* + * Page cache errors global counter. + * Some page cache instance has hit critical utilization where metrics failed to be stored as a deadlock resolution + * measure. + */ +extern rrdeng_stats_t global_pg_cache_errors; /* number of File-Descriptors that have been reserved by dbengine */ extern rrdeng_stats_t rrdeng_reserved_file_descriptors; diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index bf373f31c7..26fbe8f7ef 100644 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -95,9 +95,8 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd) if (likely(descr->page_length)) { int ret, page_is_empty; -#ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); -#endif + if (handle->prev_descr) { /* unpin old second page */ pg_cache_put(ctx, handle->prev_descr); @@ -192,9 +191,26 @@ void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number n if (unlikely(INVALID_TIME == descr->start_time)) { descr->start_time = point_in_time; -#ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_producers, 1); -#endif + + if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->max_cache_pages)) { + if (0 == (unsigned long)ctx->stats.pg_cache_errors) { + /* only print the first time */ + error("Deadlock detected in dbengine instance \"%s\", metric data will not be stored in the database" + ", please increase page cache size.", ctx->dbfiles_path); + } + rrd_stat_atomic_add(&ctx->stats.pg_cache_errors, 1); + rrd_stat_atomic_add(&global_pg_cache_errors, 1); + /* Resolve deadlock */ + descr->page_length = 0; /* make sure the page descriptor is deconstructed */ + rrdeng_store_metric_flush_current_page(rd); + rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); + return; + } else if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->cache_pages_low_watermark)) { + rrd_stat_atomic_add(&ctx->stats.pg_cache_warnings, 1); + rrd_stat_atomic_add(&global_pg_cache_warnings, 1); + } + pg_cache_insert(ctx, handle->page_index, descr); } else { pg_cache_add_new_metric_time(handle->page_index, descr); @@ -672,7 +688,7 @@ void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_i * You must not change the indices of the statistics or user code will break. * You must not exceed RRDENG_NR_STATS or it will crash. */ -void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array) +void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -709,7 +725,9 @@ void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long array[30] = (uint64_t)global_io_errors; array[31] = (uint64_t)global_fs_errors; array[32] = (uint64_t)rrdeng_reserved_file_descriptors; - assert(RRDENG_NR_STATS == 33); + array[33] = (uint64_t)global_pg_cache_warnings; + array[34] = (uint64_t)global_pg_cache_errors; + assert(RRDENG_NR_STATS == 35); } /* Releases reference to page */ diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index 9b1ab18742..97e358199e 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -8,7 +8,7 @@ #define RRDENG_MIN_PAGE_CACHE_SIZE_MB (32) #define RRDENG_MIN_DISK_SPACE_MB (256) -#define RRDENG_NR_STATS (33) +#define RRDENG_NR_STATS (35) #define RRDENG_FD_BUDGET_PER_INSTANCE (50) @@ -41,7 +41,7 @@ extern int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_han extern void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle); extern time_t rrdeng_metric_latest_time(RRDDIM *rd); extern time_t rrdeng_metric_oldest_time(RRDDIM *rd); -extern void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array); +extern void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array); /* must call once before using anything */ extern int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 956abf294c..eb34562dc4 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -1,26 +1,51 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: 10min_dbengine_global_fs_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of FS errors - units: errors - every: 10s - crit: $this > 0 - delay: down 15m multiplier 1.5 max 1h - info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) - to: sysadmin + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin - alarm: 10min_dbengine_global_io_errors - on: netdata.dbengine_global_errors - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of I/O errors - units: errors - every: 10s - crit: $this > 0 - delay: down 1h multiplier 1.5 max 3h - info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) - to: sysadmin + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + to: sysadmin + + alarm: 10min_dbengine_global_page_cache_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + units: errors + every: 10s +lookup: sum -10m unaligned of Page-Cache errors + crit: $this > 0 +repeat: warning 120s critical 10s + delay: down 1h multiplier 1.5 max 3h + info: number of deadlocks dbengine resolved the last 10 minutes due to insufficient page cache size, metrics have been lost + to: sysadmin + + alarm: 10min_dbengine_global_page_cache_warnings + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + units: errors + every: 10s +lookup: sum -10m unaligned of Page-Cache warnings + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times dbengine almost deadlocked the last 10 minutes due to insufficient page cache size + to: sysadmin -- cgit v1.2.3