From 2728be8b0614fd58d4b530a32fee164509e88fd8 Mon Sep 17 00:00:00 2001
From: Markos Fountoulakis <44345837+mfundul@users.noreply.github.com>
Date: Tue, 24 Sep 2019 11:59:15 +0300
Subject: Detect deadlock in dbengine page cache (#6911)

* Detect deadlock in dbengine page cache when there are too many metrics and print error message

* Resolve dbengine deadlock by dropping metrics when page cache is too small and define relevant alarms

* Changed printing deadlock errors to only happen once per dbengine instance
---
 health/health.d/dbengine.conf | 69 +++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 22 deletions(-)

(limited to 'health')

diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 956abf294c..eb34562dc4 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -1,26 +1,51 @@
 
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-   alarm: 10min_dbengine_global_fs_errors
-      on: netdata.dbengine_global_errors
-      os: linux freebsd macos
-   hosts: *
-  lookup: sum -10m unaligned of FS errors
-   units: errors
-   every: 10s
-    crit: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
-      to: sysadmin
+ alarm: 10min_dbengine_global_fs_errors
+    on: netdata.dbengine_global_errors
+    os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of FS errors
+ units: errors
+ every: 10s
+  crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+  info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+    to: sysadmin
 
-   alarm: 10min_dbengine_global_io_errors
-      on: netdata.dbengine_global_errors
-      os: linux freebsd macos
-   hosts: *
-  lookup: sum -10m unaligned of I/O errors
-   units: errors
-   every: 10s
-    crit: $this > 0
-   delay: down 1h multiplier 1.5 max 3h
-    info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
-      to: sysadmin
+ alarm: 10min_dbengine_global_io_errors
+    on: netdata.dbengine_global_errors
+    os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of I/O errors
+ units: errors
+ every: 10s
+  crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+    to: sysadmin
+
+ alarm: 10min_dbengine_global_page_cache_errors
+    on: netdata.dbengine_global_errors
+    os: linux freebsd macos
+ hosts: *
+ units: errors
+ every: 10s
+lookup: sum -10m unaligned of Page-Cache errors
+  crit: $this > 0
+repeat: warning 120s critical 10s
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of deadlocks dbengine resolved the last 10 minutes due to insufficient page cache size, metrics have been lost
+    to: sysadmin
+
+ alarm: 10min_dbengine_global_page_cache_warnings
+    on: netdata.dbengine_global_errors
+    os: linux freebsd macos
+ hosts: *
+ units: errors
+ every: 10s
+lookup: sum -10m unaligned of Page-Cache warnings
+  warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of times dbengine almost deadlocked the last 10 minutes due to insufficient page cache size
+    to: sysadmin
-- 
cgit v1.2.3