summary | refs | log | tree | commit | diff | stats
path: root/health
diff options
context:
space:
mode:
author: Markos Fountoulakis <44345837+mfundul@users.noreply.github.com>, 2019-09-24 11:59:15 +0300
committer: GitHub <noreply@github.com>, 2019-09-24 11:59:15 +0300
commit: 2728be8b0614fd58d4b530a32fee164509e88fd8 (patch)
tree: abd39375b249b832327e35880597f9974be81b22 /health
parent: 7977137cee3274d2535fbdb49ec2d68941285b82 (diff)
Detect deadlock in dbengine page cache (#6911)
* Detect deadlock in dbengine page cache when there are too many metrics and print error message
* Resolve dbengine deadlock by dropping metrics when page cache is too small and define relevant alarms
* Changed printing deadlock errors to only happen once per dbengine instance
Diffstat (limited to 'health')
-rw-r--r-- health/health.d/dbengine.conf 69
1 files changed, 47 insertions, 22 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 956abf294c..eb34562dc4 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -1,26 +1,51 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: 10min_dbengine_global_fs_errors
- on: netdata.dbengine_global_errors
- os: linux freebsd macos
- hosts: *
- lookup: sum -10m unaligned of FS errors
- units: errors
- every: 10s
- crit: $this > 0
- delay: down 15m multiplier 1.5 max 1h
- info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
- to: sysadmin
+ alarm: 10min_dbengine_global_fs_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of FS errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+ to: sysadmin
- alarm: 10min_dbengine_global_io_errors
- on: netdata.dbengine_global_errors
- os: linux freebsd macos
- hosts: *
- lookup: sum -10m unaligned of I/O errors
- units: errors
- every: 10s
- crit: $this > 0
- delay: down 1h multiplier 1.5 max 3h
- info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
- to: sysadmin
+ alarm: 10min_dbengine_global_io_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of I/O errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_page_cache_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ units: errors
+ every: 10s
+lookup: sum -10m unaligned of Page-Cache errors
+ crit: $this > 0
+repeat: warning 120s critical 10s
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of deadlocks dbengine resolved the last 10 minutes due to insufficient page cache size, metrics have been lost
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_page_cache_warnings
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ units: errors
+ every: 10s
+lookup: sum -10m unaligned of Page-Cache warnings
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of times dbengine almost deadlocked the last 10 minutes due to insufficient page cache size
+ to: sysadmin