Drop dirty dbengine pages if disk cannot keep up (#7777)

* Introduce dirty-page pressure handling in the dbengine page cache: dirty pages are invalidated (dropped) when the disk cannot flush them fast enough.
author: Markos Fountoulakis <44345837+mfundul@users.noreply.github.com> 2020-02-06 21:58:13 +0200
committer: GitHub <noreply@github.com> 2020-02-06 21:58:13 +0200
commit: 6b119d9170fce726e9a5720edc83f6d9ac88e7ce (patch)
tree: 90dc7094ba92af299f5e7c0532e519f706b47d92 /health
parent: b2b3c182548fe81e6d1c9a599b2571dabfdabcaa (diff)
1 file changed, 19 insertions, 7 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index ce9839ef1d..274673e3e3 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -5,7 +5,7 @@
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of FS errors
+lookup: sum -10m unaligned of fs_errors
  units: errors
  every: 10s
   crit: $this > 0
@@ -17,7 +17,7 @@ lookup: sum -10m unaligned of FS errors
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of I/O errors
+lookup: sum -10m unaligned of io_errors
  units: errors
  every: 10s
   crit: $this > 0
@@ -25,14 +25,26 @@ lookup: sum -10m unaligned of I/O errors
   info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
     to: sysadmin
 
- alarm: 10min_dbengine_global_flushing_errors
+ alarm: 10min_dbengine_global_flushing_warnings
     on: netdata.dbengine_global_errors
     os: linux freebsd macos
  hosts: *
-lookup: sum -10m unaligned of flushing errors
+lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
  units: errors
- every: 3s
-  crit: $this > 0
+ every: 10s
+  warn: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+    to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+    on: netdata.dbengine_long_term_page_stats
+    os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing_pressure_deletions
+ units: pages
+ every: 10s
+  crit: $this != 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk
+  info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
     to: sysadmin
author	Markos Fountoulakis <44345837+mfundul@users.noreply.github.com>	2020-02-06 21:58:13 +0200
committer	GitHub <noreply@github.com>	2020-02-06 21:58:13 +0200
commit	6b119d9170fce726e9a5720edc83f6d9ac88e7ce (patch)
tree	90dc7094ba92af299f5e7c0532e519f706b47d92 /health
parent	b2b3c182548fe81e6d1c9a599b2571dabfdabcaa (diff)