summary refs log tree commit diff stats
path: root/health
diff options
context:
space:
mode:
author Vladimir Kobal <vlad@prokk.net> 2019-02-11 13:35:42 +0200
committer GitHub <noreply@github.com> 2019-02-11 13:35:42 +0200
commit 30f7324a6cdeaefbfee2960ffcc2a3b60e58c168 (patch)
tree 8399a01558c370a2ca095f8d2df8123f711045b1 /health
parent 2f6f8155dba6951256f5f8e080aafca6e6836dfc (diff)
Add cgroup cpu and memory limits and alarms (#5172)
* Add memory limit variables
* Add memory usage alarms
* Add CPU limit variables
* Add cpu usage alarm
* Fix quota calculation, minor cleanup
* Update the documentation
* Add charts with limits
* Fix Codacy issues
* Change units for the mem_usage_limit chart
* Change the behaviour of the cpu_limit chart
Diffstat (limited to 'health')
-rw-r--r-- health/Makefile.am 1
-rw-r--r-- health/health.d/cgroups.conf 41
2 files changed, 42 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 40592a964e..4c4cdd7a03 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -31,6 +31,7 @@ dist_healthconfig_DATA = \
health.d/boinc.conf \
health.d/btrfs.conf \
health.d/ceph.conf \
+ health.d/cgroups.conf \
health.d/cpu.conf \
health.d/couchdb.conf \
health.d/disks.conf \
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
new file mode 100644
index 0000000000..79ece53f97
--- /dev/null
+++ b/health/health.d/cgroups.conf
@@ -0,0 +1,41 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: cgroup_10min_cpu_usage
+ on: cgroup.cpu_limit
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cpu utilization for the last 10 minutes
+ to: sysadmin
+
+template: cgroup_ram_in_use
+ on: cgroup.mem_usage
+ os: linux
+ hosts: *
+ calc: ($ram) * 100 / $memory_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: RAM used by cgroup
+ to: sysadmin
+
+template: cgroup_ram_and_swap_in_use
+ on: cgroup.mem_usage
+ os: linux
+ hosts: *
+ calc: ($ram + $swap) * 100 / $memory_and_swap_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: RAM and Swap used by cgroup
+ to: sysadmin