diff options
author | Vladimir Kobal <vlad@prokk.net> | 2021-04-14 12:17:38 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-14 12:17:38 +0300 |
commit | 9f63bef4db69ee67735078570af62dfed208e510 (patch) | |
tree | 333b5fb6f02e41737ca283b44f6d2ce622a551c2 | |
parent | 317bced458e8dcbb8cf96bd0b1a4bbe814534460 (diff) |
Add a chart for out of memory kills (#10880)
Co-authored-by: Joel Hans <joel.g.hans@gmail.com>
Co-authored-by: Ilya Mashchenko <ilya@netdata.cloud>
-rw-r--r-- | collectors/all.h | 5 | ||||
-rw-r--r-- | collectors/proc.plugin/README.md | 22 | ||||
-rw-r--r-- | collectors/proc.plugin/proc_vmstat.c | 83 | ||||
-rw-r--r-- | health/health.d/ram.conf | 12 |
4 files changed, 105 insertions, 17 deletions
diff --git a/collectors/all.h b/collectors/all.h index 295261b56a..9443ec6b21 100644 --- a/collectors/all.h +++ b/collectors/all.h @@ -80,8 +80,9 @@ // Memory Section - 1xxx #define NETDATA_CHART_PRIO_MEM_SYSTEM_AVAILABLE 1010 -#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1020 -#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1030 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL 1020 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1030 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1040 #define NETDATA_CHART_PRIO_MEM_KERNEL 1100 #define NETDATA_CHART_PRIO_MEM_SLAB 1200 #define NETDATA_CHART_PRIO_MEM_HUGEPAGES 1250 diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md index 085afb4fb6..065f9a0380 100644 --- a/collectors/proc.plugin/README.md +++ b/collectors/proc.plugin/README.md @@ -291,6 +291,28 @@ each state. `schedstat filename to monitor`, `cpuidle name filename to monitor`, and `cpuidle time filename to monitor` in the `[plugin:proc:/proc/stat]` configuration section +## Monitoring memory + +### Monitored memory metrics + +- Amount of memory swapped in/out +- Amount of memory paged from/to disk +- Number of memory page faults +- Number of out of memory kills +- Number of NUMA events + +### Configuration + +```conf +[plugin:proc:/proc/vmstat] + filename to monitor = /proc/vmstat + swap i/o = auto + disk i/o = yes + memory page faults = yes + out of memory kills = yes + system-wide numa metric summary = auto +``` + ## Monitoring Network Interfaces ### Monitored network interface metrics diff --git a/collectors/proc.plugin/proc_vmstat.c b/collectors/proc.plugin/proc_vmstat.c index 7def02ddff..c1a137161e 100644 --- a/collectors/proc.plugin/proc_vmstat.c +++ b/collectors/proc.plugin/proc_vmstat.c @@ -4,11 +4,13 @@ #define PLUGIN_PROC_MODULE_VMSTAT_NAME "/proc/vmstat" +#define OOM_KILL_STRING "oom_kill" + int do_proc_vmstat(int update_every, usec_t dt) { (void)dt; static procfile *ff = NULL; - static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_numa = -1; + static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_oom_kill = -1, do_numa = -1; static int has_numa = -1; static ARL_BASE *arl_base = NULL; @@ -27,11 +29,25 @@ int do_proc_vmstat(int update_every, usec_t dt) { static unsigned long long pgpgout = 0ULL; static unsigned long long pswpin = 0ULL; static unsigned long long pswpout = 0ULL; + static unsigned long long oom_kill = 0ULL; + + if(unlikely(!ff)) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat"); + ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT); + if(unlikely(!ff)) return 1; + } + + ff = procfile_readall(ff); + if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time + + size_t lines = procfile_lines(ff), l; if(unlikely(!arl_base)) { do_swapio = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "swap i/o", CONFIG_BOOLEAN_AUTO); - do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", 1); - do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", 1); + do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", CONFIG_BOOLEAN_YES); + do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", CONFIG_BOOLEAN_YES); + do_oom_kill = config_get_boolean("plugin:proc:/proc/vmstat", "out of memory kills", CONFIG_BOOLEAN_AUTO); do_numa = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "system-wide numa metric summary", CONFIG_BOOLEAN_AUTO); @@ -43,6 +59,20 @@ int do_proc_vmstat(int update_every, usec_t dt) { arl_expect(arl_base, "pswpin", &pswpin); arl_expect(arl_base, "pswpout", &pswpout); + int has_oom_kill = 0; + + for (l = 0; l < lines; l++) { + if (!strcmp(procfile_lineword(ff, l, 0), OOM_KILL_STRING)) { + has_oom_kill = 1; + break; + } + } + + if (has_oom_kill) + arl_expect(arl_base, OOM_KILL_STRING, &oom_kill); + else + do_oom_kill = CONFIG_BOOLEAN_NO; + if(do_numa == CONFIG_BOOLEAN_YES || (do_numa == CONFIG_BOOLEAN_AUTO && (get_numa_node_count() >= 2 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { @@ -66,18 +96,6 @@ int do_proc_vmstat(int update_every, usec_t dt) { } } - if(unlikely(!ff)) { - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat"); - ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT); - if(unlikely(!ff)) return 1; - } - - ff = procfile_readall(ff); - if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time - - size_t lines = procfile_lines(ff), l; - arl_begin(arl_base); for(l = 0; l < lines ;l++) { size_t words = procfile_linewords(ff, l); @@ -193,6 +211,41 @@ int do_proc_vmstat(int update_every, usec_t dt) { rrdset_done(st_pgfaults); } + // -------------------------------------------------------------------- + + if (do_oom_kill == CONFIG_BOOLEAN_YES || + (do_oom_kill == CONFIG_BOOLEAN_AUTO && (oom_kill || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { + static RRDSET *st_oom_kill = NULL; + static RRDDIM *rd_oom_kill = NULL; + + do_oom_kill = CONFIG_BOOLEAN_YES; + + if(unlikely(!st_oom_kill)) { + st_oom_kill = rrdset_create_localhost( + "mem" + , "oom_kill" + , NULL + , "system" + , NULL + , "Out of Memory Kills" + , "kills/s" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_VMSTAT_NAME + , NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(st_oom_kill, RRDSET_FLAG_DETAIL); + + rd_oom_kill = rrddim_add(st_oom_kill, "kills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + else rrdset_next(st_oom_kill); + + rrddim_set_by_pointer(st_oom_kill, rd_oom_kill, oom_kill); + rrdset_done(st_oom_kill); + } + // -------------------------------------------------------------------- // Ondemand criteria for NUMA. Since this won't change at run time, we diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 2daecc4895..57cf0d95d1 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -37,6 +37,18 @@ info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin + alarm: oom_kill + on: mem.oom_kill + os: linux + hosts: * + lookup: sum -1m unaligned + units: kills + every: 10s + warn: $this > 0 + delay: down 5m + info: number of out of memory kills in the last minute + to: sysadmin + ## FreeBSD alarm: ram_in_use on: system.ram |