From 9f63bef4db69ee67735078570af62dfed208e510 Mon Sep 17 00:00:00 2001 From: Vladimir Kobal Date: Wed, 14 Apr 2021 12:17:38 +0300 Subject: Add a chart for out of memory kills (#10880) Co-authored-by: Joel Hans Co-authored-by: Ilya Mashchenko --- collectors/all.h | 5 ++- collectors/proc.plugin/README.md | 22 ++++++++++ collectors/proc.plugin/proc_vmstat.c | 83 +++++++++++++++++++++++++++++------- 3 files changed, 93 insertions(+), 17 deletions(-) (limited to 'collectors') diff --git a/collectors/all.h b/collectors/all.h index 295261b56a..9443ec6b21 100644 --- a/collectors/all.h +++ b/collectors/all.h @@ -80,8 +80,9 @@ // Memory Section - 1xxx #define NETDATA_CHART_PRIO_MEM_SYSTEM_AVAILABLE 1010 -#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1020 -#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1030 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL 1020 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1030 +#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1040 #define NETDATA_CHART_PRIO_MEM_KERNEL 1100 #define NETDATA_CHART_PRIO_MEM_SLAB 1200 #define NETDATA_CHART_PRIO_MEM_HUGEPAGES 1250 diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md index 085afb4fb6..065f9a0380 100644 --- a/collectors/proc.plugin/README.md +++ b/collectors/proc.plugin/README.md @@ -291,6 +291,28 @@ each state. `schedstat filename to monitor`, `cpuidle name filename to monitor`, and `cpuidle time filename to monitor` in the `[plugin:proc:/proc/stat]` configuration section +## Monitoring memory + +### Monitored memory metrics + +- Amount of memory swapped in/out +- Amount of memory paged from/to disk +- Number of memory page faults +- Number of out of memory kills +- Number of NUMA events + +### Configuration + +```conf +[plugin:proc:/proc/vmstat] + filename to monitor = /proc/vmstat + swap i/o = auto + disk i/o = yes + memory page faults = yes + out of memory kills = yes + system-wide numa metric summary = auto +``` + ## Monitoring Network Interfaces ### Monitored network interface metrics diff --git a/collectors/proc.plugin/proc_vmstat.c b/collectors/proc.plugin/proc_vmstat.c index 7def02ddff..c1a137161e 100644 --- a/collectors/proc.plugin/proc_vmstat.c +++ b/collectors/proc.plugin/proc_vmstat.c @@ -4,11 +4,13 @@ #define PLUGIN_PROC_MODULE_VMSTAT_NAME "/proc/vmstat" +#define OOM_KILL_STRING "oom_kill" + int do_proc_vmstat(int update_every, usec_t dt) { (void)dt; static procfile *ff = NULL; - static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_numa = -1; + static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_oom_kill = -1, do_numa = -1; static int has_numa = -1; static ARL_BASE *arl_base = NULL; @@ -27,11 +29,25 @@ int do_proc_vmstat(int update_every, usec_t dt) { static unsigned long long pgpgout = 0ULL; static unsigned long long pswpin = 0ULL; static unsigned long long pswpout = 0ULL; + static unsigned long long oom_kill = 0ULL; + + if(unlikely(!ff)) { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat"); + ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT); + if(unlikely(!ff)) return 1; + } + + ff = procfile_readall(ff); + if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time + + size_t lines = procfile_lines(ff), l; if(unlikely(!arl_base)) { do_swapio = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "swap i/o", CONFIG_BOOLEAN_AUTO); - do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", 1); - do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", 1); + do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", CONFIG_BOOLEAN_YES); + do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", CONFIG_BOOLEAN_YES); + do_oom_kill = config_get_boolean("plugin:proc:/proc/vmstat", "out of memory kills", CONFIG_BOOLEAN_AUTO); do_numa = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "system-wide numa metric summary", CONFIG_BOOLEAN_AUTO); @@ -43,6 +59,20 @@ int do_proc_vmstat(int update_every, usec_t dt) { arl_expect(arl_base, "pswpin", &pswpin); arl_expect(arl_base, "pswpout", &pswpout); + int has_oom_kill = 0; + + for (l = 0; l < lines; l++) { + if (!strcmp(procfile_lineword(ff, l, 0), OOM_KILL_STRING)) { + has_oom_kill = 1; + break; + } + } + + if (has_oom_kill) + arl_expect(arl_base, OOM_KILL_STRING, &oom_kill); + else + do_oom_kill = CONFIG_BOOLEAN_NO; + if(do_numa == CONFIG_BOOLEAN_YES || (do_numa == CONFIG_BOOLEAN_AUTO && (get_numa_node_count() >= 2 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { @@ -66,18 +96,6 @@ int do_proc_vmstat(int update_every, usec_t dt) { } } - if(unlikely(!ff)) { - char filename[FILENAME_MAX + 1]; - snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat"); - ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT); - if(unlikely(!ff)) return 1; - } - - ff = procfile_readall(ff); - if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time - - size_t lines = procfile_lines(ff), l; - arl_begin(arl_base); for(l = 0; l < lines ;l++) { size_t words = procfile_linewords(ff, l); @@ -193,6 +211,41 @@ int do_proc_vmstat(int update_every, usec_t dt) { rrdset_done(st_pgfaults); } + // -------------------------------------------------------------------- + + if (do_oom_kill == CONFIG_BOOLEAN_YES || + (do_oom_kill == CONFIG_BOOLEAN_AUTO && (oom_kill || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { + static RRDSET *st_oom_kill = NULL; + static RRDDIM *rd_oom_kill = NULL; + + do_oom_kill = CONFIG_BOOLEAN_YES; + + if(unlikely(!st_oom_kill)) { + st_oom_kill = rrdset_create_localhost( + "mem" + , "oom_kill" + , NULL + , "system" + , NULL + , "Out of Memory Kills" + , "kills/s" + , PLUGIN_PROC_NAME + , PLUGIN_PROC_MODULE_VMSTAT_NAME + , NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL + , update_every + , RRDSET_TYPE_LINE + ); + + rrdset_flag_set(st_oom_kill, RRDSET_FLAG_DETAIL); + + rd_oom_kill = rrddim_add(st_oom_kill, "kills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + else rrdset_next(st_oom_kill); + + rrddim_set_by_pointer(st_oom_kill, rd_oom_kill, oom_kill); + rrdset_done(st_oom_kill); + } + // -------------------------------------------------------------------- // Ondemand criteria for NUMA. Since this won't change at run time, we -- cgit v1.2.3