summary | refs | log | tree | commit | diff | stats
path: root/collectors
diff options
context:
space:
mode:
author Vladimir Kobal <vlad@prokk.net> 2021-04-14 12:17:38 +0300
committer GitHub <noreply@github.com> 2021-04-14 12:17:38 +0300
commit 9f63bef4db69ee67735078570af62dfed208e510 (patch)
tree 333b5fb6f02e41737ca283b44f6d2ce622a551c2 /collectors
parent 317bced458e8dcbb8cf96bd0b1a4bbe814534460 (diff)
Add a chart for out of memory kills (#10880)
Co-authored-by: Joel Hans <joel.g.hans@gmail.com>
Co-authored-by: Ilya Mashchenko <ilya@netdata.cloud>
Diffstat (limited to 'collectors')
-rw-r--r-- collectors/all.h 5
-rw-r--r-- collectors/proc.plugin/README.md 22
-rw-r--r-- collectors/proc.plugin/proc_vmstat.c 83
3 files changed, 93 insertions, 17 deletions
diff --git a/collectors/all.h b/collectors/all.h
index 295261b56a..9443ec6b21 100644
--- a/collectors/all.h
+++ b/collectors/all.h
@@ -80,8 +80,9 @@
// Memory Section - 1xxx
#define NETDATA_CHART_PRIO_MEM_SYSTEM_AVAILABLE 1010
-#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1020
-#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1030
+#define NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL 1020
+#define NETDATA_CHART_PRIO_MEM_SYSTEM_COMMITTED 1030
+#define NETDATA_CHART_PRIO_MEM_SYSTEM_PGFAULTS 1040
#define NETDATA_CHART_PRIO_MEM_KERNEL 1100
#define NETDATA_CHART_PRIO_MEM_SLAB 1200
#define NETDATA_CHART_PRIO_MEM_HUGEPAGES 1250
diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md
index 085afb4fb6..065f9a0380 100644
--- a/collectors/proc.plugin/README.md
+++ b/collectors/proc.plugin/README.md
@@ -291,6 +291,28 @@ each state.
`schedstat filename to monitor`, `cpuidle name filename to monitor`, and `cpuidle time filename to monitor` in the `[plugin:proc:/proc/stat]` configuration section
+## Monitoring memory
+
+### Monitored memory metrics
+
+- Amount of memory swapped in/out
+- Amount of memory paged from/to disk
+- Number of memory page faults
+- Number of out of memory kills
+- Number of NUMA events
+
+### Configuration
+
+```conf
+[plugin:proc:/proc/vmstat]
+ filename to monitor = /proc/vmstat
+ swap i/o = auto
+ disk i/o = yes
+ memory page faults = yes
+ out of memory kills = yes
+ system-wide numa metric summary = auto
+```
+
## Monitoring Network Interfaces
### Monitored network interface metrics
diff --git a/collectors/proc.plugin/proc_vmstat.c b/collectors/proc.plugin/proc_vmstat.c
index 7def02ddff..c1a137161e 100644
--- a/collectors/proc.plugin/proc_vmstat.c
+++ b/collectors/proc.plugin/proc_vmstat.c
@@ -4,11 +4,13 @@
#define PLUGIN_PROC_MODULE_VMSTAT_NAME "/proc/vmstat"
+#define OOM_KILL_STRING "oom_kill"
+
int do_proc_vmstat(int update_every, usec_t dt) {
(void)dt;
static procfile *ff = NULL;
- static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_numa = -1;
+ static int do_swapio = -1, do_io = -1, do_pgfaults = -1, do_oom_kill = -1, do_numa = -1;
static int has_numa = -1;
static ARL_BASE *arl_base = NULL;
@@ -27,11 +29,25 @@ int do_proc_vmstat(int update_every, usec_t dt) {
static unsigned long long pgpgout = 0ULL;
static unsigned long long pswpin = 0ULL;
static unsigned long long pswpout = 0ULL;
+ static unsigned long long oom_kill = 0ULL;
+
+ if(unlikely(!ff)) {
+ char filename[FILENAME_MAX + 1];
+ snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat");
+ ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT);
+ if(unlikely(!ff)) return 1;
+ }
+
+ ff = procfile_readall(ff);
+ if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time
+
+ size_t lines = procfile_lines(ff), l;
if(unlikely(!arl_base)) {
do_swapio = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "swap i/o", CONFIG_BOOLEAN_AUTO);
- do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", 1);
- do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", 1);
+ do_io = config_get_boolean("plugin:proc:/proc/vmstat", "disk i/o", CONFIG_BOOLEAN_YES);
+ do_pgfaults = config_get_boolean("plugin:proc:/proc/vmstat", "memory page faults", CONFIG_BOOLEAN_YES);
+ do_oom_kill = config_get_boolean("plugin:proc:/proc/vmstat", "out of memory kills", CONFIG_BOOLEAN_AUTO);
do_numa = config_get_boolean_ondemand("plugin:proc:/proc/vmstat", "system-wide numa metric summary", CONFIG_BOOLEAN_AUTO);
@@ -43,6 +59,20 @@ int do_proc_vmstat(int update_every, usec_t dt) {
arl_expect(arl_base, "pswpin", &pswpin);
arl_expect(arl_base, "pswpout", &pswpout);
+ int has_oom_kill = 0;
+
+ for (l = 0; l < lines; l++) {
+ if (!strcmp(procfile_lineword(ff, l, 0), OOM_KILL_STRING)) {
+ has_oom_kill = 1;
+ break;
+ }
+ }
+
+ if (has_oom_kill)
+ arl_expect(arl_base, OOM_KILL_STRING, &oom_kill);
+ else
+ do_oom_kill = CONFIG_BOOLEAN_NO;
+
if(do_numa == CONFIG_BOOLEAN_YES || (do_numa == CONFIG_BOOLEAN_AUTO &&
(get_numa_node_count() >= 2 ||
netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
@@ -66,18 +96,6 @@ int do_proc_vmstat(int update_every, usec_t dt) {
}
}
- if(unlikely(!ff)) {
- char filename[FILENAME_MAX + 1];
- snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/vmstat");
- ff = procfile_open(config_get("plugin:proc:/proc/vmstat", "filename to monitor", filename), " \t:", PROCFILE_FLAG_DEFAULT);
- if(unlikely(!ff)) return 1;
- }
-
- ff = procfile_readall(ff);
- if(unlikely(!ff)) return 0; // we return 0, so that we will retry to open it next time
-
- size_t lines = procfile_lines(ff), l;
-
arl_begin(arl_base);
for(l = 0; l < lines ;l++) {
size_t words = procfile_linewords(ff, l);
@@ -193,6 +211,41 @@ int do_proc_vmstat(int update_every, usec_t dt) {
rrdset_done(st_pgfaults);
}
+ // --------------------------------------------------------------------
+
+ if (do_oom_kill == CONFIG_BOOLEAN_YES ||
+ (do_oom_kill == CONFIG_BOOLEAN_AUTO && (oom_kill || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) {
+ static RRDSET *st_oom_kill = NULL;
+ static RRDDIM *rd_oom_kill = NULL;
+
+ do_oom_kill = CONFIG_BOOLEAN_YES;
+
+ if(unlikely(!st_oom_kill)) {
+ st_oom_kill = rrdset_create_localhost(
+ "mem"
+ , "oom_kill"
+ , NULL
+ , "system"
+ , NULL
+ , "Out of Memory Kills"
+ , "kills/s"
+ , PLUGIN_PROC_NAME
+ , PLUGIN_PROC_MODULE_VMSTAT_NAME
+ , NETDATA_CHART_PRIO_MEM_SYSTEM_OOM_KILL
+ , update_every
+ , RRDSET_TYPE_LINE
+ );
+
+ rrdset_flag_set(st_oom_kill, RRDSET_FLAG_DETAIL);
+
+ rd_oom_kill = rrddim_add(st_oom_kill, "kills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ }
+ else rrdset_next(st_oom_kill);
+
+ rrddim_set_by_pointer(st_oom_kill, rd_oom_kill, oom_kill);
+ rrdset_done(st_oom_kill);
+ }
+
// --------------------------------------------------------------------
// Ondemand criteria for NUMA. Since this won't change at run time, we