diff options
author | Costa Tsaousis <costa@netdata.cloud> | 2023-07-21 16:48:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-21 16:48:01 +0300 |
commit | 173a3f9bedeb2b8d367e908fdf99a46af19b6f1a (patch) | |
tree | afead499b661da8f12ef26a51ea6ae086f097200 /collectors/proc.plugin | |
parent | 63536a73aa7d98dfe5281956fa159495da7a9750 (diff) |
Memory Controller (MC) and DIMM Error Detection And Correction (EDAC) (#15473)
Co-authored-by: ilyam8 <ilya@netdata.cloud>
Diffstat (limited to 'collectors/proc.plugin')
-rw-r--r-- | collectors/proc.plugin/multi_metadata.yaml | 81 | ||||
-rw-r--r-- | collectors/proc.plugin/sys_devices_system_edac_mc.c | 333 |
2 files changed, 273 insertions, 141 deletions
diff --git a/collectors/proc.plugin/multi_metadata.yaml b/collectors/proc.plugin/multi_metadata.yaml index cb7172db47..04e66ddfe6 100644 --- a/collectors/proc.plugin/multi_metadata.yaml +++ b/collectors/proc.plugin/multi_metadata.yaml @@ -1307,7 +1307,7 @@ modules: plugin_name: proc.plugin module_name: /sys/devices/system/edac/mc monitored_instance: - name: System Memory Errors + name: Memory modules (DIMMs) link: "" categories: - data-collection.linux-systems.memory-metrics @@ -1319,9 +1319,10 @@ modules: description: "" keywords: - edac - - error detection and correction memory controllers - ecc + - dimm - ram + - hardware most_popular: false overview: data_collection: @@ -1362,16 +1363,22 @@ modules: problems: list: [] alerts: - - name: 1hour_ecc_memory_correctable + - name: ecc_memory_mc_noinfo_correctable + metric: mem.edac_mc + info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - metric: mem.ecc_ce - info: number of ECC correctable errors in the last 10 minutes - os: "linux" - - name: 1hour_ecc_memory_uncorrectable + - name: ecc_memory_mc_noinfo_uncorrectable + metric: mem.edac_mc + info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes + link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf + - name: ecc_memory_dimm_correctable + metric: mem.edac_mc_dimm + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes + link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf + - name: ecc_memory_dimm_uncorrectable + metric: mem.edac_mc_dimm + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes link: 
https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf - metric: mem.ecc_ue - info: number of ECC uncorrectable errors in the last 10 minutes - os: "linux" metrics: folding: title: Metrics @@ -1379,22 +1386,54 @@ modules: description: "" availability: [] scopes: - - name: global - description: "" - labels: [] + - name: memory controller + description: These metrics refer to the memory controller. + labels: + - name: controller + description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller." + - name: mc_name + description: Memory controller type. + - name: size_mb + description: The amount of memory in megabytes that this memory controller manages. + - name: max_location + description: Last available memory slot in this memory controller. metrics: - - name: mem.ecc_ce - description: ECC Memory Correctable Errors - unit: "errors" + - name: mem.edac_mc + description: Memory Controller (MC) Error Detection And Correction (EDAC) Errors + unit: errors/s chart_type: line dimensions: - - name: a dimension per mem controller - - name: mem.ecc_ue - description: ECC Memory Uncorrectable Errors - unit: "errors" + - name: correctable + - name: uncorrectable + - name: correctable_noinfo + - name: uncorrectable_noinfo + - name: memory module + description: These metrics refer to the memory module (or rank, [depends on the memory controller](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5)). + labels: + - name: controller + description: "[mcX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#mcx-directories) directory name of this memory controller." + - name: dimm + description: "[dimmX or rankX](https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#dimmx-or-rankx-directories) directory name of this memory module." + - name: dimm_dev_type + description: Type of DRAM device used in this memory module. For example, x1, x2, x4, x8. 
+ - name: dimm_edac_mode + description: Used type of error detection and correction. For example, S4ECD4ED would mean a Chipkill with x4 DRAM. + - name: dimm_label + description: Label assigned to this memory module. + - name: dimm_location + description: Location of the memory module. + - name: dimm_mem_type + description: Type of the memory module. Usually either buffered or unbuffered memory. + - name: size + description: The amount of memory in megabytes that this memory module manages. + metrics: + - name: mem.edac_mc_dimm + description: DIMM Error Detection And Correction (EDAC) Errors + unit: errors/s chart_type: line dimensions: - - name: a dimension per mem controller + - name: correctable + - name: uncorrectable - meta: plugin_name: proc.plugin module_name: /sys/devices/system/node diff --git a/collectors/proc.plugin/sys_devices_system_edac_mc.c b/collectors/proc.plugin/sys_devices_system_edac_mc.c index fdb6b51e97..ec492a526a 100644 --- a/collectors/proc.plugin/sys_devices_system_edac_mc.c +++ b/collectors/proc.plugin/sys_devices_system_edac_mc.c @@ -2,35 +2,51 @@ #include "plugin_proc.h" +struct edac_count { + bool updated; + char *filename; + procfile *ff; + kernel_uint_t count; + RRDDIM *rd; +}; + +struct edac_dimm { + char *name; + + struct edac_count ce; + struct edac_count ue; + + RRDSET *st; + + struct edac_dimm *prev, *next; +}; + struct mc { char *name; - char ce_updated; - char ue_updated; - char *ce_count_filename; - char *ue_count_filename; + struct edac_count ce; + struct edac_count ue; + struct edac_count ce_noinfo; + struct edac_count ue_noinfo; - procfile *ce_ff; - procfile *ue_ff; + RRDSET *st; - collected_number ce_count; - collected_number ue_count; + struct edac_dimm *dimms; - RRDDIM *ce_rd; - RRDDIM *ue_rd; - - struct mc *next; + struct mc *prev, *next; }; + static struct mc *mc_root = NULL; +static char *mc_dirname = NULL; static void find_all_mc() { char name[FILENAME_MAX + 1]; snprintfz(name, FILENAME_MAX, "%s%s", 
netdata_configured_host_prefix, "/sys/devices/system/edac/mc"); - char *dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name); + mc_dirname = config_get("plugin:proc:/sys/devices/system/edac/mc", "directory to monitor", name); - DIR *dir = opendir(dirname); + DIR *dir = opendir(mc_dirname); if(unlikely(!dir)) { - collector_error("Cannot read ECC memory errors directory '%s'", dirname); + collector_error("Cannot read EDAC memory errors directory '%s'", mc_dirname); return; } @@ -42,162 +58,239 @@ static void find_all_mc() { struct stat st; - snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", dirname, de->d_name); + snprintfz(name, FILENAME_MAX, "%s/%s/ce_count", mc_dirname, de->d_name); if(stat(name, &st) != -1) - m->ce_count_filename = strdupz(name); + m->ce.filename = strdupz(name); - snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", dirname, de->d_name); + snprintfz(name, FILENAME_MAX, "%s/%s/ue_count", mc_dirname, de->d_name); if(stat(name, &st) != -1) - m->ue_count_filename = strdupz(name); + m->ue.filename = strdupz(name); - if(!m->ce_count_filename && !m->ue_count_filename) { + snprintfz(name, FILENAME_MAX, "%s/%s/ce_noinfo_count", mc_dirname, de->d_name); + if(stat(name, &st) != -1) + m->ce_noinfo.filename = strdupz(name); + + snprintfz(name, FILENAME_MAX, "%s/%s/ue_noinfo_count", mc_dirname, de->d_name); + if(stat(name, &st) != -1) + m->ue_noinfo.filename = strdupz(name); + + if(!m->ce.filename && !m->ue.filename && !m->ce_noinfo.filename && !m->ue_noinfo.filename) { freez(m->name); freez(m); } - else { - m->next = mc_root; - mc_root = m; - } + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(mc_root, m, prev, next); } } - closedir(dir); -} -int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt) { - (void)dt; + for(struct mc *m = mc_root; m ;m = m->next) { + snprintfz(name, FILENAME_MAX, "%s/%s", mc_dirname, m->name); + dir = opendir(name); + if(!dir) { + collector_error("Cannot read EDAC memory errors directory 
'%s'", name); + continue; + } - if(unlikely(mc_root == NULL)) { - find_all_mc(); - if(unlikely(mc_root == NULL)) - return 1; - } + while((de = readdir(dir))) { + // it can be dimmX or rankX directory + // https://www.kernel.org/doc/html/v5.0/admin-guide/ras.html#f5 - static int do_ce = -1, do_ue = -1; - NETDATA_DOUBLE ce_sum = 0, ue_sum = 0; - struct mc *m; + if (de->d_type == DT_DIR && + ((strncmp(de->d_name, "rank", 4) == 0 || strncmp(de->d_name, "dimm", 4) == 0)) && + isdigit(de->d_name[4])) { - if(unlikely(do_ce == -1)) { - do_ce = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory correctable errors", CONFIG_BOOLEAN_YES); - do_ue = config_get_boolean_ondemand("plugin:proc:/sys/devices/system/edac/mc", "enable ECC memory uncorrectable errors", CONFIG_BOOLEAN_YES); - } + struct edac_dimm *d = callocz(1, sizeof(struct edac_dimm)); + d->name = strdupz(de->d_name); - if(do_ce != CONFIG_BOOLEAN_NO) { - for(m = mc_root; m; m = m->next) { - if(m->ce_count_filename) { - m->ce_updated = 0; + struct stat st; - if(unlikely(!m->ce_ff)) { - m->ce_ff = procfile_open(m->ce_count_filename, " \t", PROCFILE_FLAG_DEFAULT); - if(unlikely(!m->ce_ff)) - continue; - } + snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ce_count", mc_dirname, m->name, de->d_name); + if(stat(name, &st) != -1) + d->ce.filename = strdupz(name); - m->ce_ff = procfile_readall(m->ce_ff); - if(unlikely(!m->ce_ff || procfile_lines(m->ce_ff) < 1 || procfile_linewords(m->ce_ff, 0) < 1)) - continue; + snprintfz(name, FILENAME_MAX, "%s/%s/%s/dimm_ue_count", mc_dirname, m->name, de->d_name); + if(stat(name, &st) != -1) + d->ue.filename = strdupz(name); - m->ce_count = str2ull(procfile_lineword(m->ce_ff, 0, 0), NULL); - ce_sum += m->ce_count; - m->ce_updated = 1; + if(!d->ce.filename && !d->ue.filename) { + freez(d->name); + freez(d); + } + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(m->dimms, d, prev, next); } } } +} - if(do_ue != CONFIG_BOOLEAN_NO) { - for(m = mc_root; m; m = 
m->next) { - if(m->ue_count_filename) { - m->ue_updated = 0; +static kernel_uint_t read_edac_count(struct edac_count *t) { + t->updated = false; + t->count = 0; - if(unlikely(!m->ue_ff)) { - m->ue_ff = procfile_open(m->ue_count_filename, " \t", PROCFILE_FLAG_DEFAULT); - if(unlikely(!m->ue_ff)) - continue; - } + if(t->filename) { + if(unlikely(!t->ff)) { + t->ff = procfile_open(t->filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!t->ff)) + return 0; + } - m->ue_ff = procfile_readall(m->ue_ff); - if(unlikely(!m->ue_ff || procfile_lines(m->ue_ff) < 1 || procfile_linewords(m->ue_ff, 0) < 1)) - continue; + t->ff = procfile_readall(t->ff); + if(unlikely(!t->ff || procfile_lines(t->ff) < 1 || procfile_linewords(t->ff, 0) < 1)) + return 0; - m->ue_count = str2ull(procfile_lineword(m->ue_ff, 0, 0), NULL); - ue_sum += m->ue_count; - m->ue_updated = 1; - } + t->count = str2ull(procfile_lineword(t->ff, 0, 0), NULL); + t->updated = true; + } + + return t->count; +} + +static bool read_edac_mc_file(const char *mc, const char *filename, char *out, size_t out_size) { + char f[FILENAME_MAX + 1]; + snprintfz(f, FILENAME_MAX, "%s/%s/%s", mc_dirname, mc, filename); + if(read_file(f, out, out_size) != 0) { + collector_error("EDAC: cannot read file '%s'", f); + return false; + } + return true; +} + +static bool read_edac_mc_rank_file(const char *mc, const char *rank, const char *filename, char *out, size_t out_size) { + char f[FILENAME_MAX + 1]; + snprintfz(f, FILENAME_MAX, "%s/%s/%s/%s", mc_dirname, mc, rank, filename); + if(read_file(f, out, out_size) != 0) { + collector_error("EDAC: cannot read file '%s'", f); + return false; + } + return true; +} + +int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unused) { + if(unlikely(!mc_root)) { + find_all_mc(); + + if(!mc_root) + // don't call this again + return 1; + } + + for(struct mc *m = mc_root; m; m = m->next) { + read_edac_count(&m->ce); + read_edac_count(&m->ce_noinfo); + read_edac_count(&m->ue); + 
read_edac_count(&m->ue_noinfo); + + for(struct edac_dimm *d = m->dimms; d ;d = d->next) { + read_edac_count(&d->ce); + read_edac_count(&d->ue); } } // -------------------------------------------------------------------- - if(do_ce == CONFIG_BOOLEAN_YES || (do_ce == CONFIG_BOOLEAN_AUTO && - (ce_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { - do_ce = CONFIG_BOOLEAN_YES; + for(struct mc *m = mc_root; m ; m = m->next) { + if(unlikely(!m->ce.updated && !m->ue.updated && !m->ce_noinfo.updated && !m->ue_noinfo.updated)) + continue; - static RRDSET *ce_st = NULL; - - if(unlikely(!ce_st)) { - ce_st = rrdset_create_localhost( + if(unlikely(!m->st)) { + char id[RRD_ID_LENGTH_MAX + 1]; + snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s", m->name); + m->st = rrdset_create_localhost( "mem" - , "ecc_ce" - , NULL - , "ecc" + , id , NULL - , "ECC Memory Correctable Errors" - , "errors" + , "edac" + , "mem.edac_mc" + , "Memory Controller (MC) Error Detection And Correction (EDAC) Errors" + , "errors/s" , PLUGIN_PROC_NAME , "/sys/devices/system/edac/mc" , NETDATA_CHART_PRIO_MEM_HW_ECC_CE , update_every , RRDSET_TYPE_LINE ); - } - for(m = mc_root; m; m = m->next) { - if (m->ce_count_filename && m->ce_updated) { - if(unlikely(!m->ce_rd)) - m->ce_rd = rrddim_add(ce_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rrdlabels_add(m->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO); - rrddim_set_by_pointer(ce_st, m->ce_rd, m->ce_count); - } + char buffer[1024 + 1]; + + if(read_edac_mc_file(m->name, "mc_name", buffer, 1024)) + rrdlabels_add(m->st->rrdlabels, "mc_name", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_file(m->name, "size_mb", buffer, 1024)) + rrdlabels_add(m->st->rrdlabels, "size_mb", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_file(m->name, "max_location", buffer, 1024)) + rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO); + + m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + m->ue.rd 
= rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } - rrdset_done(ce_st); - } + rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count); + rrddim_set_by_pointer(m->st, m->ue.rd, (collected_number)m->ue.count); + rrddim_set_by_pointer(m->st, m->ce_noinfo.rd, (collected_number)m->ce_noinfo.count); + rrddim_set_by_pointer(m->st, m->ue_noinfo.rd, (collected_number)m->ue_noinfo.count); - // -------------------------------------------------------------------- + rrdset_done(m->st); - if(do_ue == CONFIG_BOOLEAN_YES || (do_ue == CONFIG_BOOLEAN_AUTO && - (ue_sum > 0 || netdata_zero_metrics_enabled == CONFIG_BOOLEAN_YES))) { - do_ue = CONFIG_BOOLEAN_YES; + for(struct edac_dimm *d = m->dimms; d ;d = d->next) { + if(unlikely(!d->ce.updated && !d->ue.updated)) + continue; - static RRDSET *ue_st = NULL; + if(unlikely(!d->st)) { + char id[RRD_ID_LENGTH_MAX + 1]; + snprintfz(id, RRD_ID_LENGTH_MAX, "edac_%s_%s", m->name, d->name); + d->st = rrdset_create_localhost( + "mem" + , id + , NULL + , "edac" + , "mem.edac_mc_dimm" + , "DIMM Error Detection And Correction (EDAC) Errors" + , "errors/s" + , PLUGIN_PROC_NAME + , "/sys/devices/system/edac/mc" + , NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1 + , update_every + , RRDSET_TYPE_LINE + ); - if(unlikely(!ue_st)) { - ue_st = rrdset_create_localhost( - "mem" - , "ecc_ue" - , NULL - , "ecc" - , NULL - , "ECC Memory Uncorrectable Errors" - , "errors" - , PLUGIN_PROC_NAME - , "/sys/devices/system/edac/mc" - , NETDATA_CHART_PRIO_MEM_HW_ECC_UE - , update_every - , RRDSET_TYPE_LINE - ); - } + rrdlabels_add(d->st->rrdlabels, "controller", m->name, RRDLABEL_SRC_AUTO); + rrdlabels_add(d->st->rrdlabels, "dimm", d->name, RRDLABEL_SRC_AUTO); + + char buffer[1024 + 1]; - for(m = mc_root; m; m = m->next) { - if 
(m->ue_count_filename && m->ue_updated) { - if(unlikely(!m->ue_rd)) - m->ue_rd = rrddim_add(ue_st, m->name, NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + if(read_edac_mc_rank_file(m->name, d->name, "dimm_dev_type", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "dimm_dev_type", buffer, RRDLABEL_SRC_AUTO); - rrddim_set_by_pointer(ue_st, m->ue_rd, m->ue_count); + if(read_edac_mc_rank_file(m->name, d->name, "dimm_edac_mode", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "dimm_edac_mode", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_rank_file(m->name, d->name, "dimm_label", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "dimm_label", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_rank_file(m->name, d->name, "dimm_location", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "dimm_location", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_rank_file(m->name, d->name, "dimm_mem_type", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "dimm_mem_type", buffer, RRDLABEL_SRC_AUTO); + + if(read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024)) + rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO); + + d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); } - } - rrdset_done(ue_st); + rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count); + rrddim_set_by_pointer(d->st, d->ue.rd, (collected_number)d->ue.count); + + rrdset_done(d->st); + } } return 0; |