From 8cc28e8be86dd351964a68fbaca88ca7c3abe157 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Thu, 8 Feb 2024 19:06:31 +0200 Subject: change edac ecc errors from incremental to absolute (#16970) --- src/collectors/proc.plugin/metadata.yaml | 24 +++++++++---------- .../proc.plugin/sys_devices_system_edac_mc.c | 20 ++++++++-------- src/health/health.d/memory.conf | 28 ++++++++++------------ 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/src/collectors/proc.plugin/metadata.yaml b/src/collectors/proc.plugin/metadata.yaml index 74b656809f..1ecec9b9ef 100644 --- a/src/collectors/proc.plugin/metadata.yaml +++ b/src/collectors/proc.plugin/metadata.yaml @@ -1658,20 +1658,20 @@ modules: list: [] alerts: - name: ecc_memory_mc_noinfo_correctable - metric: mem.edac_mc - info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes + metric: mem.edac_mc_errors + info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf - name: ecc_memory_mc_noinfo_uncorrectable - metric: mem.edac_mc - info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes + metric: mem.edac_mc_errors + info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf - name: ecc_memory_dimm_correctable - metric: mem.edac_mc_dimm - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes + metric: mem.edac_mc_dimm_errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf - name: ecc_memory_dimm_uncorrectable - metric: mem.edac_mc_dimm - info: DIMM 
${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes + metric: mem.edac_mc_dimm_errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf metrics: folding: @@ -1692,9 +1692,9 @@ modules: - name: max_location description: Last available memory slot in this memory controller. metrics: - - name: mem.edac_mc + - name: mem.edac_mc_errors description: Memory Controller (MC) Error Detection And Correction (EDAC) Errors - unit: errors/s + unit: errors chart_type: line dimensions: - name: correctable @@ -1721,9 +1721,9 @@ modules: - name: size description: The amount of memory in megabytes that this memory module manages. metrics: - - name: mem.edac_mc + - name: mem.edac_mc_dimm_errors description: DIMM Error Detection And Correction (EDAC) Errors - unit: errors/s + unit: errors chart_type: line dimensions: - name: correctable diff --git a/src/collectors/proc.plugin/sys_devices_system_edac_mc.c b/src/collectors/proc.plugin/sys_devices_system_edac_mc.c index 9904b9cbcf..d3db8c0442 100644 --- a/src/collectors/proc.plugin/sys_devices_system_edac_mc.c +++ b/src/collectors/proc.plugin/sys_devices_system_edac_mc.c @@ -202,9 +202,9 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse , id , NULL , "edac" - , "mem.edac_mc" + , "mem.edac_mc_errors" , "Memory Controller (MC) Error Detection And Correction (EDAC) Errors" - , "errors/s" + , "errors" , PLUGIN_PROC_NAME , "/sys/devices/system/edac/mc" , NETDATA_CHART_PRIO_MEM_HW_ECC_CE @@ -225,10 +225,10 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse if(read_edac_mc_file(m->name, "max_location", buffer, 1024)) rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO); - m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, 
RRD_ALGORITHM_INCREMENTAL); - m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count); @@ -250,9 +250,9 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse , id , NULL , "edac" - , "mem.edac_mc_dimm" + , "mem.edac_mc_dimm_errors" , "DIMM Error Detection And Correction (EDAC) Errors" - , "errors/s" + , "errors" , PLUGIN_PROC_NAME , "/sys/devices/system/edac/mc" , NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1 @@ -283,8 +283,8 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse if (read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024)) rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO); - d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count); diff --git a/src/health/health.d/memory.conf b/src/health/health.d/memory.conf index 5ab3d2d922..4a5f476e00 100644 --- a/src/health/health.d/memory.conf +++ b/src/health/health.d/memory.conf @@ -19,67 +19,63 @@ component: Memory ## ECC 
Controller template: ecc_memory_mc_correctable - on: mem.edac_mc + on: mem.edac_mc_errors class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned of correctable, correctable_noinfo + calc: $correctable + $correctable_noinfo units: errors every: 1m warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h summary: System ECC memory ${label:controller} correctable errors - info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes + info: Memory controller ${label:controller} ECC correctable errors to: sysadmin template: ecc_memory_mc_uncorrectable - on: mem.edac_mc + on: mem.edac_mc_errors class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo + calc: $uncorrectable + $uncorrectable_noinfo units: errors every: 1m crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h summary: System ECC memory ${label:controller} uncorrectable errors - info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes + info: Memory controller ${label:controller} ECC uncorrectable errors to: sysadmin ## ECC DIMM template: ecc_memory_dimm_correctable - on: mem.edac_mc_dimm + on: mem.edac_mc_dimm_errors class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned of correctable + calc: $correctable units: errors every: 1m warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h summary: System ECC memory DIMM ${label:dimm} correctable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors to: sysadmin template: ecc_memory_dimm_uncorrectable - on: mem.edac_mc_dimm + on: mem.edac_mc_dimm_errors class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned of 
uncorrectable + calc: $uncorrectable units: errors every: 1m crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h summary: System ECC memory DIMM ${label:dimm} uncorrectable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors to: sysadmin -- cgit v1.2.3