summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Ilya Mashchenko <ilya@netdata.cloud> 2024-02-08 19:06:31 +0200
committer GitHub <noreply@github.com> 2024-02-08 19:06:31 +0200
commit 8cc28e8be86dd351964a68fbaca88ca7c3abe157 (patch)
tree d1dd9f9d1830330a677e6ff54e1f270d15de4aae
parent 2f15252b653f2b04502dbbc5e65c6bce33892c6a (diff)
change edac ecc errors from incremental to absolute (#16970)
-rw-r--r-- src/collectors/proc.plugin/metadata.yaml 24
-rw-r--r-- src/collectors/proc.plugin/sys_devices_system_edac_mc.c 20
-rw-r--r-- src/health/health.d/memory.conf 28
3 files changed, 34 insertions, 38 deletions
diff --git a/src/collectors/proc.plugin/metadata.yaml b/src/collectors/proc.plugin/metadata.yaml
index 74b656809f..1ecec9b9ef 100644
--- a/src/collectors/proc.plugin/metadata.yaml
+++ b/src/collectors/proc.plugin/metadata.yaml
@@ -1658,20 +1658,20 @@ modules:
list: []
alerts:
- name: ecc_memory_mc_noinfo_correctable
- metric: mem.edac_mc
- info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes
+ metric: mem.edac_mc_errors
+ info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot)
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
- name: ecc_memory_mc_noinfo_uncorrectable
- metric: mem.edac_mc
- info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes
+ metric: mem.edac_mc_errors
+ info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot)
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
- name: ecc_memory_dimm_correctable
- metric: mem.edac_mc_dimm
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+ metric: mem.edac_mc_dimm_errors
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
- name: ecc_memory_dimm_uncorrectable
- metric: mem.edac_mc_dimm
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
+ metric: mem.edac_mc_dimm_errors
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
metrics:
folding:
@@ -1692,9 +1692,9 @@ modules:
- name: max_location
description: Last available memory slot in this memory controller.
metrics:
- - name: mem.edac_mc
+ - name: mem.edac_mc_errors
description: Memory Controller (MC) Error Detection And Correction (EDAC) Errors
- unit: errors/s
+ unit: errors
chart_type: line
dimensions:
- name: correctable
@@ -1721,9 +1721,9 @@ modules:
- name: size
description: The amount of memory in megabytes that this memory module manages.
metrics:
- - name: mem.edac_mc
+ - name: mem.edac_mc_errors
description: DIMM Error Detection And Correction (EDAC) Errors
- unit: errors/s
+ unit: errors
chart_type: line
dimensions:
- name: correctable
diff --git a/src/collectors/proc.plugin/sys_devices_system_edac_mc.c b/src/collectors/proc.plugin/sys_devices_system_edac_mc.c
index 9904b9cbcf..d3db8c0442 100644
--- a/src/collectors/proc.plugin/sys_devices_system_edac_mc.c
+++ b/src/collectors/proc.plugin/sys_devices_system_edac_mc.c
@@ -202,9 +202,9 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse
, id
, NULL
, "edac"
- , "mem.edac_mc"
+ , "mem.edac_mc_errors"
, "Memory Controller (MC) Error Detection And Correction (EDAC) Errors"
- , "errors/s"
+ , "errors"
, PLUGIN_PROC_NAME
, "/sys/devices/system/edac/mc"
, NETDATA_CHART_PRIO_MEM_HW_ECC_CE
@@ -225,10 +225,10 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse
if(read_edac_mc_file(m->name, "max_location", buffer, 1024))
rrdlabels_add(m->st->rrdlabels, "max_location", buffer, RRDLABEL_SRC_AUTO);
- m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ m->ce.rd = rrddim_add(m->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ m->ue.rd = rrddim_add(m->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ m->ce_noinfo.rd = rrddim_add(m->st, "correctable_noinfo", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ m->ue_noinfo.rd = rrddim_add(m->st, "uncorrectable_noinfo", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(m->st, m->ce.rd, (collected_number)m->ce.count);
@@ -250,9 +250,9 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse
, id
, NULL
, "edac"
- , "mem.edac_mc_dimm"
+ , "mem.edac_mc_dimm_errors"
, "DIMM Error Detection And Correction (EDAC) Errors"
- , "errors/s"
+ , "errors"
, PLUGIN_PROC_NAME
, "/sys/devices/system/edac/mc"
, NETDATA_CHART_PRIO_MEM_HW_ECC_CE + 1
@@ -283,8 +283,8 @@ int do_proc_sys_devices_system_edac_mc(int update_every, usec_t dt __maybe_unuse
if (read_edac_mc_rank_file(m->name, d->name, "size", buffer, 1024))
rrdlabels_add(d->st->rrdlabels, "size", buffer, RRDLABEL_SRC_AUTO);
- d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
- d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
+ d->ce.rd = rrddim_add(d->st, "correctable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ d->ue.rd = rrddim_add(d->st, "uncorrectable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(d->st, d->ce.rd, (collected_number)d->ce.count);
diff --git a/src/health/health.d/memory.conf b/src/health/health.d/memory.conf
index 5ab3d2d922..4a5f476e00 100644
--- a/src/health/health.d/memory.conf
+++ b/src/health/health.d/memory.conf
@@ -19,67 +19,63 @@ component: Memory
## ECC Controller
template: ecc_memory_mc_correctable
- on: mem.edac_mc
+ on: mem.edac_mc_errors
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned of correctable, correctable_noinfo
+ calc: $correctable + $correctable_noinfo
units: errors
every: 1m
warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
summary: System ECC memory ${label:controller} correctable errors
- info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes
+ info: Memory controller ${label:controller} ECC correctable errors
to: sysadmin
template: ecc_memory_mc_uncorrectable
- on: mem.edac_mc
+ on: mem.edac_mc_errors
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
+ calc: $uncorrectable + $uncorrectable_noinfo
units: errors
every: 1m
crit: $this > 0
- delay: down 1h multiplier 1.5 max 1h
summary: System ECC memory ${label:controller} uncorrectable errors
- info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
+ info: Memory controller ${label:controller} ECC uncorrectable errors
to: sysadmin
## ECC DIMM
template: ecc_memory_dimm_correctable
- on: mem.edac_mc_dimm
+ on: mem.edac_mc_dimm_errors
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned of correctable
+ calc: $correctable
units: errors
every: 1m
warn: $this > 0
- delay: down 1h multiplier 1.5 max 1h
summary: System ECC memory DIMM ${label:dimm} correctable errors
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors
to: sysadmin
template: ecc_memory_dimm_uncorrectable
- on: mem.edac_mc_dimm
+ on: mem.edac_mc_dimm_errors
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned of uncorrectable
+ calc: $uncorrectable
units: errors
every: 1m
crit: $this > 0
- delay: down 1h multiplier 1.5 max 1h
summary: System ECC memory DIMM ${label:dimm} uncorrectable errors
- info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors
to: sysadmin