summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud>, 2024-04-20 19:20:12 +0300
committer: GitHub <noreply@github.com>, 2024-04-20 19:20:12 +0300
commit: 623244f8a9dc134c37599d1e07e7792c9f193583 (patch)
tree: fd4a7d5e1efc82bba8c374322b4bdab6d289f8ff
parent: 3d65360e6990bbf67b85fed0ab1ebc1620a6cdf3 (diff)
go.d storcli update (#17460)
-rw-r--r-- src/go/collectors/go.d.plugin/modules/storcli/charts.go | 45
-rw-r--r-- src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go | 24
-rw-r--r-- src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go | 12
-rw-r--r-- src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml | 36
-rw-r--r-- src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go | 15
-rw-r--r-- src/health/health.d/megacli.conf | 2
-rw-r--r-- src/health/health.d/storcli.conf | 61
7 files changed, 174 insertions, 21 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/charts.go b/src/go/collectors/go.d.plugin/modules/storcli/charts.go
index 65cd75a331..f12b2d1a78 100644
--- a/src/go/collectors/go.d.plugin/modules/storcli/charts.go
+++ b/src/go/collectors/go.d.plugin/modules/storcli/charts.go
@@ -18,6 +18,8 @@ const (
prioPhysDrivePredictiveFailures
prioPhysDriveSmartAlertStatus
prioPhysDriveTemperature
+
+ prioBBUTemperature
)
var controllerChartsTmpl = module.Charts{
@@ -106,7 +108,7 @@ var (
physDriveTemperatureChartTmpl = module.Chart{
ID: "phys_drive_%s_cntrl_%s_temperature",
Title: "Physical Drive temperature",
- Units: "status",
+ Units: "Celsius",
Fam: "pd temperature",
Ctx: "storcli.phys_drive_temperature",
Type: module.Line,
@@ -117,6 +119,25 @@ var (
}
)
+var bbuChartsTmpl = module.Charts{
+ bbuTemperatureChartTmpl.Copy(),
+}
+
+var (
+ bbuTemperatureChartTmpl = module.Chart{
+ ID: "bbu_%s_cntrl_%s_temperature",
+ Title: "BBU temperature",
+ Units: "Celsius",
+ Fam: "bbu temperature",
+ Ctx: "storcli.bbu_temperature",
+ Type: module.Line,
+ Priority: prioBBUTemperature,
+ Dims: module.Dims{
+ {ID: "bbu_%s_cntrl_%s_temperature", Name: "temperature"},
+ },
+ }
+)
+
func (s *StorCli) addControllerCharts(cntrl controllerInfo) {
charts := controllerChartsTmpl.Copy()
@@ -141,7 +162,7 @@ func (s *StorCli) addControllerCharts(cntrl controllerInfo) {
func (s *StorCli) addPhysDriveCharts(cntrlNum int, di *driveInfo, ds *driveState, da *driveAttrs) {
charts := physDriveChartsTmpl.Copy()
- if _, ok := parseInt(getDriveTemperature(ds.DriveTemperature)); !ok {
+ if _, ok := parseInt(getTemperature(ds.DriveTemperature)); !ok {
_ = charts.Remove(physDriveTemperatureChartTmpl.ID)
}
@@ -169,3 +190,23 @@ func (s *StorCli) addPhysDriveCharts(cntrlNum int, di *driveInfo, ds *driveState
s.Warning(err)
}
}
+
+func (s *StorCli) addBBUCharts(cntrlNum, bbuNum, model string) {
+ charts := bbuChartsTmpl.Copy()
+
+ for _, chart := range *charts {
+ chart.ID = fmt.Sprintf(chart.ID, bbuNum, cntrlNum)
+ chart.Labels = []module.Label{
+ {Key: "controller_number", Value: cntrlNum},
+ {Key: "bbu_number", Value: bbuNum},
+ {Key: "model", Value: model},
+ }
+ for _, dim := range chart.Dims {
+ dim.ID = fmt.Sprintf(dim.ID, bbuNum, cntrlNum)
+ }
+ }
+
+ if err := s.Charts().Add(*charts...); err != nil {
+ s.Warning(err)
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go
index 259013e6c1..d1302aea01 100644
--- a/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go
+++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go
@@ -47,13 +47,14 @@ func (s *StorCli) collectControllersInfo(mx map[string]int64, resp *controllersI
for _, v := range resp.Controllers {
cntrl := v.ResponseData
- idx := strconv.Itoa(cntrl.Basics.Controller)
- if !s.controllers[idx] {
- s.controllers[idx] = true
+ cntrlNum := strconv.Itoa(cntrl.Basics.Controller)
+
+ if !s.controllers[cntrlNum] {
+ s.controllers[cntrlNum] = true
s.addControllerCharts(cntrl)
}
- px := fmt.Sprintf("cntrl_%s_", idx)
+ px := fmt.Sprintf("cntrl_%s_", cntrlNum)
for _, st := range []string{"optimal", "degraded", "partially_degraded", "failed"} {
mx[px+"status_"+st] = 0
@@ -72,7 +73,22 @@ func (s *StorCli) collectControllersInfo(mx map[string]int64, resp *controllersI
default:
mx[px+"bbu_status_unhealthy"] = 1
}
+
+ for i, bbu := range cntrl.BBUInfo {
+ bbuNum := strconv.Itoa(i)
+ if k := cntrlNum + bbuNum; !s.bbu[k] {
+ s.bbu[k] = true
+ s.addBBUCharts(cntrlNum, bbuNum, bbu.Model)
+ }
+
+ px := fmt.Sprintf("bbu_%s_cntrl_%s_", bbuNum, cntrlNum)
+
+ if v, ok := parseInt(getTemperature(bbu.Temp)); ok {
+ mx[px+"temperature"] = v
+ }
+ }
}
+
return nil
}
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go
index 353728d6dc..c84ca4b1e2 100644
--- a/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go
+++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go
@@ -112,7 +112,7 @@ func (s *StorCli) collectMegaRaidDrives(mx map[string]int64, resp *drivesInfoRes
if v, ok := parseInt(string(state.PredictiveFailureCount)); ok {
mx[px+"predictive_failure_count"] = v
}
- if v, ok := parseInt(getDriveTemperature(state.DriveTemperature)); ok {
+ if v, ok := parseInt(getTemperature(state.DriveTemperature)); ok {
mx[px+"temperature"] = v
}
for _, st := range []string{"active", "inactive"} {
@@ -120,6 +120,8 @@ func (s *StorCli) collectMegaRaidDrives(mx map[string]int64, resp *drivesInfoRes
}
if state.SmartAlertFlagged == "Yes" {
mx[px+"smart_alert_status_active"] = 1
+ } else {
+ mx[px+"smart_alert_status_inactive"] = 1
}
}
}
@@ -216,13 +218,13 @@ func getDriveAttrs(driveDetailedInfo map[string]json.RawMessage, id string) (*dr
return &state, nil
}
-func getDriveTemperature(s string) string {
- // ' 28C (82.40 F)'
- i := strings.IndexByte(s, 'C')
+func getTemperature(temp string) string {
+ // ' 28C (82.40 F)' (drive) or '33C' (bbu)
+ i := strings.IndexByte(temp, 'C')
if i == -1 {
return ""
}
- return strings.TrimSpace(s[:i])
+ return strings.TrimSpace(temp[:i])
}
func parseInt(s string) (int64, bool) {
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml
index ecf97fb442..ab7866bcf9 100644
--- a/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml
+++ b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml
@@ -80,7 +80,23 @@ modules:
troubleshooting:
problems:
list: []
- alerts: []
+ alerts:
+ - name: storcli_controller_status
+ metric: storcli.controller_status
+ info: RAID controller ${label:controller_number} health status is not optimal
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf
+ - name: storcli_controller_bbu_status
+ metric: storcli.controller_bbu_status
+ info: RAID controller ${label:controller_number} BBU is unhealthy
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf
+ - name: storcli_phys_drive_errors
+ metric: storcli.phys_drive_errors
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf
+ - name: storcli_phys_drive_predictive_failures
+ metric: storcli.phys_drive_predictive_failures
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf
metrics:
folding:
title: Metrics
@@ -147,7 +163,23 @@ modules:
- name: inactive
- name: storcli.phys_drive_temperature
description: Physical Drive temperature
- unit: status
+ unit: Celsius
+ chart_type: line
+ dimensions:
+ - name: temperature
+ - name: bbu
+ description: These metrics refer to the Backup Battery Unit.
+ labels:
+ - name: controller_number
+ description: Controller number (index)
+ - name: bbu_number
+ description: BBU number (index)
+ - name: model
+ description: BBU model
+ metrics:
+ - name: storcli.bbu_temperature
+ description: BBU temperature
+ unit: Celsius
chart_type: line
dimensions:
- name: temperature
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go b/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go
index 84ea3bb4c3..74c92b8704 100644
--- a/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go
+++ b/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go
@@ -147,8 +147,9 @@ func TestStorCli_Collect(t *testing.T) {
}{
"success MegaRAID controller": {
prepareMock: prepareMockMegaRaidOK,
- wantCharts: len(controllerChartsTmpl)*1 + len(physDriveChartsTmpl)*6,
+ wantCharts: len(controllerChartsTmpl)*1 + len(physDriveChartsTmpl)*6 + len(bbuChartsTmpl)*1,
wantMetrics: map[string]int64{
+ "bbu_0_cntrl_0_temperature": 34,
"cntrl_0_bbu_status_healthy": 1,
"cntrl_0_bbu_status_na": 0,
"cntrl_0_bbu_status_unhealthy": 0,
@@ -160,37 +161,37 @@ func TestStorCli_Collect(t *testing.T) {
"phys_drive_5000C500C36C8BCD_cntrl_0_other_error_count": 0,
"phys_drive_5000C500C36C8BCD_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500C36C8BCD_cntrl_0_temperature": 28,
"phys_drive_5000C500D59840FE_cntrl_0_media_error_count": 0,
"phys_drive_5000C500D59840FE_cntrl_0_other_error_count": 0,
"phys_drive_5000C500D59840FE_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500D59840FE_cntrl_0_temperature": 28,
"phys_drive_5000C500D6061539_cntrl_0_media_error_count": 0,
"phys_drive_5000C500D6061539_cntrl_0_other_error_count": 0,
"phys_drive_5000C500D6061539_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500D6061539_cntrl_0_temperature": 28,
"phys_drive_5000C500DC79B194_cntrl_0_media_error_count": 0,
"phys_drive_5000C500DC79B194_cntrl_0_other_error_count": 0,
"phys_drive_5000C500DC79B194_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500DC79B194_cntrl_0_temperature": 28,
"phys_drive_5000C500E54F4EBB_cntrl_0_media_error_count": 0,
"phys_drive_5000C500E54F4EBB_cntrl_0_other_error_count": 0,
"phys_drive_5000C500E54F4EBB_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500E54F4EBB_cntrl_0_temperature": 28,
"phys_drive_5000C500E5659BA7_cntrl_0_media_error_count": 0,
"phys_drive_5000C500E5659BA7_cntrl_0_other_error_count": 0,
"phys_drive_5000C500E5659BA7_cntrl_0_predictive_failure_count": 0,
"phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_active": 0,
- "phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_inactive": 0,
+ "phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_inactive": 1,
"phys_drive_5000C500E5659BA7_cntrl_0_temperature": 27,
},
},
diff --git a/src/health/health.d/megacli.conf b/src/health/health.d/megacli.conf
index 8d71d585bf..27721fa9ad 100644
--- a/src/health/health.d/megacli.conf
+++ b/src/health/health.d/megacli.conf
@@ -38,7 +38,7 @@ component: RAID
type: System
component: RAID
lookup: sum -10s
- units: media errors
+ units: failures
every: 10s
warn: $this > 0
delay: up 1m down 5m multiplier 2 max 10m
diff --git a/src/health/health.d/storcli.conf b/src/health/health.d/storcli.conf
new file mode 100644
index 0000000000..0beda76862
--- /dev/null
+++ b/src/health/health.d/storcli.conf
@@ -0,0 +1,61 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Controllers
+
+ template: storcli_controller_status
+ on: storcli.controller_status
+ class: Errors
+ type: System
+component: RAID
+ lookup: average -1m unaligned percentage of optimal
+ units: %
+ every: 10s
+ crit: $this < 100
+ delay: down 5m multiplier 2 max 10m
+ summary: RAID controller ${label:controller_number} health
+ info: RAID controller ${label:controller_number} health status is not optimal
+ to: sysadmin
+
+ template: storcli_controller_bbu_status
+ on: storcli.controller_bbu_status
+ class: Errors
+ type: System
+component: RAID
+ lookup: average -1m unaligned percentage of healthy,na
+ units: %
+ every: 10s
+ crit: $this < 100
+ delay: down 5m multiplier 2 max 10m
+ summary: RAID controller ${label:controller_number} BBU health
+ info: RAID controller ${label:controller_number} BBU is unhealthy
+ to: sysadmin
+
+# Physical Drives
+
+ template: storcli_phys_drive_errors
+ on: storcli.phys_drive_errors
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s
+ units: errors
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors
+ to: sysadmin
+
+ template: storcli_phys_drive_predictive_failures
+ on: storcli.phys_drive_predictive_failures
+ class: Errors
+ type: System
+component: RAID
+ lookup: sum -10s
+ units: failures
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures
+ info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures
+ to: sysadmin