diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2024-04-20 19:20:12 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-20 19:20:12 +0300 |
commit | 623244f8a9dc134c37599d1e07e7792c9f193583 (patch) | |
tree | fd4a7d5e1efc82bba8c374322b4bdab6d289f8ff | |
parent | 3d65360e6990bbf67b85fed0ab1ebc1620a6cdf3 (diff) | |
go.d storcli update (#17460)
7 files changed, 174 insertions, 21 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/storcli/charts.go b/src/go/collectors/go.d.plugin/modules/storcli/charts.go index 65cd75a331..f12b2d1a78 100644 --- a/src/go/collectors/go.d.plugin/modules/storcli/charts.go +++ b/src/go/collectors/go.d.plugin/modules/storcli/charts.go @@ -18,6 +18,8 @@ const ( prioPhysDrivePredictiveFailures prioPhysDriveSmartAlertStatus prioPhysDriveTemperature + + prioBBUTemperature ) var controllerChartsTmpl = module.Charts{ @@ -106,7 +108,7 @@ var ( physDriveTemperatureChartTmpl = module.Chart{ ID: "phys_drive_%s_cntrl_%s_temperature", Title: "Physical Drive temperature", - Units: "status", + Units: "Celsius", Fam: "pd temperature", Ctx: "storcli.phys_drive_temperature", Type: module.Line, @@ -117,6 +119,25 @@ var ( } ) +var bbuChartsTmpl = module.Charts{ + bbuTemperatureChartTmpl.Copy(), +} + +var ( + bbuTemperatureChartTmpl = module.Chart{ + ID: "bbu_%s_cntrl_%s_temperature", + Title: "BBU temperature", + Units: "Celsius", + Fam: "bbu temperature", + Ctx: "storcli.bbu_temperature", + Type: module.Line, + Priority: prioBBUTemperature, + Dims: module.Dims{ + {ID: "bbu_%s_cntrl_%s_temperature", Name: "temperature"}, + }, + } +) + func (s *StorCli) addControllerCharts(cntrl controllerInfo) { charts := controllerChartsTmpl.Copy() @@ -141,7 +162,7 @@ func (s *StorCli) addControllerCharts(cntrl controllerInfo) { func (s *StorCli) addPhysDriveCharts(cntrlNum int, di *driveInfo, ds *driveState, da *driveAttrs) { charts := physDriveChartsTmpl.Copy() - if _, ok := parseInt(getDriveTemperature(ds.DriveTemperature)); !ok { + if _, ok := parseInt(getTemperature(ds.DriveTemperature)); !ok { _ = charts.Remove(physDriveTemperatureChartTmpl.ID) } @@ -169,3 +190,23 @@ func (s *StorCli) addPhysDriveCharts(cntrlNum int, di *driveInfo, ds *driveState s.Warning(err) } } + +func (s *StorCli) addBBUCharts(cntrlNum, bbuNum, model string) { + charts := bbuChartsTmpl.Copy() + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, bbuNum, 
cntrlNum) + chart.Labels = []module.Label{ + {Key: "controller_number", Value: cntrlNum}, + {Key: "bbu_number", Value: bbuNum}, + {Key: "model", Value: model}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, bbuNum, cntrlNum) + } + } + + if err := s.Charts().Add(*charts...); err != nil { + s.Warning(err) + } +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go index 259013e6c1..d1302aea01 100644 --- a/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go +++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go @@ -47,13 +47,14 @@ func (s *StorCli) collectControllersInfo(mx map[string]int64, resp *controllersI for _, v := range resp.Controllers { cntrl := v.ResponseData - idx := strconv.Itoa(cntrl.Basics.Controller) - if !s.controllers[idx] { - s.controllers[idx] = true + cntrlNum := strconv.Itoa(cntrl.Basics.Controller) + + if !s.controllers[cntrlNum] { + s.controllers[cntrlNum] = true s.addControllerCharts(cntrl) } - px := fmt.Sprintf("cntrl_%s_", idx) + px := fmt.Sprintf("cntrl_%s_", cntrlNum) for _, st := range []string{"optimal", "degraded", "partially_degraded", "failed"} { mx[px+"status_"+st] = 0 @@ -72,7 +73,22 @@ func (s *StorCli) collectControllersInfo(mx map[string]int64, resp *controllersI default: mx[px+"bbu_status_unhealthy"] = 1 } + + for i, bbu := range cntrl.BBUInfo { + bbuNum := strconv.Itoa(i) + if k := cntrlNum + bbuNum; !s.bbu[k] { + s.bbu[k] = true + s.addBBUCharts(cntrlNum, bbuNum, bbu.Model) + } + + px := fmt.Sprintf("bbu_%s_cntrl_%s_", bbuNum, cntrlNum) + + if v, ok := parseInt(getTemperature(bbu.Temp)); ok { + mx[px+"temperature"] = v + } + } } + return nil } diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go index 353728d6dc..c84ca4b1e2 100644 --- 
a/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go +++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go @@ -112,7 +112,7 @@ func (s *StorCli) collectMegaRaidDrives(mx map[string]int64, resp *drivesInfoRes if v, ok := parseInt(string(state.PredictiveFailureCount)); ok { mx[px+"predictive_failure_count"] = v } - if v, ok := parseInt(getDriveTemperature(state.DriveTemperature)); ok { + if v, ok := parseInt(getTemperature(state.DriveTemperature)); ok { mx[px+"temperature"] = v } for _, st := range []string{"active", "inactive"} { @@ -120,6 +120,8 @@ func (s *StorCli) collectMegaRaidDrives(mx map[string]int64, resp *drivesInfoRes } if state.SmartAlertFlagged == "Yes" { mx[px+"smart_alert_status_active"] = 1 + } else { + mx[px+"smart_alert_status_inactive"] = 1 } } } @@ -216,13 +218,13 @@ func getDriveAttrs(driveDetailedInfo map[string]json.RawMessage, id string) (*dr return &state, nil } -func getDriveTemperature(s string) string { - // ' 28C (82.40 F)' - i := strings.IndexByte(s, 'C') +func getTemperature(temp string) string { + // ' 28C (82.40 F)' (drive) or '33C' (bbu) + i := strings.IndexByte(temp, 'C') if i == -1 { return "" } - return strings.TrimSpace(s[:i]) + return strings.TrimSpace(temp[:i]) } func parseInt(s string) (int64, bool) { diff --git a/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml index ecf97fb442..ab7866bcf9 100644 --- a/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml +++ b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml @@ -80,7 +80,23 @@ modules: troubleshooting: problems: list: [] - alerts: [] + alerts: + - name: storcli_controller_status + metric: storcli.controller_status + info: RAID controller ${label:controller_number} health status is not optimal + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf + - name: storcli_controller_bbu_status + metric: storcli.controller_bbu_status 
+ info: RAID controller ${label:controller_number} BBU is unhealthy + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf + - name: storcli_phys_drive_errors + metric: storcli.phys_drive_errors + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf + - name: storcli_phys_drive_predictive_failures + metric: storcli.phys_drive_predictive_failures + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/storcli.conf metrics: folding: title: Metrics @@ -147,7 +163,23 @@ modules: - name: inactive - name: storcli.phys_drive_temperature description: Physical Drive temperature - unit: status + unit: Celsius + chart_type: line + dimensions: + - name: temperature + - name: bbu + description: These metrics refer to the Backup Battery Unit. 
+ labels: + - name: controller_number + description: Controller number (index) + - name: bbu_number + description: BBU number (index) + - name: model + description: BBU model + metrics: + - name: storcli.bbu_temperature + description: BBU temperature + unit: Celsius chart_type: line dimensions: - name: temperature diff --git a/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go b/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go index 84ea3bb4c3..74c92b8704 100644 --- a/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go +++ b/src/go/collectors/go.d.plugin/modules/storcli/storcli_test.go @@ -147,8 +147,9 @@ func TestStorCli_Collect(t *testing.T) { }{ "success MegaRAID controller": { prepareMock: prepareMockMegaRaidOK, - wantCharts: len(controllerChartsTmpl)*1 + len(physDriveChartsTmpl)*6, + wantCharts: len(controllerChartsTmpl)*1 + len(physDriveChartsTmpl)*6 + len(bbuChartsTmpl)*1, wantMetrics: map[string]int64{ + "bbu_0_cntrl_0_temperature": 34, "cntrl_0_bbu_status_healthy": 1, "cntrl_0_bbu_status_na": 0, "cntrl_0_bbu_status_unhealthy": 0, @@ -160,37 +161,37 @@ func TestStorCli_Collect(t *testing.T) { "phys_drive_5000C500C36C8BCD_cntrl_0_other_error_count": 0, "phys_drive_5000C500C36C8BCD_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500C36C8BCD_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500C36C8BCD_cntrl_0_temperature": 28, "phys_drive_5000C500D59840FE_cntrl_0_media_error_count": 0, "phys_drive_5000C500D59840FE_cntrl_0_other_error_count": 0, "phys_drive_5000C500D59840FE_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500D59840FE_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500D59840FE_cntrl_0_temperature": 28, 
"phys_drive_5000C500D6061539_cntrl_0_media_error_count": 0, "phys_drive_5000C500D6061539_cntrl_0_other_error_count": 0, "phys_drive_5000C500D6061539_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500D6061539_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500D6061539_cntrl_0_temperature": 28, "phys_drive_5000C500DC79B194_cntrl_0_media_error_count": 0, "phys_drive_5000C500DC79B194_cntrl_0_other_error_count": 0, "phys_drive_5000C500DC79B194_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500DC79B194_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500DC79B194_cntrl_0_temperature": 28, "phys_drive_5000C500E54F4EBB_cntrl_0_media_error_count": 0, "phys_drive_5000C500E54F4EBB_cntrl_0_other_error_count": 0, "phys_drive_5000C500E54F4EBB_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500E54F4EBB_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500E54F4EBB_cntrl_0_temperature": 28, "phys_drive_5000C500E5659BA7_cntrl_0_media_error_count": 0, "phys_drive_5000C500E5659BA7_cntrl_0_other_error_count": 0, "phys_drive_5000C500E5659BA7_cntrl_0_predictive_failure_count": 0, "phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_active": 0, - "phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_inactive": 0, + "phys_drive_5000C500E5659BA7_cntrl_0_smart_alert_status_inactive": 1, "phys_drive_5000C500E5659BA7_cntrl_0_temperature": 27, }, }, diff --git a/src/health/health.d/megacli.conf b/src/health/health.d/megacli.conf index 8d71d585bf..27721fa9ad 100644 --- a/src/health/health.d/megacli.conf +++ 
b/src/health/health.d/megacli.conf @@ -38,7 +38,7 @@ component: RAID type: System component: RAID lookup: sum -10s - units: media errors + units: failures every: 10s warn: $this > 0 delay: up 1m down 5m multiplier 2 max 10m diff --git a/src/health/health.d/storcli.conf b/src/health/health.d/storcli.conf new file mode 100644 index 0000000000..0beda76862 --- /dev/null +++ b/src/health/health.d/storcli.conf @@ -0,0 +1,61 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Controllers + + template: storcli_controller_status + on: storcli.controller_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of optimal + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} health + info: RAID controller ${label:controller_number} health status is not optimal + to: sysadmin + + template: storcli_controller_bbu_status + on: storcli.controller_bbu_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of healthy,na + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} BBU health + info: RAID controller ${label:controller_number} BBU is unhealthy + to: sysadmin + +# Physical Drives + + template: storcli_phys_drive_errors + on: storcli.phys_drive_errors + class: Errors + type: System +component: RAID + lookup: sum -10s + units: errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + to: sysadmin + + template: storcli_phys_drive_predictive_failures + on: storcli.phys_drive_predictive_failures + class: Errors + type: System +component: RAID + lookup: sum -10s + 
units: failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + to: sysadmin |