diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2024-04-19 17:38:37 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-19 17:38:37 +0300 |
commit | af8404cebb4821e0fa682e27aeb9714278af6096 (patch) | |
tree | 93bc7e67c8525892ae198bd6244ba22a4a830dac | |
parent | 079f1e312fded06956f8aa05385ba7b20d1a610d (diff) |
go.d add storcli collector (#17454)
18 files changed, 2390 insertions, 2 deletions
diff --git a/src/go/collectors/go.d.plugin/README.md b/src/go/collectors/go.d.plugin/README.md index 6dc519bee1..fc688ada01 100644 --- a/src/go/collectors/go.d.plugin/README.md +++ b/src/go/collectors/go.d.plugin/README.md @@ -114,9 +114,8 @@ see the appropriate collector readme. | [redis](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/redis) | Redis | | [scaleio](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/scaleio) | Dell EMC ScaleIO | | [SNMP](https://github.com/netdata/netdata/blob/master/src/go/collectors/go.d.plugin/modules/snmp) | SNMP | -| [solr](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/solr) | Solr | | [squidlog](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/squidlog) | Squid | -| [springboot2](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/springboot2) | Spring Boot2 | +| [storcli](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/storcli) | Broadcom Hardware RAID | | [supervisord](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/supervisord) | Supervisor | | [systemdunits](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/systemdunits) | Systemd unit state | | [tengine](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/tengine) | Tengine | diff --git a/src/go/collectors/go.d.plugin/config/go.d.conf b/src/go/collectors/go.d.plugin/config/go.d.conf index 86fa940650..ab3a5aca74 100644 --- a/src/go/collectors/go.d.plugin/config/go.d.conf +++ b/src/go/collectors/go.d.plugin/config/go.d.conf @@ -77,6 +77,7 @@ modules: # scaleio: yes # snmp: yes # squidlog: yes +# storcli: yes # supervisord: yes # systemdunits: yes # tengine: yes diff --git a/src/go/collectors/go.d.plugin/config/go.d/storcli.conf 
b/src/go/collectors/go.d.plugin/config/go.d/storcli.conf new file mode 100644 index 0000000000..a4a9e3e0ac --- /dev/null +++ b/src/go/collectors/go.d.plugin/config/go.d/storcli.conf @@ -0,0 +1,5 @@ +## All available configuration options, their descriptions and default values: +## https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/storcli#readme + +jobs: + - name: storcli diff --git a/src/go/collectors/go.d.plugin/modules/init.go b/src/go/collectors/go.d.plugin/modules/init.go index 2790d30c24..69e11617f7 100644 --- a/src/go/collectors/go.d.plugin/modules/init.go +++ b/src/go/collectors/go.d.plugin/modules/init.go @@ -69,6 +69,7 @@ import ( _ "github.com/netdata/netdata/go/go.d.plugin/modules/scaleio" _ "github.com/netdata/netdata/go/go.d.plugin/modules/snmp" _ "github.com/netdata/netdata/go/go.d.plugin/modules/squidlog" + _ "github.com/netdata/netdata/go/go.d.plugin/modules/storcli" _ "github.com/netdata/netdata/go/go.d.plugin/modules/supervisord" _ "github.com/netdata/netdata/go/go.d.plugin/modules/systemdunits" _ "github.com/netdata/netdata/go/go.d.plugin/modules/tengine" diff --git a/src/go/collectors/go.d.plugin/modules/storcli/charts.go b/src/go/collectors/go.d.plugin/modules/storcli/charts.go new file mode 100644 index 0000000000..65cd75a331 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/charts.go @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import ( + "fmt" + "strconv" + "strings" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" +) + +const ( + prioControllerStatus = module.Priority + iota + prioControllerBBUStatus + + prioPhysDriveErrors + prioPhysDrivePredictiveFailures + prioPhysDriveSmartAlertStatus + prioPhysDriveTemperature +) + +var controllerChartsTmpl = module.Charts{ + controllerStatusChartTmpl.Copy(), + controllerBBUStatusChartTmpl.Copy(), +} + +var ( + controllerStatusChartTmpl = module.Chart{ + ID: "controller_%s_status", + Title: 
"Controller status", + Units: "status", + Fam: "cntrl status", + Ctx: "storcli.controller_status", + Type: module.Line, + Priority: prioControllerStatus, + Dims: module.Dims{ + {ID: "cntrl_%s_status_optimal", Name: "optimal"}, + {ID: "cntrl_%s_status_degraded", Name: "degraded"}, + {ID: "cntrl_%s_status_partially_degraded", Name: "partially_degraded"}, + {ID: "cntrl_%s_status_failed", Name: "failed"}, + }, + } + controllerBBUStatusChartTmpl = module.Chart{ + ID: "controller_%s_bbu_status", + Title: "Controller BBU status", + Units: "status", + Fam: "cntrl status", + Ctx: "storcli.controller_bbu_status", + Type: module.Line, + Priority: prioControllerBBUStatus, + Dims: module.Dims{ + {ID: "cntrl_%s_bbu_status_healthy", Name: "healthy"}, + {ID: "cntrl_%s_bbu_status_unhealthy", Name: "unhealthy"}, + {ID: "cntrl_%s_bbu_status_na", Name: "na"}, + }, + } +) + +var physDriveChartsTmpl = module.Charts{ + physDriveMediaErrorsRateChartTmpl.Copy(), + physDrivePredictiveFailuresRateChartTmpl.Copy(), + physDriveSmartAlertStatusChartTmpl.Copy(), + physDriveTemperatureChartTmpl.Copy(), +} + +var ( + physDriveMediaErrorsRateChartTmpl = module.Chart{ + ID: "phys_drive_%s_cntrl_%s_media_errors_rate", + Title: "Physical Drive media errors rate", + Units: "errors/s", + Fam: "pd errors", + Ctx: "storcli.phys_drive_errors", + Type: module.Line, + Priority: prioPhysDriveErrors, + Dims: module.Dims{ + {ID: "phys_drive_%s_cntrl_%s_media_error_count", Name: "media"}, + {ID: "phys_drive_%s_cntrl_%s_other_error_count", Name: "other"}, + }, + } + physDrivePredictiveFailuresRateChartTmpl = module.Chart{ + ID: "phys_drive_%s_cntrl_%s_predictive_failures_rate", + Title: "Physical Drive predictive failures rate", + Units: "failures/s", + Fam: "pd errors", + Ctx: "storcli.phys_drive_predictive_failures", + Type: module.Line, + Priority: prioPhysDrivePredictiveFailures, + Dims: module.Dims{ + {ID: "phys_drive_%s_cntrl_%s_predictive_failure_count", Name: "predictive_failures"}, + }, + } + 
physDriveSmartAlertStatusChartTmpl = module.Chart{ + ID: "phys_drive_%s_cntrl_%s_smart_alert_status", + Title: "Physical Drive SMART alert status", + Units: "status", + Fam: "pd smart", + Ctx: "storcli.phys_drive_smart_alert_status", + Type: module.Line, + Priority: prioPhysDriveSmartAlertStatus, + Dims: module.Dims{ + {ID: "phys_drive_%s_cntrl_%s_smart_alert_status_active", Name: "active"}, + {ID: "phys_drive_%s_cntrl_%s_smart_alert_status_inactive", Name: "inactive"}, + }, + } + physDriveTemperatureChartTmpl = module.Chart{ + ID: "phys_drive_%s_cntrl_%s_temperature", + Title: "Physical Drive temperature", + Units: "Celsius", // the value is the number in front of 'C' in a "28C (82.40 F)" style reading + Fam: "pd temperature", + Ctx: "storcli.phys_drive_temperature", + Type: module.Line, + Priority: prioPhysDriveTemperature, + Dims: module.Dims{ + {ID: "phys_drive_%s_cntrl_%s_temperature", Name: "temperature"}, + }, + } +) + +func (s *StorCli) addControllerCharts(cntrl controllerInfo) { + charts := controllerChartsTmpl.Copy() + + num := strconv.Itoa(cntrl.Basics.Controller) + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, num) + chart.Labels = []module.Label{ + {Key: "controller_number", Value: num}, + {Key: "model", Value: cntrl.Basics.Model}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, num) + } + } + + if err := s.Charts().Add(*charts...); err != nil { + s.Warning(err) + } +} + +func (s *StorCli) addPhysDriveCharts(cntrlNum int, di *driveInfo, ds *driveState, da *driveAttrs) { + charts := physDriveChartsTmpl.Copy() + + if _, ok := parseInt(getDriveTemperature(ds.DriveTemperature)); !ok { + _ = charts.Remove(physDriveTemperatureChartTmpl.ID) + } + + num := strconv.Itoa(cntrlNum) + + var enc, slot string + if parts := strings.Split(di.EIDSlt, ":"); len(parts) == 2 { // EID:Slt + enc, slot = parts[0], parts[1] + } + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, da.WWN, num) + chart.Labels = []module.Label{ + {Key: "controller_number", Value: num}, + {Key: 
"enclosure_number", Value: enc}, + {Key: "slot_number", Value: slot}, + {Key: "media_type", Value: di.Med}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, da.WWN, num) + } + } + + if err := s.Charts().Add(*charts...); err != nil { + s.Warning(err) + } +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect.go b/src/go/collectors/go.d.plugin/modules/storcli/collect.go new file mode 100644 index 0000000000..d9b1c9af2f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/collect.go @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import "fmt" + +func (s *StorCli) collect() (map[string]int64, error) { + cntrlResp, err := s.queryControllersInfo() + if err != nil { + return nil, err + } + + mx := make(map[string]int64) + + if err := s.collectControllersInfo(mx, cntrlResp); err != nil { + return nil, fmt.Errorf("error collecting controller info: %s", err) + } + + drives := cntrlResp.Controllers[0].ResponseData.PDList + driver := cntrlResp.Controllers[0].ResponseData.Version.DriverName + if driver == "megaraid_sas" && len(drives) > 0 { + drivesResp, err := s.queryDrivesInfo() + if err != nil { + return nil, fmt.Errorf("error collecting drives info: %s", err) + } + if err := s.collectMegaRaidDrives(mx, drivesResp); err != nil { + return nil, err + } + } + + return mx, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go new file mode 100644 index 0000000000..259013e6c1 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_controllers.go @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import ( + "encoding/json" + "errors" + "fmt" + "strconv" + "strings" +) + +type ( + controllersInfoResponse struct { + Controllers []struct { + CommandStatus struct { + Controller int `json:"Controller"` + Status string `json:"Status"` + } 
`json:"Command Status"` + ResponseData controllerInfo `json:"Response Data"` + } `json:"Controllers"` + } + controllerInfo struct { + Basics struct { + Controller int `json:"Controller"` + Model string `json:"Model"` + SerialNumber string `json:"Serial Number"` + } `json:"Basics"` + Version struct { + DriverName string `json:"Driver Name"` + } `json:"Version"` + Status struct { + ControllerStatus string `json:"Controller Status"` + BBUStatus storNumber `json:"BBU Status"` + } `json:"Status"` + BBUInfo []struct { + Model string `json:"Model"` + State string `json:"State"` + Temp string `json:"Temp"` + } `json:"BBU_Info"` + PDList []struct { + } `json:"PD LIST"` + } +) + +func (s *StorCli) collectControllersInfo(mx map[string]int64, resp *controllersInfoResponse) error { + for _, v := range resp.Controllers { + cntrl := v.ResponseData + + idx := strconv.Itoa(cntrl.Basics.Controller) + if !s.controllers[idx] { + s.controllers[idx] = true + s.addControllerCharts(cntrl) + } + + px := fmt.Sprintf("cntrl_%s_", idx) + + for _, st := range []string{"optimal", "degraded", "partially_degraded", "failed"} { + mx[px+"status_"+st] = 0 + } + mx[px+"status_"+strings.ReplaceAll(strings.ToLower(cntrl.Status.ControllerStatus), " ", "_")] = 1 // multi-word statuses must map onto the underscore-separated dimension IDs above + + for _, st := range []string{"healthy", "unhealthy", "na"} { + mx[px+"bbu_status_"+st] = 0 + } + // https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/issues/27 + switch cntrl.Status.BBUStatus { + case "0", "8", "4096": // 0 good, 8 charging + mx[px+"bbu_status_healthy"] = 1 + case "NA", "N/A": + mx[px+"bbu_status_na"] = 1 + default: + mx[px+"bbu_status_unhealthy"] = 1 + } + } + return nil +} + +func (s *StorCli) queryControllersInfo() (*controllersInfoResponse, error) { + bs, err := s.exec.controllersInfo() + if err != nil { + return nil, err + } + + if len(bs) == 0 { + return nil, errors.New("empty response") + } + + var resp controllersInfoResponse + if err := json.Unmarshal(bs, &resp); err != nil { + return nil, err + } + if 
len(resp.Controllers) == 0 { + return nil, errors.New("no controllers found") + } + if st := resp.Controllers[0].CommandStatus.Status; st != "Success" { + return nil, fmt.Errorf("command status error: %s", st) + } + + return &resp, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go new file mode 100644 index 0000000000..353728d6dc --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/collect_drives.go @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import ( + "encoding/json" + "errors" + "fmt" + "strconv" + "strings" +) + +type drivesInfoResponse struct { + Controllers []struct { + CommandStatus struct { + Controller int `json:"Controller"` + Status string `json:"Status"` + } `json:"Command Status"` + ResponseData map[string]json.RawMessage `json:"Response Data"` + } `json:"Controllers"` +} + +type ( + driveInfo struct { + EIDSlt string `json:"EID:Slt"` + DID int `json:"DID"` + State string `json:"State"` + DG int `json:"DG"` + Size string `json:"Size"` + Intf string `json:"Intf"` + Med string `json:"Med"` + SED string `json:"SED"` + PI string `json:"PI"` + SeSz string `json:"SeSz"` + Model string `json:"Model"` + Sp string `json:"Sp"` + Type string `json:"Type"` + } + driveState struct { + MediaErrorCount storNumber `json:"Media Error Count"` + OtherErrorCount storNumber `json:"Other Error Count"` + DriveTemperature string `json:"Drive Temperature"` + PredictiveFailureCount storNumber `json:"Predictive Failure Count"` + SmartAlertFlagged string `json:"S.M.A.R.T alert flagged by drive"` + } + driveAttrs struct { + WWN string `json:"WWN"` + DeviceSpeed string `json:"Device Speed"` + LinkSpeed string `json:"Link Speed"` + } +) + +type storNumber string // some int values can be 'N/A'; holds the JSON value as text, without the quotes of a JSON string + +func (n *storNumber) UnmarshalJSON(b []byte) error { *n = storNumber(strings.Trim(string(b), `"`)); return nil } + +func (s *StorCli) collectMegaRaidDrives(mx 
map[string]int64, resp *drivesInfoResponse) error { + for _, cntrl := range resp.Controllers { + var ids []string + for k := range cntrl.ResponseData { + if !strings.HasSuffix(k, "Detailed Information") { + continue + } + parts := strings.Fields(k) // Drive /c0/e252/s0 - Detailed Information + if len(parts) < 2 { + continue + } + id := parts[1] + if strings.IndexByte(id, '/') == -1 { + continue + } + ids = append(ids, id) + } + + cntrlIdx := cntrl.CommandStatus.Controller + + for _, id := range ids { + info, err := getDriveInfo(cntrl.ResponseData, id) + if err != nil { + return err + } + data, err := getDriveDetailedInfo(cntrl.ResponseData, id) + if err != nil { + return err + } + state, err := getDriveState(data, id) + if err != nil { + return err + } + attrs, err := getDriveAttrs(data, id) + if err != nil { + return err + } + + if attrs.WWN == "" { + continue + } + + if !s.drives[attrs.WWN] { + s.drives[attrs.WWN] = true + s.addPhysDriveCharts(cntrlIdx, info, state, attrs) + } + + px := fmt.Sprintf("phys_drive_%s_cntrl_%d_", attrs.WWN, cntrlIdx) + + if v, ok := parseInt(string(state.MediaErrorCount)); ok { + mx[px+"media_error_count"] = v + } + if v, ok := parseInt(string(state.OtherErrorCount)); ok { + mx[px+"other_error_count"] = v + } + if v, ok := parseInt(string(state.PredictiveFailureCount)); ok { + mx[px+"predictive_failure_count"] = v + } + if v, ok := parseInt(getDriveTemperature(state.DriveTemperature)); ok { + mx[px+"temperature"] = v + } + // exactly one of the two S.M.A.R.T alert status dimensions is set to 1 + if state.SmartAlertFlagged == "Yes" { + mx[px+"smart_alert_status_active"], mx[px+"smart_alert_status_inactive"] = 1, 0 + } else { + mx[px+"smart_alert_status_active"], mx[px+"smart_alert_status_inactive"] = 0, 1 + } + } + } + + return nil +} + +func (s *StorCli) queryDrivesInfo() (*drivesInfoResponse, error) { + bs, err := s.exec.drivesInfo() + if err != nil { + return nil, err + } + + if len(bs) == 0 { + return nil, errors.New("empty response") + } + + var resp drivesInfoResponse + if err := json.Unmarshal(bs, &resp); err != nil { + return nil, err + 
} + + if len(resp.Controllers) == 0 { + return nil, errors.New("no controllers found") + } + if st := resp.Controllers[0].CommandStatus.Status; st != "Success" { + return nil, fmt.Errorf("command status error: %s", st) + } + + return &resp, nil +} + +func getDriveInfo(respData map[string]json.RawMessage, id string) (*driveInfo, error) { + k := fmt.Sprintf("Drive %s", id) + raw, ok := respData[k] + if !ok { + return nil, fmt.Errorf("drive info not found for '%s'", id) + } + + var drive []driveInfo + if err := json.Unmarshal(raw, &drive); err != nil { + return nil, err + } + + if len(drive) == 0 { + return nil, fmt.Errorf("drive info not found for '%s'", id) + } + + return &drive[0], nil +} + +func getDriveDetailedInfo(respData map[string]json.RawMessage, id string) (map[string]json.RawMessage, error) { + k := fmt.Sprintf("Drive %s - Detailed Information", id) + raw, ok := respData[k] + if !ok { + return nil, fmt.Errorf("drive detailed info not found for '%s'", id) + } + + var info map[string]json.RawMessage + if err := json.Unmarshal(raw, &info); err != nil { + return nil, err + } + + return info, nil +} + +func getDriveState(driveDetailedInfo map[string]json.RawMessage, id string) (*driveState, error) { + k := fmt.Sprintf("Drive %s State", id) + raw, ok := driveDetailedInfo[k] + if !ok { + return nil, fmt.Errorf("drive detailed info state not found for '%s'", id) + } + + var state driveState + if err := json.Unmarshal(raw, &state); err != nil { + return nil, err + } + + return &state, nil +} + +func getDriveAttrs(driveDetailedInfo map[string]json.RawMessage, id string) (*driveAttrs, error) { + k := fmt.Sprintf("Drive %s Device attributes", id) + raw, ok := driveDetailedInfo[k] + if !ok { + return nil, fmt.Errorf("drive detailed info device attributes not found for '%s'", id) + } + + var attrs driveAttrs + if err := json.Unmarshal(raw, &attrs); err != nil { + return nil, err + } + + return &attrs, nil +} + +func getDriveTemperature(s string) string { + // ' 28C (82.40 F)' + i 
:= strings.IndexByte(s, 'C') + if i == -1 { + return "" + } + return strings.TrimSpace(s[:i]) +} + +func parseInt(s string) (int64, bool) { + i, err := strconv.ParseInt(s, 10, 64) + return i, err == nil +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/config_schema.json b/src/go/collectors/go.d.plugin/modules/storcli/config_schema.json new file mode 100644 index 0000000000..226a370f43 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/config_schema.json @@ -0,0 +1,35 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "StorCLI collector configuration.", + "type": "object", + "properties": { + "update_every": { + "title": "Update every", + "description": "Data collection interval, measured in seconds.", + "type": "integer", + "minimum": 1, + "default": 10 + }, + "timeout": { + "title": "Timeout", + "description": "Timeout for executing the binary, specified in seconds.", + "type": "number", + "minimum": 0.5, + "default": 2 + } + }, + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "timeout": { + "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." 
+ } + } +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/exec.go b/src/go/collectors/go.d.plugin/modules/storcli/exec.go new file mode 100644 index 0000000000..3375ddbe4f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/exec.go @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import ( + "context" + "fmt" + "os/exec" + "time" + + "github.com/netdata/netdata/go/go.d.plugin/logger" +) + +func newStorCliExec(ndsudoPath string, timeout time.Duration, log *logger.Logger) *storCliExec { + return &storCliExec{ + Logger: log, + ndsudoPath: ndsudoPath, + timeout: timeout, + } +} + +type storCliExec struct { + *logger.Logger + + ndsudoPath string + timeout time.Duration +} + +func (e *storCliExec) controllersInfo() ([]byte, error) { + return e.execute("storcli-controllers-info") +} + +func (e *storCliExec) drivesInfo() ([]byte, error) { + return e.execute("storcli-drives-info") +} + +func (e *storCliExec) execute(args ...string) ([]byte, error) { + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + defer cancel() + + cmd := exec.CommandContext(ctx, e.ndsudoPath, args...) 
+ e.Debugf("executing '%s'", cmd) + + bs, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("error on '%s': %v", cmd, err) + } + + return bs, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/init.go b/src/go/collectors/go.d.plugin/modules/storcli/init.go new file mode 100644 index 0000000000..297f7c8c3e --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/init.go @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package storcli + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/netdata/netdata/go/go.d.plugin/agent/executable" +) + +func (s *StorCli) initStorCliExec() (storCli, error) { + ndsudoPath := filepath.Join(executable.Directory, "ndsudo") + + if _, err := os.Stat(ndsudoPath); err != nil { + return nil, fmt.Errorf("ndsudo executable not found: %v", err) + } + + storExec := newStorCliExec(ndsudoPath, s.Timeout.Duration(), s.Logger) + + return storExec, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml new file mode 100644 index 0000000000..ecf97fb442 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/storcli/metadata.yaml @@ -0,0 +1,153 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-storcli + plugin_name: go.d.plugin + module_name: storcli + monitored_instance: + name: StorCLI RAID + link: "https://docs.broadcom.com/doc/12352476" + icon_filename: "hard-drive.svg" + categories: + - data-collection.storage-mount-points-and-filesystems + keywords: + - storage + - raid-controller + - manage-disks + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: | + Monitors the health of StorCLI Hardware RAID by tracking the status of RAID adapters, physical drives, and backup batteries in your storage system. 
+ It relies on the [`storcli`](https://docs.broadcom.com/doc/12352476) CLI tool but avoids directly executing the binary. + Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. + This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management. + + Executed commands: + - `storcli /cALL show all J nolog` + - `storcli /cALL/eALL/sALL show all J nolog` + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: false + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: [] + configuration: + file: + name: go.d/storcli.conf + options: + description: | + The following options can be defined globally: update_every. + folding: + title: Config options + enabled: true + list: + - name: update_every + description: Data collection frequency. + default_value: 10 + required: false + - name: timeout + description: storcli binary execution timeout. + default_value: 2 + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Custom update_every + description: Allows you to override the default data collection interval. + config: | + jobs: + - name: storcli + update_every: 5 # Collect StorCLI RAID statistics every 5 seconds + troubleshooting: + problems: + list: [] + alerts: [] + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: controller + description: These metrics refer to the Controller. 
+ labels: + - name: controller_number + description: Controller number (index) + - name: model + description: Controller model + metrics: + - name: storcli.controller_status + description: Controller status + unit: status + chart_type: line + dimensions: + - name: optimal + - name: degraded + - name: partially_degraded + - name: failed + - name: storcli.controller_bbu_status + description: Controller BBU status + unit: status + chart_type: line + dimensions: + - name: healthy + - name: unhealthy + - name: na + - name: physical drive + description: These metrics refer to the Physical Drive. + labels: + - name: controller_number + description: Controller number (index) + - name: enclosure_number + description: Enclosure number (index) + - name: slot_number + description: Slot number (index) + - name: media_type + description: Media type (e.g. HDD) + metrics: + - name: storcli.phys_drive_errors + description: Physical Drive media errors rate + unit: errors/s + chart_type: line + dimensions: + - name: media + - name: other + - name: storcli.phys_drive_predictive_failures + description: Physical Drive predictive failures rate + unit: failures/s + chart_type: line + dimensions: + - name: predictive_failures + - name: storcli.phys_drive_smart_alert_status + description: Physical Drive SMART alert status + unit: status + chart_type: line + dimensions: + - name: active + - name: inactive + - name: storcli.phys_drive_temperature + description: Physical Drive temperature + unit: Celsius + chart_type: line + dimensions: + - name: temperature |