diff options
author | Ilya Mashchenko <ilya@netdata.cloud> | 2024-04-30 18:24:46 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-30 18:24:46 +0300 |
commit | e1c4f7fec367a610c9c8beea1789c51c6434d51a (patch) | |
tree | 1c2700683e9fa5079c39e2108274b0635b6abdb2 | |
parent | b1bb1e9ded69b68dcc31cb7e2303f5fd80d8c1bf (diff) |
go.d smartctl (#17536)
23 files changed, 2973 insertions, 2 deletions
diff --git a/src/go/collectors/go.d.plugin/README.md b/src/go/collectors/go.d.plugin/README.md index de64966164..4d0718fc0c 100644 --- a/src/go/collectors/go.d.plugin/README.md +++ b/src/go/collectors/go.d.plugin/README.md @@ -118,6 +118,7 @@ see the appropriate collector readme. | [sensors](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules) | Hardware Sensors | | [SNMP](https://github.com/netdata/netdata/blob/master/src/go/collectors/go.d.plugin/modules/snmp) | SNMP | | [squidlog](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/squidlog) | Squid | +| [smartctl](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl) | S.M.A.R.T Storage Devices | | [storcli](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/storcli) | Broadcom Hardware RAID | | [supervisord](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/supervisord) | Supervisor | | [systemdunits](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/systemdunits) | Systemd unit state | @@ -213,8 +214,7 @@ sudo su -s /bin/bash netdata ./go.d.plugin -d -m <module name> ``` -Change `<module name>` to the module name you want to debug. See the [whole list](#available-modules) of available -modules. +Change `<module name>` to the [module name](#available-modules) you want to debug. 
## Netdata Community diff --git a/src/go/collectors/go.d.plugin/config/go.d.conf b/src/go/collectors/go.d.plugin/config/go.d.conf index 4d143b1cf9..34abed37f4 100644 --- a/src/go/collectors/go.d.plugin/config/go.d.conf +++ b/src/go/collectors/go.d.plugin/config/go.d.conf @@ -80,6 +80,7 @@ modules: # sensors: yes # snmp: yes # squidlog: yes +# smartctl: yes # storcli: yes # supervisord: yes # systemdunits: yes diff --git a/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf new file mode 100644 index 0000000000..dea5116be9 --- /dev/null +++ b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf @@ -0,0 +1,5 @@ +## All available configuration options, their descriptions and default values: +## https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl#readme + +jobs: + - name: smartctl diff --git a/src/go/collectors/go.d.plugin/go.mod b/src/go/collectors/go.d.plugin/go.mod index e5885c23e3..558223f426 100644 --- a/src/go/collectors/go.d.plugin/go.mod +++ b/src/go/collectors/go.d.plugin/go.mod @@ -119,6 +119,9 @@ require ( github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cast v1.3.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/tidwall/gjson v1.17.1 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.0 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect diff --git a/src/go/collectors/go.d.plugin/go.sum b/src/go/collectors/go.d.plugin/go.sum index 9bd073665e..b8146164a4 100644 --- a/src/go/collectors/go.d.plugin/go.sum +++ b/src/go/collectors/go.d.plugin/go.sum @@ -332,6 +332,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.9.0 
h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tidwall/gjson v1.17.1 h1:wlYEnwqAHgzmhNUFfw7Xalt2JzQvsMx2Se4PcoFCT/U= +github.com/tidwall/gjson v1.17.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19 h1:ZCmSnT6CLGhfoQ2lPEhL4nsJstKDCw1F1RfN8/smTCU= github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19/go.mod h1:SXTY+QvI+KTTKXQdg0zZ7nx0u94QWh8ZAwBQYsW9cqk= github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ= diff --git a/src/go/collectors/go.d.plugin/modules/init.go b/src/go/collectors/go.d.plugin/modules/init.go index b0f1f9e838..c8262096a6 100644 --- a/src/go/collectors/go.d.plugin/modules/init.go +++ b/src/go/collectors/go.d.plugin/modules/init.go @@ -70,6 +70,7 @@ import ( _ "github.com/netdata/netdata/go/go.d.plugin/modules/redis" _ "github.com/netdata/netdata/go/go.d.plugin/modules/scaleio" _ "github.com/netdata/netdata/go/go.d.plugin/modules/sensors" + _ "github.com/netdata/netdata/go/go.d.plugin/modules/smartctl" _ "github.com/netdata/netdata/go/go.d.plugin/modules/snmp" _ "github.com/netdata/netdata/go/go.d.plugin/modules/squidlog" _ "github.com/netdata/netdata/go/go.d.plugin/modules/storcli" diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/charts.go b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go new file mode 100644 index 0000000000..7ad4ea4c40 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + 
+package smartctl + +import ( + "fmt" + "strings" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" +) + +const ( + prioDeviceSmartStatus = module.Priority + iota + prioDeviceAtaSmartErrorLogCount + prioDevicePowerOnTime + prioDeviceTemperature + prioDevicePowerCycleCount + + prioDeviceSmartAttributeDecoded + prioDeviceSmartAttributeNormalized +) + +var deviceChartsTmpl = module.Charts{ + devicePowerOnTimeChartTmpl.Copy(), + deviceTemperatureChartTmpl.Copy(), + devicePowerCycleCountChartTmpl.Copy(), + deviceSmartStatusChartTmpl.Copy(), + deviceAtaSmartErrorLogCountChartTmpl.Copy(), +} + +var ( + deviceSmartStatusChartTmpl = module.Chart{ + ID: "device_%s_type_%s_smart_status", + Title: "Device smart status", + Units: "status", + Fam: "smart status", + Ctx: "smartctl.device_smart_status", + Type: module.Line, + Priority: prioDeviceSmartStatus, + Dims: module.Dims{ + {ID: "device_%s_type_%s_smart_status_passed", Name: "passed"}, + {ID: "device_%s_type_%s_smart_status_failed", Name: "failed"}, + }, + } + deviceAtaSmartErrorLogCountChartTmpl = module.Chart{ + ID: "device_%s_type_%s_ata_smart_error_log_count", + Title: "Device ATA smart error log count", + Units: "logs", + Fam: "smart error log", + Ctx: "smartctl.device_ata_smart_error_log_count", + Type: module.Line, + Priority: prioDeviceAtaSmartErrorLogCount, + Dims: module.Dims{ + {ID: "device_%s_type_%s_ata_smart_error_log_summary_count", Name: "error_log"}, + }, + } + devicePowerOnTimeChartTmpl = module.Chart{ + ID: "device_%s_type_%s_power_on_time", + Title: "Device power on time", + Units: "seconds", + Fam: "power on time", + Ctx: "smartctl.device_power_on_time", + Type: module.Line, + Priority: prioDevicePowerOnTime, + Dims: module.Dims{ + {ID: "device_%s_type_%s_power_on_time", Name: "power_on_time"}, + }, + } + deviceTemperatureChartTmpl = module.Chart{ + ID: "device_%s_type_%s_temperature", + Title: "Device temperature", + Units: "Celsius", + Fam: "temperature", + Ctx: 
"smartctl.device_temperature", + Type: module.Line, + Priority: prioDeviceTemperature, + Dims: module.Dims{ + {ID: "device_%s_type_%s_temperature", Name: "temperature"}, + }, + } + devicePowerCycleCountChartTmpl = module.Chart{ + ID: "device_%s_type_%s_power_cycle_count", + Title: "Device power cycles", + Units: "cycles", + Fam: "power cycles", + Ctx: "smartctl.device_power_cycles_count", + Type: module.Line, + Priority: prioDevicePowerCycleCount, + Dims: module.Dims{ + {ID: "device_%s_type_%s_power_cycle_count", Name: "power"}, + }, + } +) + +var ( + deviceSmartAttributeDecodedChartTmpl = module.Chart{ + ID: "device_%s_type_%s_smart_attr_%s", + Title: "Device smart attribute %s", + Units: "value", + Fam: "attr %s", + Ctx: "smartctl.device_smart_attr_%s", + Type: module.Line, + Priority: prioDeviceSmartAttributeDecoded, + Dims: module.Dims{ + {ID: "device_%s_type_%s_attr_%s_decoded", Name: "%s"}, + }, + } + deviceSmartAttributeNormalizedChartTmpl = module.Chart{ + ID: "device_%s_type_%s_smart_attr_%s_normalized", + Title: "Device smart attribute normalized %s", + Units: "value", + Fam: "attr %s", + Ctx: "smartctl.device_smart_attr_%s_normalized", + Type: module.Line, + Priority: prioDeviceSmartAttributeNormalized, + Dims: module.Dims{ + {ID: "device_%s_type_%s_attr_%s_normalized", Name: "%s"}, + }, + } +) + +func (s *Smartctl) addDeviceCharts(dev *smartDevice) { + charts := module.Charts{} + + if cs := s.newDeviceCharts(dev); cs != nil && len(*cs) > 0 { + if err := charts.Add(*cs...); err != nil { + s.Warning(err) + } + } + if cs := s.newDeviceSmartAttrCharts(dev); cs != nil && len(*cs) > 0 { + if err := charts.Add(*cs...); err != nil { + s.Warning(err) + } + } + + if err := s.Charts().Add(charts...); err != nil { + s.Warning(err) + } +} + +// removeDeviceCharts marks as removed every chart whose ID starts with the device's chart ID prefix. +// The prefix has to follow the "device_%s_type_%s_" layout of the chart ID templates, otherwise nothing matches. 
+// NOTE(review): assumes scanDev.shortName() and scanDev.typ yield the same values as the deviceName()/deviceType() used to build the chart IDs - confirm. +func (s *Smartctl) removeDeviceCharts(scanDev *scanDevice) { + px := fmt.Sprintf("device_%s_type_%s_", scanDev.shortName(), scanDev.typ) + + for _, chart := range *s.Charts() { + if strings.HasPrefix(chart.ID, px) { + 
chart.MarkRemove() + chart.MarkNotCreated() + } + } +} + +// newDeviceCharts returns a copy of the device chart templates, minus the charts for metrics the device +// does not report, with chart IDs, dimension IDs and labels filled in from the device. +func (s *Smartctl) newDeviceCharts(dev *smartDevice) *module.Charts { + + charts := deviceChartsTmpl.Copy() + + if _, ok := dev.powerOnTime(); !ok { + _ = charts.Remove(devicePowerOnTimeChartTmpl.ID) + } + if _, ok := dev.temperature(); !ok { + _ = charts.Remove(deviceTemperatureChartTmpl.ID) + } + if _, ok := dev.powerCycleCount(); !ok { + // drop the power cycle chart itself, not the power-on-time chart + _ = charts.Remove(devicePowerCycleCountChartTmpl.ID) + } + if _, ok := dev.smartStatusPassed(); !ok { + _ = charts.Remove(deviceSmartStatusChartTmpl.ID) + } + if _, ok := dev.ataSmartErrorLogCount(); !ok { + _ = charts.Remove(deviceAtaSmartErrorLogCountChartTmpl.ID) + } + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, dev.deviceName(), dev.deviceType()) + chart.Labels = []module.Label{ + {Key: "device_name", Value: dev.deviceName()}, + {Key: "device_type", Value: dev.deviceType()}, + {Key: "model_name", Value: dev.modelName()}, + {Key: "serial_number", Value: dev.serialNumber()}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, dev.deviceName(), dev.deviceType()) + } + } + + return charts +} + +func (s *Smartctl) newDeviceSmartAttrCharts(dev *smartDevice) *module.Charts { + attrs, ok := dev.ataSmartAttributeTable() + if !ok { + return nil + } + charts := module.Charts{} + + for _, attr := range attrs { + if !isSmartAttrValid(attr) || strings.HasPrefix(attr.name(), "Unknown") { + continue + } + + cs := module.Charts{ + deviceSmartAttributeDecodedChartTmpl.Copy(), + deviceSmartAttributeNormalizedChartTmpl.Copy(), + } + + name := cleanAttributeName(attr) + + // FIXME: attribute charts unit + for _, chart := range cs { + chart.ID = fmt.Sprintf(chart.ID, dev.deviceName(), dev.deviceType(), name) + chart.Title = fmt.Sprintf(chart.Title, attr.name()) + chart.Fam = fmt.Sprintf(chart.Fam, name) + chart.Ctx = fmt.Sprintf(chart.Ctx, name) + chart.Labels = []module.Label{ + {Key: "device_name", Value: dev.deviceName()}, + {Key: "device_type", 
Value: dev.deviceType()}, + {Key: "model_name", Value: dev.modelName()}, + {Key: "serial_number", Value: dev.serialNumber()}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, dev.deviceName(), dev.deviceType(), name) + dim.Name = fmt.Sprintf(dim.Name, name) + } + } + + if err := charts.Add(cs...); err != nil { + s.Warning(err) + } + } + + return &charts +} + +var attrNameReplacer = strings.NewReplacer(" ", "_", "/", "_") + +func cleanAttributeName(attr *smartAttribute) string { + return strings.ToLower(attrNameReplacer.Replace(attr.name())) +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/collect.go b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go new file mode 100644 index 0000000000..79cbb13d02 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package smartctl + +import ( + "fmt" + "maps" + "slices" + "strconv" + "strings" + "time" + + "github.com/tidwall/gjson" +) + +func (s *Smartctl) collect() (map[string]int64, error) { + now := time.Now() + + if s.forceScan || s.isTimeToScan(now) { + devices, err := s.scanDevices() + if err != nil { + return nil, err + } + + for k, dev := range s.scannedDevices { + if _, ok := devices[k]; !ok { + delete(s.scannedDevices, k) + delete(s.seenDevices, k) + s.removeDeviceCharts(dev) + } + } + + s.forceDevicePoll = !maps.Equal(s.scannedDevices, devices) + s.scannedDevices = devices + s.lastScanTime = now + s.forceScan = false + } + + if s.forceDevicePoll || s.isTimeToPollDevices(now) { + mx := make(map[string]int64) + + // TODO: make it concurrent + for _, d := range s.scannedDevices { + if err := s.collectScannedDevice(mx, d); err != nil { + return nil, err + } + } + + s.forceDevicePoll = false + s.lastDevicePollTime = now + s.mx = mx + } + + return s.mx, nil +} + +func (s *Smartctl) collectScannedDevice(mx map[string]int64, scanDev *scanDevice) error { + resp, err := 
s.exec.deviceInfo(scanDev.name, scanDev.typ, s.NoCheckPowerMode) + if err != nil { + if resp != nil && isDeviceOpenFailedNoSuchDevice(resp) { + s.Infof("smartctl reported that device '%s' type '%s' no longer exists", scanDev.name, scanDev.typ) + s.forceScan = true + return nil + } + return fmt.Errorf("failed to get device info for '%s' type '%s': %v", scanDev.name, scanDev.typ, err) + } + + if isDeviceInLowerPowerMode(resp) { + s.Debugf("device '%s' type '%s' is in a low-power mode, skipping", scanDev.name, scanDev.typ) + return nil + } + + dev := newSmartDevice(resp) + if !isSmartDeviceValid(dev) { + return nil + } + + if !s.seenDevices[scanDev.key()] { + s.seenDevices[scanDev.key()] = true + s.addDeviceCharts(dev) + } + + s.collectSmartDevice(mx, dev) + + return nil +} + +func (s *Smartctl) collectSmartDevice(mx map[string]int64, dev *smartDevice) { + px := fmt.Sprintf("device_%s_type_%s_", dev.deviceName(), dev.deviceType()) + + if v, ok := dev.powerOnTime(); ok { + mx[px+"power_on_time"] = v + } + if v, ok := dev.temperature(); ok { + mx[px+"temperature"] = v + } + if v, ok := dev.powerCycleCount(); ok { + mx[px+"power_cycle_count"] = v + } + if v, ok := dev.smartStatusPassed(); ok { + mx[px+"smart_status_passed"] = 0 + mx[px+"smart_status_failed"] = 0 + if v { + mx[px+"smart_status_passed"] = 1 + } else { + mx[px+"smart_status_failed"] = 1 + } + } + if v, ok := dev.ataSmartErrorLogCount(); ok { + mx[px+"ata_smart_error_log_summary_count"] = v + } + + if attrs, ok := dev.ataSmartAttributeTable(); ok { + for _, attr := range attrs { + if !isSmartAttrValid(attr) { + continue + } + n := strings.ToLower(attr.name()) + n = strings.ReplaceAll(n, " ", "_") + px := fmt.Sprintf("%sattr_%s_", px, n) + + if v, err := strconv.ParseInt(attr.value(), 10, 64); err == nil { + mx[px+"normalized"] = v + } + + if v, err := strconv.ParseInt(attr.rawValue(), 10, 64); err == nil { + mx[px+"raw"] = v + } + + rs := strings.TrimSpace(attr.rawString()) + if i := strings.IndexByte(rs, ' 
'); i != -1 { + rs = rs[:i] + } + if v, err := strconv.ParseInt(rs, 10, 64); err == nil { + mx[px+"decoded"] = v + } + } + } +} + +func (s *Smartctl) isTimeToScan(now time.Time) bool { + return now.After(s.lastScanTime.Add(s.ScanEvery.Duration())) +} + +func (s *Smartctl) isTimeToPollDevices(now time.Time) bool { + return now.After(s.lastDevicePollTime.Add(s.PollDevicesEvery.Duration())) + +} + +func isSmartDeviceValid(d *smartDevice) bool { + return d.deviceName() != "" && d.deviceType() != "" +} + +func isSmartAttrValid(a *smartAttribute) bool { + return a.id() != "" && a.name() != "" +} + +func isDeviceInLowerPowerMode(r *gjson.Result) bool { + if !isExitStatusHasBit(r, 1) { + return false + } + + messages := r.Get("smartctl.messages").Array() + + return slices.ContainsFunc(messages, func(msg gjson.Result) bool { + text := msg.Get("string").String() + return strings.HasPrefix(text, "Device is in") && strings.Contains(text, "mode") + }) +} + +func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool { + if !isExitStatusHasBit(r, 1) { + return false + } + + messages := r.Get("smartctl.messages").Array() + + return slices.ContainsFunc(messages, func(msg gjson.Result) bool { + text := msg.Get("string").String() + return strings.HasSuffix(text, "No such device") + }) +} + +func isExitStatusHasBit(r *gjson.Result, bit int) bool { + // https://manpages.debian.org/bullseye/smartmontools/smartctl.8.en.html#EXIT_STATUS + status := int(r.Get("smartctl.exit_status").Int()) + mask := 1 << bit + return (status & mask) != 0 +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json new file mode 100644 index 0000000000..273899cd98 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json @@ -0,0 +1,67 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Smartctl collector configuration.", + "type": "object", + 
"properties": { + "update_every": { + "title": "Update every", + "description": "Interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**.", + "type": "integer", + "minimum": 1, + "default": 10 + }, + "timeout": { + "title": "Timeout", + "description": "Timeout for executing the `smartctl` binary, specified in seconds.", + "type": "number", + "minimum": 0.5, + "default": 5 + }, + "scan_every": { + "title": "Scan interval", + "description": "Interval for discovering new devices using `smartctl --scan`, measured in seconds.", + "type": "number", + "minimum": 1, + "default": 900 + }, + "poll_devices_every": { + "title": "Devices poll interval", + "description": "Interval for gathering data for every device, measured in seconds. Data is cached for this interval.", + "type": "number", + "minimum": 1, + "default": 300 + }, + "no_check_power_mode": { + "title": "No check power mode", + "description": "ATA only. Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up.", + "type": "string", + "enum": [ + "standby", + "never", + "sleep", + "idle" + ], + "default": "standby" + } + }, + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "timeout": { + "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." 
+ }, + "no_check_power_mode": { + "ui:widget": "radio", + "ui:options": { + "inline": true + } + } + } +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/exec.go b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go new file mode 100644 index 0000000000..a90e1b529b --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package smartctl + +import ( + "context" + "errors" + "fmt" + "os/exec" + "time" + + "github.com/netdata/netdata/go/go.d.plugin/logger" + + "github.com/tidwall/gjson" +) + +func newSmartctlCliExec(ndsudoPath string, timeout time.Duration, log *logger.Logger) *smartctlCliExec { + return &smartctlCliExec{ + Logger: log, + ndsudoPath: ndsudoPath, + timeout: timeout, + } +} + +type smartctlCliExec struct { + *logger.Logger + + ndsudoPath string + timeout time.Duration +} + +func (e *smartctlCliExec) scan() (*gjson.Result, error) { + return e.execute("smartctl-json-scan") +} + +func (e *smartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) { + return e.execute("smartctl-json-device-info", + "--deviceName", deviceName, + "--deviceType", deviceType, + "--powerMode", powerMode, + ) +} + +func (e *smartctlCliExec) execute(args ...string) (*gjson.Result, error) { + ctx, cancel := context.WithTimeout(context.Background(), e.timeout) + defer cancel() + + cmd := exec.CommandContext(ctx, e.ndsudoPath, args...) 
+ e.Debugf("executing '%s'", cmd) + + bs, err := cmd.Output() + if err != nil { + if errors.Is(err, context.DeadlineExceeded) || isExecExitCode(err, 1) || len(bs) == 0 { + return nil, fmt.Errorf("'%s' execution failed: %v", cmd, err) + } + } + if len(bs) == 0 { + return nil, fmt.Errorf("'%s' returned no output", cmd) + } + + if !gjson.ValidBytes(bs) { + return nil, fmt.Errorf("'%s' returned invalid JSON output", cmd) + } + + res := gjson.ParseBytes(bs) + if !res.Get("smartctl.exit_status").Exists() { + return nil, fmt.Errorf("'%s' returned unexpected data", cmd) + } + + for _, msg := range res.Get("smartctl.messages").Array() { + if msg.Get("severity").String() == "error" { + return &res, fmt.Errorf("'%s' reported an error: %s", cmd, msg.Get("string")) + } + } + + return &res, nil +} + +func isExecExitCode(err error, exitCode int) bool { + var v *exec.ExitError + return errors.As(err, &v) && v.ExitCode() == exitCode +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/init.go b/src/go/collectors/go.d.plugin/modules/smartctl/init.go new file mode 100644 index 0000000000..5c6ede5316 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/init.go @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package smartctl + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/netdata/netdata/go/go.d.plugin/agent/executable" +) + +func (s *Smartctl) validateConfig() error { + switch s.NoCheckPowerMode { + case "never", "sleep", "standby", "idle": + default: + return fmt.Errorf("invalid power mode '%s'", s.NoCheckPowerMode) + } + return nil +} + +func (s *Smartctl) initSmartctlCli() (smartctlCli, error) { + ndsudoPath := filepath.Join(executable.Directory, "ndsudo") + if _, err := os.Stat(ndsudoPath); err != nil { + return nil, fmt.Errorf("ndsudo executable not found: %v", err) + + } + + smartctlExec := newSmartctlCliExec(ndsudoPath, s.Timeout.Duration(), s.Logger) + + return smartctlExec, nil +} diff --git 
a/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml new file mode 100644 index 0000000000..0ef0843f99 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml @@ -0,0 +1,154 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-smartctl + plugin_name: go.d.plugin + module_name: smartctl + monitored_instance: + name: S.M.A.R.T. + link: "https://linux.die.net/man/8/smartd" + icon_filename: "smart.png" + categories: + - data-collection.hardware-devices-and-sensors + keywords: + - smart + - S.M.A.R.T. + - SCSI devices + - ATA devices + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: | + This collector monitors the health status of storage devices by analyzing S.M.A.R.T. (Self-Monitoring, Analysis, and Reporting Technology) counters. + It relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary. + Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. + This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management. + + Executed commands: + - `smartctl --json --scan` + - `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}` + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: false + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: [] + configuration: + file: + name: go.d/smartctl.conf + options: + description: | + The following options can be defined globally: update_every. 
+ folding: + title: Config options + enabled: true + list: + - name: update_every + description: interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. + default_value: 10 + required: false + - name: timeout + description: smartctl binary execution timeout. + default_value: 5 + required: false + - name: scan_every + description: interval for discovering new devices using `smartctl --scan`, measured in seconds. + default_value: 900 + required: false + - name: poll_devices_every + description: interval for gathering data for every device, measured in seconds. Data is cached for this interval. + default_value: 300 + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Custom devices poll interval + description: Allows you to override the default devices poll interval (data collection). + config: | |