From e1c4f7fec367a610c9c8beea1789c51c6434d51a Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Tue, 30 Apr 2024 18:24:46 +0300 Subject: go.d smartctl (#17536) --- src/go/collectors/go.d.plugin/README.md | 4 +- src/go/collectors/go.d.plugin/config/go.d.conf | 1 + .../go.d.plugin/config/go.d/smartctl.conf | 5 + src/go/collectors/go.d.plugin/go.mod | 3 + src/go/collectors/go.d.plugin/go.sum | 6 + src/go/collectors/go.d.plugin/modules/init.go | 1 + .../go.d.plugin/modules/smartctl/charts.go | 237 ++++++++ .../go.d.plugin/modules/smartctl/collect.go | 189 ++++++ .../modules/smartctl/config_schema.json | 67 +++ .../go.d.plugin/modules/smartctl/exec.go | 82 +++ .../go.d.plugin/modules/smartctl/init.go | 32 + .../go.d.plugin/modules/smartctl/metadata.yaml | 154 +++++ .../go.d.plugin/modules/smartctl/scan.go | 68 +++ .../go.d.plugin/modules/smartctl/smart_device.go | 123 ++++ .../go.d.plugin/modules/smartctl/smartctl.go | 126 ++++ .../go.d.plugin/modules/smartctl/smartctl_test.go | 436 ++++++++++++++ .../modules/smartctl/testdata/config.json | 7 + .../modules/smartctl/testdata/config.yaml | 5 + .../smartctl/testdata/type-nvme/device-nvme0.json | 112 ++++ .../modules/smartctl/testdata/type-nvme/scan.json | 29 + .../smartctl/testdata/type-sat/device-hdd-sda.json | 601 +++++++++++++++++++ .../smartctl/testdata/type-sat/device-ssd-sdc.json | 652 +++++++++++++++++++++ .../modules/smartctl/testdata/type-sat/scan.json | 35 ++ 23 files changed, 2973 insertions(+), 2 deletions(-) create mode 100644 src/go/collectors/go.d.plugin/config/go.d/smartctl.conf create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/charts.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/collect.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/exec.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/init.go create mode 100644 
src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/scan.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/smart_device.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/smartctl.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/smartctl_test.go create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.yaml create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/device-nvme0.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/scan.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-hdd-sda.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-ssd-sdc.json create mode 100644 src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/scan.json diff --git a/src/go/collectors/go.d.plugin/README.md b/src/go/collectors/go.d.plugin/README.md index de64966164..4d0718fc0c 100644 --- a/src/go/collectors/go.d.plugin/README.md +++ b/src/go/collectors/go.d.plugin/README.md @@ -118,6 +118,7 @@ see the appropriate collector readme. 
| [sensors](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules) | Hardware Sensors | | [SNMP](https://github.com/netdata/netdata/blob/master/src/go/collectors/go.d.plugin/modules/snmp) | SNMP | | [squidlog](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/squidlog) | Squid | +| [smartctl](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl) | S.M.A.R.T Storage Devices | | [storcli](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/storcli) | Broadcom Hardware RAID | | [supervisord](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/supervisord) | Supervisor | | [systemdunits](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/systemdunits) | Systemd unit state | @@ -213,8 +214,7 @@ sudo su -s /bin/bash netdata ./go.d.plugin -d -m <module name> ``` -Change `<module name>` to the module name you want to debug. See the [whole list](#available-modules) of available -modules. +Change `<module name>` to the [module name](#available-modules) you want to debug. 
## Netdata Community diff --git a/src/go/collectors/go.d.plugin/config/go.d.conf b/src/go/collectors/go.d.plugin/config/go.d.conf index 4d143b1cf9..34abed37f4 100644 --- a/src/go/collectors/go.d.plugin/config/go.d.conf +++ b/src/go/collectors/go.d.plugin/config/go.d.conf @@ -80,6 +80,7 @@ modules: # sensors: yes # snmp: yes # squidlog: yes +# smartctl: yes # storcli: yes # supervisord: yes # systemdunits: yes diff --git a/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf new file mode 100644 index 0000000000..dea5116be9 --- /dev/null +++ b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf @@ -0,0 +1,5 @@ +## All available configuration options, their descriptions and default values: +## https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl#readme + +jobs: + - name: smartctl diff --git a/src/go/collectors/go.d.plugin/go.mod b/src/go/collectors/go.d.plugin/go.mod index e5885c23e3..558223f426 100644 --- a/src/go/collectors/go.d.plugin/go.mod +++ b/src/go/collectors/go.d.plugin/go.mod @@ -119,6 +119,9 @@ require ( github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cast v1.3.1 // indirect github.com/spf13/pflag v1.0.5 // indirect + github.com/tidwall/gjson v1.17.1 // indirect + github.com/tidwall/match v1.1.1 // indirect + github.com/tidwall/pretty v1.2.0 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect diff --git a/src/go/collectors/go.d.plugin/go.sum b/src/go/collectors/go.d.plugin/go.sum index 9bd073665e..b8146164a4 100644 --- a/src/go/collectors/go.d.plugin/go.sum +++ b/src/go/collectors/go.d.plugin/go.sum @@ -332,6 +332,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.9.0 
h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tidwall/gjson v1.17.1 h1:wlYEnwqAHgzmhNUFfw7Xalt2JzQvsMx2Se4PcoFCT/U= +github.com/tidwall/gjson v1.17.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19 h1:ZCmSnT6CLGhfoQ2lPEhL4nsJstKDCw1F1RfN8/smTCU= github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19/go.mod h1:SXTY+QvI+KTTKXQdg0zZ7nx0u94QWh8ZAwBQYsW9cqk= github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ= diff --git a/src/go/collectors/go.d.plugin/modules/init.go b/src/go/collectors/go.d.plugin/modules/init.go index b0f1f9e838..c8262096a6 100644 --- a/src/go/collectors/go.d.plugin/modules/init.go +++ b/src/go/collectors/go.d.plugin/modules/init.go @@ -70,6 +70,7 @@ import ( _ "github.com/netdata/netdata/go/go.d.plugin/modules/redis" _ "github.com/netdata/netdata/go/go.d.plugin/modules/scaleio" _ "github.com/netdata/netdata/go/go.d.plugin/modules/sensors" + _ "github.com/netdata/netdata/go/go.d.plugin/modules/smartctl" _ "github.com/netdata/netdata/go/go.d.plugin/modules/snmp" _ "github.com/netdata/netdata/go/go.d.plugin/modules/squidlog" _ "github.com/netdata/netdata/go/go.d.plugin/modules/storcli" diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/charts.go b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go new file mode 100644 index 0000000000..7ad4ea4c40 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + 
+package smartctl
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/netdata/netdata/go/go.d.plugin/agent/module"
+)
+
+// Chart priorities, assigned sequentially starting at module.Priority.
+const (
+	prioDeviceSmartStatus = module.Priority + iota
+	prioDeviceAtaSmartErrorLogCount
+	prioDevicePowerOnTime
+	prioDeviceTemperature
+	prioDevicePowerCycleCount
+
+	prioDeviceSmartAttributeDecoded
+	prioDeviceSmartAttributeNormalized
+)
+
+// deviceChartsTmpl is the set of generic per-device chart templates.
+// The "%s" placeholders in chart and dimension IDs are filled with the
+// device name and the device type by newDeviceCharts.
+var deviceChartsTmpl = module.Charts{
+	devicePowerOnTimeChartTmpl.Copy(),
+	deviceTemperatureChartTmpl.Copy(),
+	devicePowerCycleCountChartTmpl.Copy(),
+	deviceSmartStatusChartTmpl.Copy(),
+	deviceAtaSmartErrorLogCountChartTmpl.Copy(),
+}
+
+var (
+	deviceSmartStatusChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_smart_status",
+		Title:    "Device smart status",
+		Units:    "status",
+		Fam:      "smart status",
+		Ctx:      "smartctl.device_smart_status",
+		Type:     module.Line,
+		Priority: prioDeviceSmartStatus,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_smart_status_passed", Name: "passed"},
+			{ID: "device_%s_type_%s_smart_status_failed", Name: "failed"},
+		},
+	}
+	deviceAtaSmartErrorLogCountChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_ata_smart_error_log_count",
+		Title:    "Device ATA smart error log count",
+		Units:    "logs",
+		Fam:      "smart error log",
+		Ctx:      "smartctl.device_ata_smart_error_log_count",
+		Type:     module.Line,
+		Priority: prioDeviceAtaSmartErrorLogCount,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_ata_smart_error_log_summary_count", Name: "error_log"},
+		},
+	}
+	devicePowerOnTimeChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_power_on_time",
+		Title:    "Device power on time",
+		Units:    "seconds",
+		Fam:      "power on time",
+		Ctx:      "smartctl.device_power_on_time",
+		Type:     module.Line,
+		Priority: prioDevicePowerOnTime,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_power_on_time", Name: "power_on_time"},
+		},
+	}
+	deviceTemperatureChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_temperature",
+		Title:    "Device temperature",
+		Units:    "Celsius",
+		Fam:      "temperature",
+		Ctx:      "smartctl.device_temperature",
+		Type:     module.Line,
+		Priority: prioDeviceTemperature,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_temperature", Name: "temperature"},
+		},
+	}
+	devicePowerCycleCountChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_power_cycle_count",
+		Title:    "Device power cycles",
+		Units:    "cycles",
+		Fam:      "power cycles",
+		Ctx:      "smartctl.device_power_cycles_count",
+		Type:     module.Line,
+		Priority: prioDevicePowerCycleCount,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_power_cycle_count", Name: "power"},
+		},
+	}
+)
+
+// Templates for the per-attribute charts. Besides the device name and type,
+// they take the cleaned attribute name as a third placeholder.
+var (
+	deviceSmartAttributeDecodedChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_smart_attr_%s",
+		Title:    "Device smart attribute %s",
+		Units:    "value",
+		Fam:      "attr %s",
+		Ctx:      "smartctl.device_smart_attr_%s",
+		Type:     module.Line,
+		Priority: prioDeviceSmartAttributeDecoded,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_attr_%s_decoded", Name: "%s"},
+		},
+	}
+	deviceSmartAttributeNormalizedChartTmpl = module.Chart{
+		ID:       "device_%s_type_%s_smart_attr_%s_normalized",
+		Title:    "Device smart attribute normalized %s",
+		Units:    "value",
+		Fam:      "attr %s",
+		Ctx:      "smartctl.device_smart_attr_%s_normalized",
+		Type:     module.Line,
+		Priority: prioDeviceSmartAttributeNormalized,
+		Dims: module.Dims{
+			{ID: "device_%s_type_%s_attr_%s_normalized", Name: "%s"},
+		},
+	}
+)
+
+// addDeviceCharts builds the generic and the S.M.A.R.T. attribute charts for
+// dev and adds them to the module charts. Failures are logged as warnings.
+func (s *Smartctl) addDeviceCharts(dev *smartDevice) {
+	charts := module.Charts{}
+
+	if cs := s.newDeviceCharts(dev); cs != nil && len(*cs) > 0 {
+		if err := charts.Add(*cs...); err != nil {
+			s.Warning(err)
+		}
+	}
+	if cs := s.newDeviceSmartAttrCharts(dev); cs != nil && len(*cs) > 0 {
+		if err := charts.Add(*cs...); err != nil {
+			s.Warning(err)
+		}
+	}
+
+	if err := s.Charts().Add(charts...); err != nil {
+		s.Warning(err)
+	}
+}
+
+// removeDeviceCharts marks every chart that belongs to scanDev for removal.
+func (s *Smartctl) removeDeviceCharts(scanDev *scanDevice) {
+	// The prefix must follow the "device_%s_type_%s_" layout of the chart ID
+	// templates above, otherwise no chart ID ever matches.
+	px := fmt.Sprintf("device_%s_type_%s_", scanDev.shortName(), scanDev.typ)
+
+	for _, chart := range *s.Charts() {
+		if strings.HasPrefix(chart.ID, px) {
+			chart.MarkRemove()
+			chart.MarkNotCreated()
+		}
+	}
+}
+
+// newDeviceCharts returns a copy of the generic chart templates for dev.
+// Charts whose metric the device does not report are dropped; the IDs of
+// the remaining charts and dimensions are filled with the device name and type.
+func (s *Smartctl) newDeviceCharts(dev *smartDevice) *module.Charts {
+	charts := deviceChartsTmpl.Copy()
+
+	// Charts are still keyed by the template ID at this point.
+	if _, ok := dev.powerOnTime(); !ok {
+		_ = charts.Remove(devicePowerOnTimeChartTmpl.ID)
+	}
+	if _, ok := dev.temperature(); !ok {
+		_ = charts.Remove(deviceTemperatureChartTmpl.ID)
+	}
+	if _, ok := dev.powerCycleCount(); !ok {
+		_ = charts.Remove(devicePowerCycleCountChartTmpl.ID)
+	}
+	if _, ok := dev.smartStatusPassed(); !ok {
+		_ = charts.Remove(deviceSmartStatusChartTmpl.ID)
+	}
+	if _, ok := dev.ataSmartErrorLogCount(); !ok {
+		_ = charts.Remove(deviceAtaSmartErrorLogCountChartTmpl.ID)
+	}
+
+	for _, chart := range *charts {
+		chart.ID = fmt.Sprintf(chart.ID, dev.deviceName(), dev.deviceType())
+		chart.Labels = []module.Label{
+			{Key: "device_name", Value: dev.deviceName()},
+			{Key: "device_type", Value: dev.deviceType()},
+			{Key: "model_name", Value: dev.modelName()},
+			{Key: "serial_number", Value: dev.serialNumber()},
+		}
+		for _, dim := range chart.Dims {
+			dim.ID = fmt.Sprintf(dim.ID, dev.deviceName(), dev.deviceType())
+		}
+	}
+
+	return charts
+}
+
+// newDeviceSmartAttrCharts returns a "decoded" and a "normalized" chart for
+// every valid, known attribute of the ATA S.M.A.R.T. attribute table of dev.
+// It returns nil if the device has no attribute table.
+func (s *Smartctl) newDeviceSmartAttrCharts(dev *smartDevice) *module.Charts {
+	attrs, ok := dev.ataSmartAttributeTable()
+	if !ok {
+		return nil
+	}
+	charts := module.Charts{}
+
+	for _, attr := range attrs {
+		if !isSmartAttrValid(attr) || strings.HasPrefix(attr.name(), "Unknown") {
+			continue
+		}
+
+		cs := module.Charts{
+			deviceSmartAttributeDecodedChartTmpl.Copy(),
+			deviceSmartAttributeNormalizedChartTmpl.Copy(),
+		}
+
+		name := cleanAttributeName(attr)
+
+		// FIXME: attribute charts unit
+		for _, chart := range cs {
+			chart.ID = fmt.Sprintf(chart.ID, dev.deviceName(), dev.deviceType(), name)
+			chart.Title = fmt.Sprintf(chart.Title, attr.name())
+			chart.Fam = fmt.Sprintf(chart.Fam, name)
+			chart.Ctx = fmt.Sprintf(chart.Ctx, name)
+			chart.Labels = []module.Label{
+				{Key: "device_name", Value: dev.deviceName()},
+				{Key: "device_type", Value: dev.deviceType()},
+				{Key: "model_name", Value: dev.modelName()},
+				{Key: "serial_number", Value: dev.serialNumber()},
+			}
+			for _, dim := range chart.Dims {
+				dim.ID = fmt.Sprintf(dim.ID, dev.deviceName(), dev.deviceType(), name)
+				dim.Name = fmt.Sprintf(dim.Name, name)
+			}
+		}
+
+		if err := charts.Add(cs...); err != nil {
+			s.Warning(err)
+		}
+	}
+
+	return &charts
+}
+
+var attrNameReplacer = strings.NewReplacer(" ", "_", "/", "_")
+
+// cleanAttributeName returns the attribute name lower-cased, with spaces and
+// slashes replaced by underscores. Chart IDs, dimension IDs and metric keys
+// must all be built from this value.
+func cleanAttributeName(attr *smartAttribute) string {
+	return strings.ToLower(attrNameReplacer.Replace(attr.name()))
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/collect.go b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go
new file mode 100644
index 0000000000..79cbb13d02
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	"fmt"
+	"maps"
+	"slices"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/tidwall/gjson"
+)
+
+// collect returns the current metrics.
+//
+// Devices are re-scanned when a scan is forced or the scan interval has
+// elapsed, and are polled when a poll is forced or the poll interval has
+// elapsed. Between polls the metrics of the last poll are returned.
+func (s *Smartctl) collect() (map[string]int64, error) {
+	now := time.Now()
+
+	if s.forceScan || s.isTimeToScan(now) {
+		devices, err := s.scanDevices()
+		if err != nil {
+			return nil, err
+		}
+
+		// Compare the devices by value: every scan allocates new *scanDevice
+		// pointers, so comparing the pointers would always report a change.
+		// Evaluated before the cleanup below so that a removed device counts
+		// as a change too.
+		changed := !maps.EqualFunc(s.scannedDevices, devices, func(a, b *scanDevice) bool {
+			return *a == *b
+		})
+
+		for k, dev := range s.scannedDevices {
+			if _, ok := devices[k]; !ok {
+				delete(s.scannedDevices, k)
+				delete(s.seenDevices, k)
+				s.removeDeviceCharts(dev)
+			}
+		}
+
+		s.forceDevicePoll = changed
+		s.scannedDevices = devices
+		s.lastScanTime = now
+		s.forceScan = false
+	}
+
+	if s.forceDevicePoll || s.isTimeToPollDevices(now) {
+		mx := make(map[string]int64)
+
+		// TODO: make it concurrent
+		for _, d := range s.scannedDevices {
+			if err := s.collectScannedDevice(mx, d); err != nil {
+				return nil, err
+			}
+		}
+
+		s.forceDevicePoll = false
+		s.lastDevicePollTime = now
+		s.mx = mx
+	}
+
+	return s.mx, nil
+}
+
+// collectScannedDevice queries smartctl for scanDev and stores its metrics in mx.
+//
+// A device that no longer exists forces a re-scan and is skipped. A device in
+// a low-power mode and a device without a name or type are skipped as well.
+// Any other smartctl failure is returned as an error.
+func (s *Smartctl) collectScannedDevice(mx map[string]int64, scanDev *scanDevice) error {
+	resp, err := s.exec.deviceInfo(scanDev.name, scanDev.typ, s.NoCheckPowerMode)
+	if err != nil {
+		if resp != nil && isDeviceOpenFailedNoSuchDevice(resp) {
+			s.Infof("smartctl reported that device '%s' type '%s' no longer exists", scanDev.name, scanDev.typ)
+			s.forceScan = true
+			return nil
+		}
+		return fmt.Errorf("failed to get device info for '%s' type '%s': %v", scanDev.name, scanDev.typ, err)
+	}
+
+	if isDeviceInLowerPowerMode(resp) {
+		s.Debugf("device '%s' type '%s' is in a low-power mode, skipping", scanDev.name, scanDev.typ)
+		return nil
+	}
+
+	dev := newSmartDevice(resp)
+	if !isSmartDeviceValid(dev) {
+		return nil
+	}
+
+	// Charts are created once, the first time the device is successfully read.
+	if !s.seenDevices[scanDev.key()] {
+		s.seenDevices[scanDev.key()] = true
+		s.addDeviceCharts(dev)
+	}
+
+	s.collectSmartDevice(mx, dev)
+
+	return nil
+}
+
+// collectSmartDevice stores the metrics of dev in mx. The keys correspond to
+// the dimension IDs of the charts created for the device.
+func (s *Smartctl) collectSmartDevice(mx map[string]int64, dev *smartDevice) {
+	px := fmt.Sprintf("device_%s_type_%s_", dev.deviceName(), dev.deviceType())
+
+	if v, ok := dev.powerOnTime(); ok {
+		mx[px+"power_on_time"] = v
+	}
+	if v, ok := dev.temperature(); ok {
+		mx[px+"temperature"] = v
+	}
+	if v, ok := dev.powerCycleCount(); ok {
+		mx[px+"power_cycle_count"] = v
+	}
+	if v, ok := dev.smartStatusPassed(); ok {
+		mx[px+"smart_status_passed"] = 0
+		mx[px+"smart_status_failed"] = 0
+		if v {
+			mx[px+"smart_status_passed"] = 1
+		} else {
+			mx[px+"smart_status_failed"] = 1
+		}
+	}
+	if v, ok := dev.ataSmartErrorLogCount(); ok {
+		mx[px+"ata_smart_error_log_summary_count"] = v
+	}
+
+	attrs, ok := dev.ataSmartAttributeTable()
+	if !ok {
+		return
+	}
+
+	for _, attr := range attrs {
+		if !isSmartAttrValid(attr) {
+			continue
+		}
+
+		// Use the same name normalization as the charts, so that the keys
+		// match the dimension IDs (e.g. for names containing '/').
+		attrPx := fmt.Sprintf("%sattr_%s_", px, cleanAttributeName(attr))
+
+		if v, err := strconv.ParseInt(attr.value(), 10, 64); err == nil {
+			mx[attrPx+"normalized"] = v
+		}
+
+		if v, err := strconv.ParseInt(attr.rawValue(), 10, 64); err == nil {
+			mx[attrPx+"raw"] = v
+		}
+
+		// The decoded value is the first space-separated token of the raw string.
+		rs := strings.TrimSpace(attr.rawString())
+		if i := strings.IndexByte(rs, ' '); i != -1 {
+			rs = rs[:i]
+		}
+		if v, err := strconv.ParseInt(rs, 10, 64); err == nil {
+			mx[attrPx+"decoded"] = v
+		}
+	}
+}
+
+// isTimeToScan reports whether the scan interval has elapsed since the last scan.
+func (s *Smartctl) isTimeToScan(now time.Time) bool {
+	return now.After(s.lastScanTime.Add(s.ScanEvery.Duration()))
+}
+
+// isTimeToPollDevices reports whether the poll interval has elapsed since the last poll.
+func (s *Smartctl) isTimeToPollDevices(now time.Time) bool {
+	return now.After(s.lastDevicePollTime.Add(s.PollDevicesEvery.Duration()))
+}
+
+// isSmartDeviceValid reports whether the device data has both a name and a type.
+func isSmartDeviceValid(d *smartDevice) bool {
+	return d.deviceName() != "" && d.deviceType() != ""
+}
+
+// isSmartAttrValid reports whether the attribute has both an ID and a name.
+func isSmartAttrValid(a *smartAttribute) bool {
+	return a.id() != "" && a.name() != ""
+}
+
+// isDeviceInLowerPowerMode reports whether exit status bit 1 is set and
+// smartctl printed a "Device is in ... mode" message.
+func isDeviceInLowerPowerMode(r *gjson.Result) bool {
+	if !isExitStatusHasBit(r, 1) {
+		return false
+	}
+
+	messages := r.Get("smartctl.messages").Array()
+
+	return slices.ContainsFunc(messages, func(msg gjson.Result) bool {
+		text := msg.Get("string").String()
+		return strings.HasPrefix(text, "Device is in") && strings.Contains(text, "mode")
+	})
+}
+
+// isDeviceOpenFailedNoSuchDevice reports whether exit status bit 1 is set and
+// smartctl printed a message ending with "No such device".
+func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool {
+	if !isExitStatusHasBit(r, 1) {
+		return false
+	}
+
+	messages := r.Get("smartctl.messages").Array()
+
+	return slices.ContainsFunc(messages, func(msg gjson.Result) bool {
+		text := msg.Get("string").String()
+		return strings.HasSuffix(text, "No such device")
+	})
+}
+
+// isExitStatusHasBit reports whether the given bit (0-based) is set in the
+// "smartctl.exit_status" value of the response.
+func isExitStatusHasBit(r *gjson.Result, bit int) bool {
+	// https://manpages.debian.org/bullseye/smartmontools/smartctl.8.en.html#EXIT_STATUS
+	status := int(r.Get("smartctl.exit_status").Int())
+	mask := 1 << bit
+	return (status & mask) != 0
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json
new file mode 100644
index 0000000000..273899cd98
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json
@@ -0,0 +1,67 @@
+{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Smartctl collector configuration.", + "type": "object", + 
"properties": { + "update_every": { + "title": "Update every", + "description": "Interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**.", + "type": "integer", + "minimum": 1, + "default": 10 + }, + "timeout": { + "title": "Timeout", + "description": "Timeout for executing the `smartctl` binary, specified in seconds.", + "type": "number", + "minimum": 0.5, + "default": 5 + }, + "scan_every": { + "title": "Scan interval", + "description": "Interval for discovering new devices using `smartctl --scan`, measured in seconds.", + "type": "number", + "minimum": 1, + "default": 900 + }, + "poll_devices_every": { + "title": "Devices poll interval", + "description": "Interval for gathering data for every device, measured in seconds. Data is cached for this interval.", + "type": "number", + "minimum": 1, + "default": 300 + }, + "no_check_power_mode": { + "title": "No check power mode", + "description": "ATA only. Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up.", + "type": "string", + "enum": [ + "standby", + "never", + "sleep", + "idle" + ], + "default": "standby" + } + }, + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "timeout": { + "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." 
+ }, + "no_check_power_mode": { + "ui:widget": "radio", + "ui:options": { + "inline": true + } + } + } +}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/exec.go b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go
new file mode 100644
index 0000000000..a90e1b529b
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os/exec"
+	"time"
+
+	"github.com/netdata/netdata/go/go.d.plugin/logger"
+
+	"github.com/tidwall/gjson"
+)
+
+// newSmartctlCliExec returns an executor that runs smartctl commands via the
+// ndsudo binary at ndsudoPath, limiting every invocation to timeout.
+func newSmartctlCliExec(ndsudoPath string, timeout time.Duration, log *logger.Logger) *smartctlCliExec {
+	return &smartctlCliExec{
+		Logger:     log,
+		ndsudoPath: ndsudoPath,
+		timeout:    timeout,
+	}
+}
+
+// smartctlCliExec runs ndsudo sub-commands and parses their JSON output.
+type smartctlCliExec struct {
+	*logger.Logger
+
+	ndsudoPath string
+	timeout    time.Duration
+}
+
+// scan runs the "smartctl-json-scan" ndsudo command.
+func (e *smartctlCliExec) scan() (*gjson.Result, error) {
+	return e.execute("smartctl-json-scan")
+}
+
+// deviceInfo runs the "smartctl-json-device-info" ndsudo command for the
+// given device name, device type and power mode.
+func (e *smartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) {
+	return e.execute("smartctl-json-device-info",
+		"--deviceName", deviceName,
+		"--deviceType", deviceType,
+		"--powerMode", powerMode,
+	)
+}
+
+// execute runs ndsudo with args and returns the parsed JSON output.
+//
+// It returns a nil result and an error if the command times out, exits with
+// code 1, prints nothing, prints invalid JSON, or prints JSON without
+// "smartctl.exit_status". If the output contains a message with severity
+// "error", it returns BOTH the parsed result and an error, so that callers
+// can inspect the exit status and the messages.
+func (e *smartctlCliExec) execute(args ...string) (*gjson.Result, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, e.ndsudoPath, args...)
+	e.Debugf("executing '%s'", cmd)
+
+	bs, err := cmd.Output()
+	if err != nil {
+		// A process killed because of the context deadline is normally
+		// reported as *exec.ExitError, not as context.DeadlineExceeded,
+		// so the context itself is checked too.
+		// Other non-zero exit codes are tolerated as long as there is output
+		// (presumably smartctl's bit-mask exit status; the JSON is validated below).
+		timedOut := ctx.Err() != nil || errors.Is(err, context.DeadlineExceeded)
+		if timedOut || isExecExitCode(err, 1) || len(bs) == 0 {
+			return nil, fmt.Errorf("'%s' execution failed: %v", cmd, err)
+		}
+	}
+	if len(bs) == 0 {
+		return nil, fmt.Errorf("'%s' returned no output", cmd)
+	}
+
+	if !gjson.ValidBytes(bs) {
+		return nil, fmt.Errorf("'%s' returned invalid JSON output", cmd)
+	}
+
+	res := gjson.ParseBytes(bs)
+	if !res.Get("smartctl.exit_status").Exists() {
+		return nil, fmt.Errorf("'%s' returned unexpected data", cmd)
+	}
+
+	for _, msg := range res.Get("smartctl.messages").Array() {
+		if msg.Get("severity").String() == "error" {
+			return &res, fmt.Errorf("'%s' reported an error: %s", cmd, msg.Get("string"))
+		}
+	}
+
+	return &res, nil
+}
+
+// isExecExitCode reports whether err is an *exec.ExitError with the given exit code.
+func isExecExitCode(err error, exitCode int) bool {
+	var v *exec.ExitError
+	return errors.As(err, &v) && v.ExitCode() == exitCode
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/init.go b/src/go/collectors/go.d.plugin/modules/smartctl/init.go
new file mode 100644
index 0000000000..5c6ede5316
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/init.go
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/netdata/netdata/go/go.d.plugin/agent/executable"
+)
+
+// validateConfig checks that NoCheckPowerMode is one of
+// "never", "sleep", "standby" or "idle".
+func (s *Smartctl) validateConfig() error {
+	switch s.NoCheckPowerMode {
+	case "never", "sleep", "standby", "idle":
+	default:
+		return fmt.Errorf("invalid power mode '%s'", s.NoCheckPowerMode)
+	}
+	return nil
+}
+
+// initSmartctlCli returns a smartctl executor that uses the ndsudo binary
+// located in the executable directory. It fails if ndsudo cannot be stat'ed.
+func (s *Smartctl) initSmartctlCli() (smartctlCli, error) {
+	ndsudoPath := filepath.Join(executable.Directory, "ndsudo")
+	if _, err := os.Stat(ndsudoPath); err != nil {
+		return nil, fmt.Errorf("ndsudo executable not found: %v", err)
+	}
+
+	smartctlExec := newSmartctlCliExec(ndsudoPath, s.Timeout.Duration(), s.Logger)
+
+	return smartctlExec, nil
+}
diff --git 
a/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml new file mode 100644 index 0000000000..0ef0843f99 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml @@ -0,0 +1,154 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-smartctl + plugin_name: go.d.plugin + module_name: smartctl + monitored_instance: + name: S.M.A.R.T. + link: "https://linux.die.net/man/8/smartd" + icon_filename: "smart.png" + categories: + - data-collection.hardware-devices-and-sensors + keywords: + - smart + - S.M.A.R.T. + - SCSI devices + - ATA devices + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: | + This collector monitors the health status of storage devices by analyzing S.M.A.R.T. (Self-Monitoring, Analysis, and Reporting Technology) counters. + It relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary. + Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. + This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management. + + Executed commands: + - `smartctl --json --scan` + - `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}` + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: false + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: [] + configuration: + file: + name: go.d/smartctl.conf + options: + description: | + The following options can be defined globally: update_every. 
+ folding: + title: Config options + enabled: true + list: + - name: update_every + description: interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**. + default_value: 10 + required: false + - name: timeout + description: smartctl binary execution timeout. + default_value: 5 + required: false + - name: scan_every + description: interval for discovering new devices using `smartctl --scan`, measured in seconds. + default_value: 900 + required: false + - name: poll_devices_every + description: interval for gathering data for every device, measured in seconds. Data is cached for this interval. + default_value: 300 + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Custom devices poll interval + description: Allows you to override the default devices poll interval (data collection). + config: | + jobs: + - name: smartctl + poll_devices_every: 60 # Collect S.M.A.R.T statistics every 60 seconds + troubleshooting: + problems: + list: [] + alerts: [] + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: controller + description: These metrics refer to the Storage Device. 
+ labels: + - name: device_name + description: Device name + - name: device_type + description: Device type + - name: model_name + description: Model name + - name: serial_number + description: Serial number + metrics: + - name: smartctl.device_smart_status + description: Device smart status + unit: status + chart_type: line + dimensions: + - name: passed + - name: failed + - name: smartctl.device_ata_smart_error_log_count + description: Device ATA smart error log count + unit: logs + chart_type: line + dimensions: + - name: error_log + - name: smartctl.device_power_on_time + description: Device power on time + unit: seconds + chart_type: line + dimensions: + - name: power_on_time + - name: smartctl.device_temperature + description: Device temperature + unit: Celsius + chart_type: line + dimensions: + - name: temperature + - name: smartctl.device_power_cycles_count + description: Device power cycles + unit: cycles + chart_type: line + dimensions: + - name: power + - name: smartctl.device_smart_attr_{attribute_name} + description: Device smart attribute {attribute_name} + unit: '{attribute_unit}' + chart_type: line + dimensions: + - name: '{attribute_name}' + - name: smartctl.device_smart_attr_{attribute_name}_normalized + description: Device smart attribute {attribute_name} normalized + unit: value + chart_type: line + dimensions: + - name: '{attribute_name}'
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/scan.go b/src/go/collectors/go.d.plugin/modules/smartctl/scan.go
new file mode 100644
index 0000000000..cafdc8b60d
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/scan.go
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+)
+
+// scanDevice is a device entry reported by the smartctl scan.
+type scanDevice struct {
+	name     string
+	infoName string
+	typ      string
+}
+
+// key returns the "name|type" identifier of the device.
+func (s *scanDevice) key() string {
+	return fmt.Sprintf("%s|%s", s.name, s.typ)
+}
+
+// shortName returns the device name without the "/dev/" prefix.
+func (s *scanDevice) shortName() string {
+	return strings.TrimPrefix(s.name, "/dev/")
+}
+
+// scanDevices runs the smartctl scan and returns the found devices keyed by
+// scanDevice.key(). Entries without a name or type are skipped.
+// It returns an error if the scan fails or finds no devices.
+func (s *Smartctl) scanDevices() (map[string]*scanDevice, error) {
+	resp, err := s.exec.scan()
+	if err != nil {
+		return nil, err
+	}
+
+	devices := make(map[string]*scanDevice)
+
+	for _, d := range resp.Get("devices").Array() {
+		dev := &scanDevice{
+			name:     d.Get("name").String(),
+			infoName: d.Get("info_name").String(),
+			typ:      d.Get("type").String(),
+		}
+
+		if dev.name == "" || dev.typ == "" {
+			continue
+		}
+
+		if dev.typ == "scsi" {
+			// `smartctl --scan` attempts to guess the device type based on the path, but this can be unreliable.
+			// Accurate device type information is crucial because we use the `--device` option to gather data.
+			// Using the wrong type can lead to issues.
+			// For example, using 'scsi' for 'sat' devices prevents `smartctl` from issuing the necessary ATA commands.
+			//
+			// The error is deliberately ignored: only the exit status of the
+			// probe (bit 2) is of interest here.
+			devResp, _ := s.exec.deviceInfo(dev.name, dev.typ, s.NoCheckPowerMode)
+			if devResp != nil && isExitStatusHasBit(devResp, 2) {
+				newType := "sat"
+				s.Debugf("changing device '%s' type '%s' -> '%s'", dev.name, dev.typ, newType)
+				dev.typ = newType
+			}
+		}
+
+		// Logged for every device, not only for the 'scsi' ones.
+		s.Debugf("smartctl scan found device '%s' type '%s' info_name '%s'", dev.name, dev.typ, dev.infoName)
+
+		devices[dev.key()] = dev
+	}
+
+	if len(devices) == 0 {
+		return nil, errors.New("no devices found on scan")
+	}
+
+	s.Infof("smartctl scan found %d devices", len(devices))
+
+	return devices, nil
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/smart_device.go b/src/go/collectors/go.d.plugin/modules/smartctl/smart_device.go
new file mode 100644
index 0000000000..bc1955330c
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/smart_device.go
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	"strings"
+
+	"github.com/tidwall/gjson"
+)
+
+// newSmartDevice wraps the parsed smartctl device info JSON.
+func newSmartDevice(deviceData *gjson.Result) *smartDevice {
+	return &smartDevice{
+		data: deviceData,
+	}
+}
+
+// smartDevice provides typed accessors to the smartctl device info JSON.
+// Accessors returning (value, bool) report in the second result whether
+// the underlying JSON field exists.
+type smartDevice struct {
+	data *gjson.Result
+}
+
+// deviceName returns "device.name" without the "/dev/" prefix.
+func (d *smartDevice) deviceName() string {
+	v := d.data.Get("device.name").String()
+	return strings.TrimPrefix(v, "/dev/")
+}
+
+// deviceType returns "device.type".
+func (d *smartDevice) deviceType() string {
+	return d.data.Get("device.type").String()
+}
+
+// deviceProtocol returns "device.protocol".
+func (d *smartDevice) deviceProtocol() string {
+	return d.data.Get("device.protocol").String()
+}
+
+// serialNumber returns "serial_number".
+func (d *smartDevice) serialNumber() string {
+	return d.data.Get("serial_number").String()
+}
+
+// modelName returns "model_name", falling back to "scsi_model_name",
+// or "unknown" if neither field exists.
+func (d *smartDevice) modelName() string {
+	for _, s := range []string{"model_name", "scsi_model_name"} {
+		if v := d.data.Get(s); v.Exists() {
+			return v.String()
+		}
+	}
+	return "unknown"
+}
+
+// powerOnTime returns the power-on time in seconds, computed from
+// "power_on_time.hours" and "power_on_time.minutes".
+// A missing minutes field counts as zero.
+func (d *smartDevice) powerOnTime() (int64, bool) {
+	h := d.data.Get("power_on_time.hours")
+	if !h.Exists() {
+		return 0, false
+	}
+	m := d.data.Get("power_on_time.minutes")
+	return h.Int()*60*60 + m.Int()*60, true
+}
+
+// temperature returns "temperature.current".
+func (d *smartDevice) temperature() (int64, bool) {
+	v := d.data.Get("temperature.current")
+	return v.Int(), v.Exists()
+}
+
+// powerCycleCount returns "power_cycle_count", falling back to
+// "scsi_start_stop_cycle_counter.accumulated_start_stop_cycles".
+func (d *smartDevice) powerCycleCount() (int64, bool) {
+	v := d.data.Get("power_cycle_count")
+	if v.Exists() {
+		return v.Int(), true
+	}
+	v = d.data.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles")
+	return v.Int(), v.Exists()
+}
+
+// smartStatusPassed returns "smart_status.passed".
+func (d *smartDevice) smartStatusPassed() (bool, bool) {
+	v := d.data.Get("smart_status.passed")
+	return v.Bool(), v.Exists()
+}
+
+// ataSmartErrorLogCount returns "ata_smart_error_log.summary.count".
+func (d *smartDevice) ataSmartErrorLogCount() (int64, bool) {
+	v := d.data.Get("ata_smart_error_log.summary.count")
+	return v.Int(), v.Exists()
+}
+
+// ataSmartAttributeTable returns the entries of "ata_smart_attributes.table".
+// The second result is false if the field is missing or is not an array.
+func (d *smartDevice) ataSmartAttributeTable() ([]*smartAttribute, bool) {
+	table := d.data.Get("ata_smart_attributes.table")
+	if !table.Exists() || !table.IsArray() {
+		return nil, false
+	}
+
+	var attrs []*smartAttribute
+
+	for _, data := range table.Array() {
+		attrs = append(attrs, newSmartDeviceAttribute(data))
+	}
+
+	return attrs, true
+}
+
+// newSmartDeviceAttribute wraps one entry of the S.M.A.R.T. attribute table.
+func newSmartDeviceAttribute(attrData gjson.Result) *smartAttribute {
+	return &smartAttribute{
+		data: attrData,
+	}
+}
+
+// smartAttribute provides accessors to one S.M.A.R.T. attribute table entry.
+// All accessors return the JSON field as a string ("" if it is missing).
+type smartAttribute struct {
+	data gjson.Result
+}
+
+// id returns the "id" field.
+func (a *smartAttribute) id() string {
+	return a.data.Get("id").String()
+}
+
+// name returns the "name" field.
+func (a *smartAttribute) name() string {
+	return a.data.Get("name").String()
+}
+
+// value returns the "value" field.
+func (a *smartAttribute) value() string {
+	return a.data.Get("value").String()
+}
+
+// rawValue returns the "raw.value" field.
+func (a *smartAttribute) rawValue() string {
+	return a.data.Get("raw.value").String()
+}
+
+// rawString returns the "raw.string" field.
+func (a *smartAttribute) rawString() string {
+	return a.data.Get("raw.string").String()
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/smartctl.go b/src/go/collectors/go.d.plugin/modules/smartctl/smartctl.go
new file mode 100644
index 0000000000..1e1a09c3d1
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/smartctl.go
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+	_ "embed"
+	"errors"
+	"time"
+
+	"github.com/netdata/netdata/go/go.d.plugin/agent/module"
+	"github.com/netdata/netdata/go/go.d.plugin/pkg/web"
+
+	"github.com/tidwall/gjson"
+)
+
+//go:embed "config_schema.json"
+var configSchema string
+
+// init registers the "smartctl" module with a default update interval of 10.
+func init() {
+	module.Register("smartctl", module.Creator{
+		JobConfigSchema: configSchema,
+		Defaults: module.Defaults{
+			UpdateEvery: 10,
+		},
+		Create: func() module.Module { return New() },
+	})
+}
+
+// New returns a Smartctl collector with the default configuration:
+// 5 seconds timeout, scan every 15 minutes, poll devices every 5 minutes,
+// power mode "standby".
+func New() *Smartctl {
+	return &Smartctl{
+		Config: Config{
+			Timeout:          web.Duration(time.Second * 5),
+			ScanEvery:        web.Duration(time.Minute * 15),
+			PollDevicesEvery: web.Duration(time.Minute * 5),
+			NoCheckPowerMode: "standby",
+		},
+		charts:      &module.Charts{},
+		seenDevices: make(map[string]bool),
+	}
+}
+
+// Config is the smartctl collector job configuration.
+type Config struct {
+	UpdateEvery      int          `yaml:"update_every" json:"update_every"`
+	Timeout          web.Duration `yaml:"timeout" json:"timeout"`
+	ScanEvery        web.Duration `yaml:"scan_every" json:"scan_every"`
+	PollDevicesEvery web.Duration `yaml:"poll_devices_every" json:"poll_devices_every"`
+	NoCheckPowerMode string       `yaml:"no_check_power_mode" json:"no_check_power_mode"`
+}
+
+type (
+	Smartctl
struct { + module.Base + Config `yaml:",inline" data:""` + + charts *module.Charts + + exec smartctlCli + + lastScanTime time.Time + forceScan bool + scannedDevices map[string]*scanDevice + + lastDevicePollTime time.Time + forceDevicePoll bool + + seenDevices map[string]bool + mx map[string]int64 + } + smartctlCli interface { + scan() (*gjson.Result, error) + deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) + } +) + +func (s *Smartctl) Configuration() any { + return s.Config +} + +func (s *Smartctl) Init() error { + if err := s.validateConfig(); err != nil { + s.Errorf("config validation error: %s", err) + return err + } + + smartctlExec, err := s.initSmartctlCli() + if err != nil { + s.Errorf("smartctl exec initialization: %v", err) + return err + } + s.exec = smartctlExec + + return nil +} + +func (s *Smartctl) Check() error { + mx, err := s.collect() + if err != nil { + s.Error(err) + return err + } + + if len(mx) == 0 { + return errors.New("no metrics collected") + } + + return nil +} + +func (s *Smartctl) Charts() *module.Charts { + return s.charts +} + +func (s *Smartctl) Collect() map[string]int64 { + mx, err := s.collect() + if err != nil { + s.Error(err) + } + + if len(mx) == 0 { + return nil + } + + return mx +} + +func (s *Smartctl) Cleanup() {} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/smartctl_test.go b/src/go/collectors/go.d.plugin/modules/smartctl/smartctl_test.go new file mode 100644 index 0000000000..9666924bfc --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/smartctl_test.go @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package smartctl + +import ( + "fmt" + "os" + "testing" + "time" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" + "github.com/netdata/netdata/go/go.d.plugin/pkg/web" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/tidwall/gjson" +) + +var ( + dataConfigJSON, _ = 
os.ReadFile("testdata/config.json") + dataConfigYAML, _ = os.ReadFile("testdata/config.yaml") + + dataTypeSataScan, _ = os.ReadFile("testdata/type-sat/scan.json") + dataTypeSataDeviceHDDSda, _ = os.ReadFile("testdata/type-sat/device-hdd-sda.json") + dataTypeSataDeviceSSDSdc, _ = os.ReadFile("testdata/type-sat/device-ssd-sdc.json") + + dataTypeNvmeScan, _ = os.ReadFile("testdata/type-nvme/scan.json") + dataTypeNvmeDeviceNvme0, _ = os.ReadFile("testdata/type-nvme/device-nvme0.json") +) + +func Test_testDataIsValid(t *testing.T) { + for name, data := range map[string][]byte{ + "dataConfigJSON": dataConfigJSON, + "dataConfigYAML": dataConfigYAML, + + "dataTypeSataScan": dataTypeSataScan, + "dataTypeSataDeviceHDDSda": dataTypeSataDeviceHDDSda, + "dataTypeSataDeviceSSDSdc": dataTypeSataDeviceSSDSdc, + + "dataTypeNvmeScan": dataTypeNvmeScan, + "dataTypeNvmeDeviceNvme0": dataTypeNvmeDeviceNvme0, + } { + require.NotNil(t, data, name) + } +} + +func TestSmartctl_ConfigurationSerialize(t *testing.T) { + module.TestConfigurationSerialize(t, &Smartctl{}, dataConfigJSON, dataConfigYAML) +} + +func TestSmartctl_Init(t *testing.T) { + tests := map[string]struct { + config Config + wantFail bool + }{ + "fails if invalid power mode": { + wantFail: true, + config: func() Config { + cfg := New().Config + cfg.NoCheckPowerMode = "invalid" + return cfg + }(), + }, + "fails if 'ndsudo' not found": { + wantFail: true, + config: New().Config, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + smart := New() + + if test.wantFail { + assert.Error(t, smart.Init()) + } else { + assert.NoError(t, smart.Init()) + } + }) + } +} + +func TestSmartctl_Cleanup(t *testing.T) { + tests := map[string]struct { + prepare func() *Smartctl + }{ + "not initialized exec": { + prepare: func() *Smartctl { + return New() + }, + }, + "after check": { + prepare: func() *Smartctl { + smart := New() + smart.exec = prepareMockOkTypeSata() + _ = smart.Check() + return smart + }, + }, + 
"after collect": { + prepare: func() *Smartctl { + smart := New() + smart.exec = prepareMockOkTypeSata() + _ = smart.Collect() + return smart + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + smart := test.prepare() + + assert.NotPanics(t, smart.Cleanup) + }) + } +} + +func TestSmartctl_Check(t *testing.T) { + tests := map[string]struct { + prepareMock func() *mockSmartctlCliExec + wantFail bool + }{ + "success type sata devices": { + wantFail: false, + prepareMock: prepareMockOkTypeSata, + }, + "success type nvme devices": { + wantFail: false, + prepareMock: prepareMockOkTypeNvme, + }, + "error on scan": { + wantFail: true, + prepareMock: prepareMockErrOnScan, + }, + "unexpected response on scan": { + wantFail: true, + prepareMock: prepareMockUnexpectedResponse, + }, + "empty response on scan": { + wantFail: true, + prepareMock: prepareMockEmptyResponse, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + smart := New() + mock := test.prepareMock() + smart.exec = mock + + if test.wantFail { + assert.Error(t, smart.Check()) + } else { + assert.NoError(t, smart.Check()) + } + }) + } +} + +func TestSmartctl_Collect(t *testing.T) { + tests := map[string]struct { + prepareMock func() *mockSmartctlCliExec + wantMetrics map[string]int64 + wantCharts int + }{ + "success type sata devices": { + prepareMock: prepareMockOkTypeSata, + wantCharts: 68, + wantMetrics: map[string]int64{ + "device_sda_type_sat_ata_smart_error_log_summary_count": 0, + "device_sda_type_sat_attr_current_pending_sector_decoded": 0, + "device_sda_type_sat_attr_current_pending_sector_normalized": 100, + "device_sda_type_sat_attr_current_pending_sector_raw": 0, + "device_sda_type_sat_attr_load_cycle_count_decoded": 360, + "device_sda_type_sat_attr_load_cycle_count_normalized": 100, + "device_sda_type_sat_attr_load_cycle_count_raw": 360, + "device_sda_type_sat_attr_offline_uncorrectable_decoded": 0, + 
"device_sda_type_sat_attr_offline_uncorrectable_normalized": 100, + "device_sda_type_sat_attr_offline_uncorrectable_raw": 0, + "device_sda_type_sat_attr_power-off_retract_count_decoded": 360, + "device_sda_type_sat_attr_power-off_retract_count_normalized": 100, + "device_sda_type_sat_attr_power-off_retract_count_raw": 360, + "device_sda_type_sat_attr_power_cycle_count_decoded": 12, + "device_sda_type_sat_attr_power_cycle_count_normalized": 100, + "device_sda_type_sat_attr_power_cycle_count_raw": 12, + "device_sda_type_sat_attr_power_on_hours_decoded": 8244, + "device_sda_type_sat_attr_power_on_hours_normalized": 99, + "device_sda_type_sat_attr_power_on_hours_raw": 8244, + "device_sda_type_sat_attr_raw_read_error_rate_decoded": 0, + "device_sda_type_sat_attr_raw_read_error_rate_normalized": 100, + "device_sda_type_sat_attr_raw_read_error_rate_raw": 0, + "device_sda_type_sat_attr_reallocated_event_count_decoded": 0, + "device_sda_type_sat_attr_reallocated_event_count_normalized": 100, + "device_sda_type_sat_attr_reallocated_event_count_raw": 0, + "device_sda_type_sat_attr_reallocated_sector_ct_decoded": 0, + "device_sda_type_sat_attr_reallocated_sector_ct_normalized": 100, + "device_sda_type_sat_attr_reallocated_sector_ct_raw": 0, + "device_sda_type_sat_attr_seek_error_rate_decoded": 0, + "device_sda_type_sat_attr_seek_error_rate_normalized": 100, + "device_sda_type_sat_attr_seek_error_rate_raw": 0, + "device_sda_type_sat_attr_seek_time_performance_decoded": 15, + "device_sda_type_sat_attr_seek_time_performance_normalized": 140, + "device_sda_type_sat_attr_seek_time_performance_raw": 15, + "device_sda_type_sat_attr_spin_retry_count_decoded": 0, + "device_sda_type_sat_attr_spin_retry_count_normalized": 100, + "device_sda_type_sat_attr_spin_retry_count_raw": 0, + "device_sda_type_sat_attr_spin_up_time_decoded": 281, + "device_sda_type_sat_attr_spin_up_time_normalized": 86, + "device_sda_type_sat_attr_spin_up_time_raw": 25788088601, + 
"device_sda_type_sat_attr_start_stop_count_decoded": 12, + "device_sda_type_sat_attr_start_stop_count_normalized": 100, + "device_sda_type_sat_attr_start_stop_count_raw": 12, + "device_sda_type_sat_attr_temperature_celsius_decoded": 49, + "device_sda_type_sat_attr_temperature_celsius_normalized": 43, + "device_sda_type_sat_attr_temperature_celsius_raw": 240519741489, + "device_sda_type_sat_attr_throughput_performance_decoded": 48, + "device_sda_type_sat_attr_throughput_performance_normalized": 148, + "device_sda_type_sat_attr_throughput_performance_raw": 48, + "device_sda_type_sat_attr_udma_crc_error_count_decoded": 0, + "device_sda_type_sat_attr_udma_crc_error_count_normalized": 100, + "device_sda_type_sat_attr_udma_crc_error_count_raw": 0, + "device_sda_type_sat_attr_unknown_attribute_decoded": 100, + "device_sda_type_sat_attr_unknown_attribute_normalized": 100, + "device_sda_type_sat_attr_unknown_attribute_raw": 100, + "device_sda_type_sat_power_cycle_count": 12, + "device_sda_type_sat_power_on_time": 29678400, + "device_sda_type_sat_smart_status_failed": 0, + "device_sda_type_sat_smart_status_passed": 1, + "device_sda_type_sat_temperature": 49, + "device_sdc_type_sat_ata_smart_error_log_summary_count": 0, + "device_sdc_type_sat_attr_available_reservd_space_decoded": 100, + "device_sdc_type_sat_attr_available_reservd_space_normalized": 100, + "device_sdc_type_sat_attr_available_reservd_space_raw": 100, + "device_sdc_type_sat_attr_command_timeout_decoded": 0, + "device_sdc_type_sat_attr_command_timeout_normalized": 100, + "device_sdc_type_sat_attr_command_timeout_raw": 0, + "device_sdc_type_sat_attr_end-to-end_error_decoded": 0, + "device_sdc_type_sat_attr_end-to-end_error_normalized": 100, + "device_sdc_type_sat_attr_end-to-end_error_raw": 0, + "device_sdc_type_sat_attr_media_wearout_indicator_decoded": 65406, + "device_sdc_type_sat_attr_media_wearout_indicator_normalized": 100, + "device_sdc_type_sat_attr_media_wearout_indicator_raw": 65406, + 
"device_sdc_type_sat_attr_power_cycle_count_decoded": 13, + "device_sdc_type_sat_attr_power_cycle_count_normalized": 100, + "device_sdc_type_sat_attr_power_cycle_count_raw": 13, + "device_sdc_type_sat_attr_power_on_hours_decoded": 8244, + "device_sdc_type_sat_attr_power_on_hours_normalized": 100, + "device_sdc_type_sat_attr_power_on_hours_raw": 8244, + "device_sdc_type_sat_attr_reallocated_sector_ct_decoded": 0, + "device_sdc_type_sat_attr_reallocated_sector_ct_normalized": 100, + "device_sdc_type_sat_attr_reallocated_sector_ct_raw": 0, + "device_sdc_type_sat_attr_reported_uncorrect_decoded": 0, + "device_sdc_type_sat_attr_reported_uncorrect_normalized": 100, + "device_sdc_type_sat_attr_reported_uncorrect_raw": 0, + "device_sdc_type_sat_attr_temperature_celsius_decoded": 27, + "device_sdc_type_sat_attr_temperature_celsius_normalized": 73, + "device_sdc_type_sat_attr_temperature_celsius_raw": 184684970011, + "device_sdc_type_sat_attr_total_lbas_read_decoded": 76778, + "device_sdc_type_sat_attr_total_lbas_read_normalized": 253, + "device_sdc_type_sat_attr_total_lbas_read_raw": 76778, + "device_sdc_type_sat_attr_total_lbas_written_decoded": 173833, + "device_sdc_type_sat_attr_total_lbas_written_normalized": 253, + "device_sdc_type_sat_attr_total_lbas_written_raw": 173833, + "device_sdc_type_sat_attr_udma_crc_error_count_decoded": 0, + "device_sdc_type_sat_attr_udma_crc_error_count_normalized": 100, + "device_sdc_type_sat_attr_udma_crc_error_count_raw": 0, + "device_sdc_type_sat_attr_unknown_attribute_decoded": 0, + "device_sdc_type_sat_attr_unknown_attribute_normalized": 0, + "device_sdc_type_sat_attr_unknown_attribute_raw": 0, + "device_sdc_type_sat_attr_unknown_ssd_attribute_decoded": 4694419309637, + "device_sdc_type_sat_attr_unknown_ssd_attribute_normalized": 4, + "device_sdc_type_sat_attr_unknown_ssd_attribute_raw": 4694419309637, + "device_sdc_type_sat_power_cycle_count": 13, + "device_sdc_type_sat_power_on_time": 29678400, + 
"device_sdc_type_sat_smart_status_failed": 0, + "device_sdc_type_sat_smart_status_passed": 1, + "device_sdc_type_sat_temperature": 27, + }, + }, + "success type nvme devices": { + prepareMock: prepareMockOkTypeNvme, + wantCharts: 4, + wantMetrics: map[string]int64{ + "device_nvme0_type_nvme_power_cycle_count": 2, + "device_nvme0_type_nvme_power_on_time": 11206800, + "device_nvme0_type_nvme_smart_status_failed": 0, + "device_nvme0_type_nvme_smart_status_passed": 1, + "device_nvme0_type_nvme_temperature": 39, + }, + }, + "error on scan": { + prepareMock: prepareMockErrOnScan, + }, + "unexpected response on scan": { + prepareMock: prepareMockUnexpectedResponse, + }, + "empty response on scan": { + prepareMock: prepareMockEmptyResponse, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + smart := New() + mock := test.prepareMock() + smart.exec = mock + smart.ScanEvery = web.Duration(time.Microsecond * 1) + smart.PollDevicesEvery = web.Duration(time.Microsecond * 1) + + var mx map[string]int64 + for i := 0; i < 10; i++ { + mx = smart.Collect() + } + + assert.Equal(t, test.wantMetrics, mx) + assert.Len(t, *smart.Charts(), test.wantCharts) + testMetricsHasAllChartsDims(t, smart, mx) + }) + } +} + +func testMetricsHasAllChartsDims(t *testing.T, smart *Smartctl, mx map[string]int64) { + for _, chart := range *smart.Charts() { + if chart.Obsolete { + continue + } + for _, dim := range chart.Dims { + _, ok := mx[dim.ID] + assert.Truef(t, ok, "collected metrics has no data for dim '%s' chart '%s'", dim.ID, chart.ID) + } + for _, v := range chart.Vars { + _, ok := mx[v.ID] + assert.Truef(t, ok, "collected metrics has no data for var '%s' chart '%s'", v.ID, chart.ID) + } + } +} + +func prepareMockOkTypeSata() *mockSmartctlCliExec { + return &mockSmartctlCliExec{ + errOnScan: false, + scanData: dataTypeSataScan, + deviceDataFunc: func(deviceName, deviceType, powerMode string) ([]byte, error) { + if deviceType != "sat" { + return nil, 
fmt.Errorf("unexpected device type %s", deviceType) + } + switch deviceName { + case "/dev/sda": + return dataTypeSataDeviceHDDSda, nil + case "/dev/sdc": + return dataTypeSataDeviceSSDSdc, nil + default: + return nil, fmt.Errorf("unexpected device name %s", deviceName) + } + }, + } +} + +func prepareMockOkTypeNvme() *mockSmartctlCliExec { + return &mockSmartctlCliExec{ + errOnScan: false, + scanData: dataTypeNvmeScan, + deviceDataFunc: func(deviceName, deviceType, powerMode string) ([]byte, error) { + if deviceType != "nvme" { + return nil, fmt.Errorf("unexpected device type %s", deviceType) + } + switch deviceName { + case "/dev/nvme0": + return dataTypeNvmeDeviceNvme0, nil + default: + return nil, fmt.Errorf("unexpected device name %s", deviceName) + } + }, + } +} + +func prepareMockErrOnScan() *mockSmartctlCliExec { + return &mockSmartctlCliExec{ + errOnScan: true, + } +} + +func prepareMockUnexpectedResponse() *mockSmartctlCliExec { + return &mockSmartctlCliExec{ + scanData: []byte(randomJsonData), + deviceDataFunc: func(_, _, _ string) ([]byte, error) { return []byte(randomJsonData), nil }, + } +} + +func prepareMockEmptyResponse() *mockSmartctlCliExec { + return &mockSmartctlCliExec{} +} + +type mockSmartctlCliExec struct { + errOnScan bool + scanData []byte + deviceDataFunc func(deviceName, deviceType, powerMode string) ([]byte, error) +} + +func (m *mockSmartctlCliExec) scan() (*gjson.Result, error) { + if m.errOnScan { + return nil, fmt.Errorf("mock.scan() error") + } + res := gjson.ParseBytes(m.scanData) + return &res, nil +} + +func (m *mockSmartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) { + if m.deviceDataFunc == nil { + return nil, nil + } + bs, err := m.deviceDataFunc(deviceName, deviceType, powerMode) + if err != nil { + return nil, err + } + res := gjson.ParseBytes(bs) + return &res, nil +} + +var randomJsonData = ` +{ + "elephant": { + "burn": false, + "mountain": true, + "fog": false, + "skin": 
-1561907625, + "burst": "anyway", + "shadow": 1558616893 + }, + "start": "ever", + "base": 2093056027, + "mission": -2007590351, + "victory": 999053756, + "die": false +} +` diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.json b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.json new file mode 100644 index 0000000000..88fe667b85 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.json @@ -0,0 +1,7 @@ +{ + "update_every": 123, + "timeout": 123.123, + "scan_every": 123.123, + "poll_devices_every": 123.123, + "no_check_power_mode": "ok" +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.yaml b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.yaml new file mode 100644 index 0000000000..967489c18e --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.yaml @@ -0,0 +1,5 @@ +update_every: 123 +timeout: 123.123 +scan_every: 123.123 +poll_devices_every: 123.123 +no_check_power_mode: "ok" diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/device-nvme0.json b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/device-nvme0.json new file mode 100644 index 0000000000..1b31d322d8 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/device-nvme0.json @@ -0,0 +1,112 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 3 + ], + "svn_revision": "5338", + "platform_info": "REDACTED", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--all", + "--json", + "--device=nvme", + "/dev/nvme0" + ], + "exit_status": 0 + }, + "local_time": { + "time_t": 1714480742, + "asctime": "Tue Apr 30 15:39:02 2024 EEST" + }, + "device": { + "name": "/dev/nvme0", + "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + }, + "model_name": "Seagate FireCuda 530 ZP4000GM30023", + "serial_number": "REDACTED", + "firmware_version": 
"REDACTED", + "nvme_pci_vendor": { + "id": 7089, + "subsystem_id": 7089 + }, + "nvme_ieee_oui_identifier": 6584743, + "nvme_total_capacity": 4000787030016, + "nvme_unallocated_capacity": 0, + "nvme_controller_id": 1, + "nvme_version": { + "string": "1.4", + "value": 66560 + }, + "nvme_number_of_namespaces": 1, + "nvme_namespaces": [ + { + "id": 1, + "size": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "capacity": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "utilization": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "formatted_lba_size": 512, + "eui64": { + "oui": 6584743, + "ext_id": 553497146765 + } + } + ], + "user_capacity": { + "blocks": 7814037168, + "bytes": 4000787030016 + }, + "logical_block_size": 512, + "smart_support": { + "available": true, + "enabled": true + }, + "smart_status": { + "passed": true, + "nvme": { + "value": 0 + } + }, + "nvme_smart_health_information_log": { + "critical_warning": 0, + "temperature": 39, + "available_spare": 100, + "available_spare_threshold": 5, + "percentage_used": 0, + "data_units_read": 52, + "data_units_written": 0, + "host_reads": 550, + "host_writes": 0, + "controller_busy_time": 0, + "power_cycles": 2, + "power_on_hours": 3113, + "unsafe_shutdowns": 1, + "media_errors": 0, + "num_err_log_entries": 4, + "warning_temp_time": 0, + "critical_comp_time": 0 + }, + "temperature": { + "current": 39 + }, + "power_cycle_count": 2, + "power_on_time": { + "hours": 3113 + } +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/scan.json b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/scan.json new file mode 100644 index 0000000000..b9f716cbdf --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/scan.json @@ -0,0 +1,29 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 3 + ], + "svn_revision": "5338", + "platform_info": "REDACTED", + "build_info": "(local build)", + "argv": [ 
+ "smartctl", + "--scan", + "--json" + ], + "exit_status": 0 + }, + "devices": [ + { + "name": "/dev/nvme0", + "info_name": "/dev/nvme0", + "type": "nvme", + "protocol": "NVMe" + } + ] +} diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-hdd-sda.json b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-hdd-sda.json new file mode 100644 index 0000000000..55cfe15f5d --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-hdd-sda.json @@ -0,0 +1,601 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 3 + ], + "svn_revision": "5338", + "platform_info": "REDACTED", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--all", + "--json", + "--device=sat", + "/dev/sda" + ], + "drive_database_version": { + "string": "7.3/5319" + }, + "exit_status": 0 + }, + "local_time": { + "time_t": 1714480013, + "asctime": "Tue Apr 30 15:26:53 2024 EEST" + }, + "device": { + "name": "/dev/sda", + "info_name": "/dev/sda [SAT]", + "type": "sat", + "protocol": "ATA" + }, + "model_name": "WDC WD181KRYZ-01AGBB0", + "serial_number": "REDACTED", + "wwn": { + "naa": 5, + "oui": 3274, + "id": 11659362274 + }, + "firmware_version": "REDACTED", + "user_capacity": { + "blocks": 35156656128, + "bytes": 18000207937536 + }, + "logical_block_size": 512, + "physical_block_size": 4096, + "rotation_rate": 7200, + "form_factor": { + "ata_value": 2, + "name": "3.5 inches" + }, + "trim": { + "supported": false + }, + "in_smartctl_database": false, + "ata_version": { + "string": "ACS-4 published, ANSI INCITS 529-2018", + "major_value": 4092, + "minor_value": 156 + }, + "sata_version": { + "string": "SATA 3.3", + "value": 511 + }, + "interface_speed": { + "max": { + "sata_value": 14, + "string": "6.0 Gb/s", + "units_per_second": 60, + "bits_per_unit": 100000000 + }, + "current": { + "sata_value": 3, + "string": "6.0 Gb/s", + "units_per_second": 60, + "bits_per_unit": 100000000 + 
} + }, + "smart_support": { + "available": true, + "enabled": true + }, + "smart_status": { + "passed": true + }, + "ata_smart_data": { + "offline_data_collection": { + "status": { + "value": 130, + "string": "was completed without error", + "passed": true + }, + "completion_seconds": 101 + }, + "self_test": { + "status": { + "value": 0, + "string": "completed without error", + "passed": true + }, + "polling_minutes": { + "short": 2, + "extended": 1883 + } + }, + "capabilities": { + "values": [ + 91, + 3 + ], + "exec_offline_immediate_supported": true, + "offline_is_aborted_upon_new_cmd": false, + "offline_surface_scan_supported": true, + "self_tests_supported": true, + "conveyance_self_test_supported": false, + "selective_self_test_supported": true, + "attribute_autosave_enabled": true, + "error_logging_supported": true, + "gp_logging_supported": true + } + }, + "ata_sct_capabilities": { + "value": 61, + "error_recovery_control_supported": true, + "feature_control_supported": true, + "data_table_supported": true + }, + "ata_smart_attributes": { + "revision": 16, + "table": [ + { + "id": 1, + "name": "Raw_Read_Error_Rate", + "value": 100, + "worst": 100, + "thresh": 1, + "when_failed": "", + "flags": { +