summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlya Mashchenko <ilya@netdata.cloud>2024-04-30 18:24:46 +0300
committerGitHub <noreply@github.com>2024-04-30 18:24:46 +0300
commite1c4f7fec367a610c9c8beea1789c51c6434d51a (patch)
tree1c2700683e9fa5079c39e2108274b0635b6abdb2
parentb1bb1e9ded69b68dcc31cb7e2303f5fd80d8c1bf (diff)
go.d smartctl (#17536)
-rw-r--r--src/go/collectors/go.d.plugin/README.md4
-rw-r--r--src/go/collectors/go.d.plugin/config/go.d.conf1
-rw-r--r--src/go/collectors/go.d.plugin/config/go.d/smartctl.conf5
-rw-r--r--src/go/collectors/go.d.plugin/go.mod3
-rw-r--r--src/go/collectors/go.d.plugin/go.sum6
-rw-r--r--src/go/collectors/go.d.plugin/modules/init.go1
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/charts.go237
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/collect.go189
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json67
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/exec.go82
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/init.go32
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml154
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/scan.go68
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/smart_device.go123
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/smartctl.go126
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/smartctl_test.go436
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.json7
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/config.yaml5
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/device-nvme0.json112
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-nvme/scan.json29
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-hdd-sda.json601
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/device-ssd-sdc.json652
-rw-r--r--src/go/collectors/go.d.plugin/modules/smartctl/testdata/type-sat/scan.json35
23 files changed, 2973 insertions, 2 deletions
diff --git a/src/go/collectors/go.d.plugin/README.md b/src/go/collectors/go.d.plugin/README.md
index de64966164..4d0718fc0c 100644
--- a/src/go/collectors/go.d.plugin/README.md
+++ b/src/go/collectors/go.d.plugin/README.md
@@ -118,6 +118,7 @@ see the appropriate collector readme.
| [sensors](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules) | Hardware Sensors |
| [SNMP](https://github.com/netdata/netdata/blob/master/src/go/collectors/go.d.plugin/modules/snmp) | SNMP |
| [squidlog](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/squidlog) | Squid |
+| [smartctl](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl) | S.M.A.R.T. Storage Devices |
| [storcli](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/storcli) | Broadcom Hardware RAID |
| [supervisord](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/supervisord) | Supervisor |
| [systemdunits](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/systemdunits) | Systemd unit state |
@@ -213,8 +214,7 @@ sudo su -s /bin/bash netdata
./go.d.plugin -d -m <module name>
```
-Change `<module name>` to the module name you want to debug. See the [whole list](#available-modules) of available
-modules.
+Change `<module name>` to the [module name](#available-modules) you want to debug.
## Netdata Community
diff --git a/src/go/collectors/go.d.plugin/config/go.d.conf b/src/go/collectors/go.d.plugin/config/go.d.conf
index 4d143b1cf9..34abed37f4 100644
--- a/src/go/collectors/go.d.plugin/config/go.d.conf
+++ b/src/go/collectors/go.d.plugin/config/go.d.conf
@@ -80,6 +80,7 @@ modules:
# sensors: yes
# snmp: yes
# squidlog: yes
+# smartctl: yes
# storcli: yes
# supervisord: yes
# systemdunits: yes
diff --git a/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf
new file mode 100644
index 0000000000..dea5116be9
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/config/go.d/smartctl.conf
@@ -0,0 +1,5 @@
+## All available configuration options, their descriptions and default values:
+## https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/smartctl#readme
+
+jobs:
+ - name: smartctl
diff --git a/src/go/collectors/go.d.plugin/go.mod b/src/go/collectors/go.d.plugin/go.mod
index e5885c23e3..558223f426 100644
--- a/src/go/collectors/go.d.plugin/go.mod
+++ b/src/go/collectors/go.d.plugin/go.mod
@@ -119,6 +119,9 @@ require (
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spf13/cast v1.3.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
+ github.com/tidwall/gjson v1.17.1 // indirect
+ github.com/tidwall/match v1.1.1 // indirect
+ github.com/tidwall/pretty v1.2.0 // indirect
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
github.com/xdg-go/scram v1.1.2 // indirect
github.com/xdg-go/stringprep v1.0.4 // indirect
diff --git a/src/go/collectors/go.d.plugin/go.sum b/src/go/collectors/go.d.plugin/go.sum
index 9bd073665e..b8146164a4 100644
--- a/src/go/collectors/go.d.plugin/go.sum
+++ b/src/go/collectors/go.d.plugin/go.sum
@@ -332,6 +332,12 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/tidwall/gjson v1.17.1 h1:wlYEnwqAHgzmhNUFfw7Xalt2JzQvsMx2Se4PcoFCT/U=
+github.com/tidwall/gjson v1.17.1/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19 h1:ZCmSnT6CLGhfoQ2lPEhL4nsJstKDCw1F1RfN8/smTCU=
github.com/tomasen/fcgi_client v0.0.0-20180423082037-2bb3d819fd19/go.mod h1:SXTY+QvI+KTTKXQdg0zZ7nx0u94QWh8ZAwBQYsW9cqk=
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
diff --git a/src/go/collectors/go.d.plugin/modules/init.go b/src/go/collectors/go.d.plugin/modules/init.go
index b0f1f9e838..c8262096a6 100644
--- a/src/go/collectors/go.d.plugin/modules/init.go
+++ b/src/go/collectors/go.d.plugin/modules/init.go
@@ -70,6 +70,7 @@ import (
_ "github.com/netdata/netdata/go/go.d.plugin/modules/redis"
_ "github.com/netdata/netdata/go/go.d.plugin/modules/scaleio"
_ "github.com/netdata/netdata/go/go.d.plugin/modules/sensors"
+ _ "github.com/netdata/netdata/go/go.d.plugin/modules/smartctl"
_ "github.com/netdata/netdata/go/go.d.plugin/modules/snmp"
_ "github.com/netdata/netdata/go/go.d.plugin/modules/squidlog"
_ "github.com/netdata/netdata/go/go.d.plugin/modules/storcli"
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/charts.go b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go
new file mode 100644
index 0000000000..7ad4ea4c40
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/charts.go
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+)
+
+// Chart priorities, assigned sequentially starting at module.Priority.
+const (
+ prioDeviceSmartStatus = module.Priority + iota
+ prioDeviceAtaSmartErrorLogCount
+ prioDevicePowerOnTime
+ prioDeviceTemperature
+ prioDevicePowerCycleCount
+
+ prioDeviceSmartAttributeDecoded
+ prioDeviceSmartAttributeNormalized
+)
+
+// deviceChartsTmpl holds copies of the fixed per-device chart templates.
+// newDeviceCharts copies this set and removes the charts a device has no data for.
+var deviceChartsTmpl = module.Charts{
+ devicePowerOnTimeChartTmpl.Copy(),
+ deviceTemperatureChartTmpl.Copy(),
+ devicePowerCycleCountChartTmpl.Copy(),
+ deviceSmartStatusChartTmpl.Copy(),
+ deviceAtaSmartErrorLogCountChartTmpl.Copy(),
+}
+
+// Fixed per-device chart templates. The two %s verbs in every chart ID and
+// dimension ID are filled with the device name and the device type by
+// newDeviceCharts; the dimension IDs must match the metric keys written by
+// collectSmartDevice.
+var (
+ deviceSmartStatusChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_smart_status",
+ Title: "Device smart status",
+ Units: "status",
+ Fam: "smart status",
+ Ctx: "smartctl.device_smart_status",
+ Type: module.Line,
+ Priority: prioDeviceSmartStatus,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_smart_status_passed", Name: "passed"},
+ {ID: "device_%s_type_%s_smart_status_failed", Name: "failed"},
+ },
+ }
+ deviceAtaSmartErrorLogCountChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_ata_smart_error_log_count",
+ Title: "Device ATA smart error log count",
+ Units: "logs",
+ Fam: "smart error log",
+ Ctx: "smartctl.device_ata_smart_error_log_count",
+ Type: module.Line,
+ Priority: prioDeviceAtaSmartErrorLogCount,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_ata_smart_error_log_summary_count", Name: "error_log"},
+ },
+ }
+ devicePowerOnTimeChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_power_on_time",
+ Title: "Device power on time",
+ Units: "seconds",
+ Fam: "power on time",
+ Ctx: "smartctl.device_power_on_time",
+ Type: module.Line,
+ Priority: prioDevicePowerOnTime,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_power_on_time", Name: "power_on_time"},
+ },
+ }
+ deviceTemperatureChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_temperature",
+ Title: "Device temperature",
+ Units: "Celsius",
+ Fam: "temperature",
+ Ctx: "smartctl.device_temperature",
+ Type: module.Line,
+ Priority: prioDeviceTemperature,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_temperature", Name: "temperature"},
+ },
+ }
+ devicePowerCycleCountChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_power_cycle_count",
+ Title: "Device power cycles",
+ Units: "cycles",
+ Fam: "power cycles",
+ Ctx: "smartctl.device_power_cycles_count",
+ Type: module.Line,
+ Priority: prioDevicePowerCycleCount,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_power_cycle_count", Name: "power"},
+ },
+ }
+)
+
+// Per-attribute chart templates, instantiated once per valid SMART attribute by
+// newDeviceSmartAttrCharts. It fills the %s verbs as follows: chart and
+// dimension IDs get the device name, device type and cleaned attribute name;
+// Title gets the attribute name as reported; Fam, Ctx and the dimension Name
+// get the cleaned attribute name.
+var (
+ deviceSmartAttributeDecodedChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_smart_attr_%s",
+ Title: "Device smart attribute %s",
+ Units: "value",
+ Fam: "attr %s",
+ Ctx: "smartctl.device_smart_attr_%s",
+ Type: module.Line,
+ Priority: prioDeviceSmartAttributeDecoded,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_attr_%s_decoded", Name: "%s"},
+ },
+ }
+ deviceSmartAttributeNormalizedChartTmpl = module.Chart{
+ ID: "device_%s_type_%s_smart_attr_%s_normalized",
+ Title: "Device smart attribute normalized %s",
+ Units: "value",
+ Fam: "attr %s",
+ Ctx: "smartctl.device_smart_attr_%s_normalized",
+ Type: module.Line,
+ Priority: prioDeviceSmartAttributeNormalized,
+ Dims: module.Dims{
+ {ID: "device_%s_type_%s_attr_%s_normalized", Name: "%s"},
+ },
+ }
+)
+
+// addDeviceCharts builds the fixed and the per-attribute charts for dev and
+// registers them with the collector. Failures to add charts are logged as
+// warnings and do not abort the operation.
+func (s *Smartctl) addDeviceCharts(dev *smartDevice) {
+	collected := module.Charts{}
+
+	// merge appends a non-empty chart set to collected, logging any error.
+	merge := func(set *module.Charts) {
+		if set == nil || len(*set) == 0 {
+			return
+		}
+		if err := collected.Add(*set...); err != nil {
+			s.Warning(err)
+		}
+	}
+
+	merge(s.newDeviceCharts(dev))
+	merge(s.newDeviceSmartAttrCharts(dev))
+
+	if err := s.Charts().Add(collected...); err != nil {
+		s.Warning(err)
+	}
+}
+
+// removeDeviceCharts marks every chart whose ID starts with the chart prefix of
+// scanDev as removed and not created.
+func (s *Smartctl) removeDeviceCharts(scanDev *scanDevice) {
+	// Chart IDs are generated from templates of the form "device_%s_type_%s_...",
+	// so the prefix needs the "type_" segment; without it no chart ever matched.
+	// NOTE(review): assumes scanDev.shortName() yields the same name that
+	// smartDevice.deviceName() returned when the charts were created - confirm.
+	px := fmt.Sprintf("device_%s_type_%s_", scanDev.shortName(), scanDev.typ)
+
+	for _, chart := range *s.Charts() {
+		if strings.HasPrefix(chart.ID, px) {
+			chart.MarkRemove()
+			chart.MarkNotCreated()
+		}
+	}
+}
+
+// newDeviceCharts builds the fixed set of charts for dev. Charts for metrics the
+// device does not report are dropped; the IDs of the remaining charts and their
+// dimensions are filled with the device name and type, and the device labels
+// are attached to every chart.
+func (s *Smartctl) newDeviceCharts(dev *smartDevice) *module.Charts {
+	charts := deviceChartsTmpl.Copy()
+
+	// Charts are removed by their template ID because the IDs are only
+	// formatted in the loop further down.
+	if _, ok := dev.powerOnTime(); !ok {
+		_ = charts.Remove(devicePowerOnTimeChartTmpl.ID)
+	}
+	if _, ok := dev.temperature(); !ok {
+		_ = charts.Remove(deviceTemperatureChartTmpl.ID)
+	}
+	if _, ok := dev.powerCycleCount(); !ok {
+		// Drop the power cycle chart itself; removing the power-on-time chart here
+		// would discard a chart that has data and keep one that has none.
+		_ = charts.Remove(devicePowerCycleCountChartTmpl.ID)
+	}
+	if _, ok := dev.smartStatusPassed(); !ok {
+		_ = charts.Remove(deviceSmartStatusChartTmpl.ID)
+	}
+	if _, ok := dev.ataSmartErrorLogCount(); !ok {
+		_ = charts.Remove(deviceAtaSmartErrorLogCountChartTmpl.ID)
+	}
+
+	for _, chart := range *charts {
+		chart.ID = fmt.Sprintf(chart.ID, dev.deviceName(), dev.deviceType())
+		chart.Labels = []module.Label{
+			{Key: "device_name", Value: dev.deviceName()},
+			{Key: "device_type", Value: dev.deviceType()},
+			{Key: "model_name", Value: dev.modelName()},
+			{Key: "serial_number", Value: dev.serialNumber()},
+		}
+		for _, dim := range chart.Dims {
+			dim.ID = fmt.Sprintf(dim.ID, dev.deviceName(), dev.deviceType())
+		}
+	}
+
+	return charts
+}
+
+// newDeviceSmartAttrCharts creates a "decoded" and a "normalized" chart for every
+// valid entry of the device's ATA SMART attribute table whose name does not
+// start with "Unknown". It returns nil when the device has no such table.
+func (s *Smartctl) newDeviceSmartAttrCharts(dev *smartDevice) *module.Charts {
+	attrs, ok := dev.ataSmartAttributeTable()
+	if !ok {
+		return nil
+	}
+
+	result := module.Charts{}
+
+	for _, attr := range attrs {
+		switch {
+		case !isSmartAttrValid(attr):
+			continue
+		case strings.HasPrefix(attr.name(), "Unknown"):
+			continue
+		}
+
+		attrCharts := module.Charts{
+			deviceSmartAttributeDecodedChartTmpl.Copy(),
+			deviceSmartAttributeNormalizedChartTmpl.Copy(),
+		}
+
+		cleanName := cleanAttributeName(attr)
+
+		// FIXME: attribute charts unit
+		for _, ch := range attrCharts {
+			ch.ID = fmt.Sprintf(ch.ID, dev.deviceName(), dev.deviceType(), cleanName)
+			ch.Title = fmt.Sprintf(ch.Title, attr.name())
+			ch.Fam = fmt.Sprintf(ch.Fam, cleanName)
+			ch.Ctx = fmt.Sprintf(ch.Ctx, cleanName)
+			ch.Labels = []module.Label{
+				{Key: "device_name", Value: dev.deviceName()},
+				{Key: "device_type", Value: dev.deviceType()},
+				{Key: "model_name", Value: dev.modelName()},
+				{Key: "serial_number", Value: dev.serialNumber()},
+			}
+			for _, d := range ch.Dims {
+				d.ID = fmt.Sprintf(d.ID, dev.deviceName(), dev.deviceType(), cleanName)
+				d.Name = fmt.Sprintf(d.Name, cleanName)
+			}
+		}
+
+		if err := result.Add(attrCharts...); err != nil {
+			s.Warning(err)
+		}
+	}
+
+	return &result
+}
+
+// attrNameReplacer maps spaces and slashes in attribute names to underscores.
+var attrNameReplacer = strings.NewReplacer(" ", "_", "/", "_")
+
+// cleanAttributeName returns the attribute name with spaces and slashes replaced
+// by underscores and all letters lower-cased.
+func cleanAttributeName(attr *smartAttribute) string {
+	replaced := attrNameReplacer.Replace(attr.name())
+	return strings.ToLower(replaced)
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/collect.go b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go
new file mode 100644
index 0000000000..79cbb13d02
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/collect.go
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+ "fmt"
+ "maps"
+ "slices"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/tidwall/gjson"
+)
+
+// collect returns the current metrics. It rescans the device list when a scan is
+// forced or due, polls every scanned device when a poll is forced or due, and
+// otherwise returns the metrics cached by the previous poll.
+func (s *Smartctl) collect() (map[string]int64, error) {
+	now := time.Now()
+
+	if s.forceScan || s.isTimeToScan(now) {
+		devices, err := s.scanDevices()
+		if err != nil {
+			return nil, err
+		}
+
+		// Forget devices that disappeared since the previous scan.
+		for k, dev := range s.scannedDevices {
+			if _, ok := devices[k]; !ok {
+				delete(s.scannedDevices, k)
+				delete(s.seenDevices, k)
+				s.removeDeviceCharts(dev)
+			}
+		}
+
+		// Compare devices by key. maps.Equal would compare the *scanDevice values
+		// by pointer identity, so a rescan that found exactly the same devices
+		// would still look different and force a poll after every scan
+		// (assuming scanDevices allocates fresh values on each call - TODO confirm).
+		sameDevices := maps.EqualFunc(s.scannedDevices, devices, func(a, b *scanDevice) bool {
+			return a.key() == b.key()
+		})
+
+		s.forceDevicePoll = !sameDevices
+		s.scannedDevices = devices
+		s.lastScanTime = now
+		s.forceScan = false
+	}
+
+	if s.forceDevicePoll || s.isTimeToPollDevices(now) {
+		mx := make(map[string]int64)
+
+		// TODO: make it concurrent
+		for _, d := range s.scannedDevices {
+			if err := s.collectScannedDevice(mx, d); err != nil {
+				return nil, err
+			}
+		}
+
+		s.forceDevicePoll = false
+		s.lastDevicePollTime = now
+		s.mx = mx
+	}
+
+	return s.mx, nil
+}
+
+// collectScannedDevice queries smartctl for one scanned device and stores its
+// metrics in mx. A device that smartctl reports as no longer existing triggers a
+// rescan instead of an error; devices in a low-power mode and responses without
+// a device name or type are skipped. Charts are added the first time a device
+// is seen.
+func (s *Smartctl) collectScannedDevice(mx map[string]int64, scanDev *scanDevice) error {
+	resp, err := s.exec.deviceInfo(scanDev.name, scanDev.typ, s.NoCheckPowerMode)
+
+	switch {
+	case err == nil:
+		// fall through to processing the response
+	case resp != nil && isDeviceOpenFailedNoSuchDevice(resp):
+		s.Infof("smartctl reported that device '%s' type '%s' no longer exists", scanDev.name, scanDev.typ)
+		s.forceScan = true
+		return nil
+	default:
+		return fmt.Errorf("failed to get device info for '%s' type '%s': %v", scanDev.name, scanDev.typ, err)
+	}
+
+	if isDeviceInLowerPowerMode(resp) {
+		s.Debugf("device '%s' type '%s' is in a low-power mode, skipping", scanDev.name, scanDev.typ)
+		return nil
+	}
+
+	dev := newSmartDevice(resp)
+	if !isSmartDeviceValid(dev) {
+		return nil
+	}
+
+	if key := scanDev.key(); !s.seenDevices[key] {
+		s.seenDevices[key] = true
+		s.addDeviceCharts(dev)
+	}
+
+	s.collectSmartDevice(mx, dev)
+
+	return nil
+}
+
+// collectSmartDevice writes the metrics of dev into mx. Every key is prefixed
+// with "device_<name>_type_<type>_" so that it matches the dimension IDs of the
+// device's charts. Values the device does not report are left out.
+func (s *Smartctl) collectSmartDevice(mx map[string]int64, dev *smartDevice) {
+	px := fmt.Sprintf("device_%s_type_%s_", dev.deviceName(), dev.deviceType())
+
+	if v, ok := dev.powerOnTime(); ok {
+		mx[px+"power_on_time"] = v
+	}
+	if v, ok := dev.temperature(); ok {
+		mx[px+"temperature"] = v
+	}
+	if v, ok := dev.powerCycleCount(); ok {
+		mx[px+"power_cycle_count"] = v
+	}
+	if v, ok := dev.smartStatusPassed(); ok {
+		// Exactly one of the two status dimensions is set to 1.
+		mx[px+"smart_status_passed"] = 0
+		mx[px+"smart_status_failed"] = 0
+		if v {
+			mx[px+"smart_status_passed"] = 1
+		} else {
+			mx[px+"smart_status_failed"] = 1
+		}
+	}
+	if v, ok := dev.ataSmartErrorLogCount(); ok {
+		mx[px+"ata_smart_error_log_summary_count"] = v
+	}
+
+	attrs, ok := dev.ataSmartAttributeTable()
+	if !ok {
+		return
+	}
+
+	for _, attr := range attrs {
+		if !isSmartAttrValid(attr) {
+			continue
+		}
+
+		// Use the same name cleaning as the attribute charts. Replacing only
+		// spaces left "/" in the key, so attributes with a slash in their name
+		// never matched their chart dimensions.
+		attrPx := fmt.Sprintf("%sattr_%s_", px, cleanAttributeName(attr))
+
+		if v, err := strconv.ParseInt(attr.value(), 10, 64); err == nil {
+			mx[attrPx+"normalized"] = v
+		}
+
+		if v, err := strconv.ParseInt(attr.rawValue(), 10, 64); err == nil {
+			mx[attrPx+"raw"] = v
+		}
+
+		// The decoded value is the leading integer of the raw string; anything
+		// after the first space is ignored.
+		rs := strings.TrimSpace(attr.rawString())
+		if i := strings.IndexByte(rs, ' '); i != -1 {
+			rs = rs[:i]
+		}
+		if v, err := strconv.ParseInt(rs, 10, 64); err == nil {
+			mx[attrPx+"decoded"] = v
+		}
+	}
+}
+
+// isTimeToScan reports whether now is later than the last scan time plus the
+// configured scan interval.
+func (s *Smartctl) isTimeToScan(now time.Time) bool {
+	nextScan := s.lastScanTime.Add(s.ScanEvery.Duration())
+	return now.After(nextScan)
+}
+
+// isTimeToPollDevices reports whether now is later than the last device poll
+// time plus the configured device poll interval.
+func (s *Smartctl) isTimeToPollDevices(now time.Time) bool {
+	nextPoll := s.lastDevicePollTime.Add(s.PollDevicesEvery.Duration())
+	return now.After(nextPoll)
+}
+
+// isSmartDeviceValid reports whether the device has both a name and a type.
+func isSmartDeviceValid(d *smartDevice) bool {
+	if d.deviceName() == "" {
+		return false
+	}
+	return d.deviceType() != ""
+}
+
+// isSmartAttrValid reports whether the attribute has both an id and a name.
+func isSmartAttrValid(a *smartAttribute) bool {
+	if a.id() == "" {
+		return false
+	}
+	return a.name() != ""
+}
+
+// isDeviceInLowerPowerMode reports whether bit 1 of the smartctl exit status is
+// set and one of the smartctl messages starts with "Device is in" and contains
+// "mode".
+func isDeviceInLowerPowerMode(r *gjson.Result) bool {
+	if !isExitStatusHasBit(r, 1) {
+		return false
+	}
+
+	for _, msg := range r.Get("smartctl.messages").Array() {
+		text := msg.Get("string").String()
+		if strings.HasPrefix(text, "Device is in") && strings.Contains(text, "mode") {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isDeviceOpenFailedNoSuchDevice reports whether bit 1 of the smartctl exit
+// status is set and one of the smartctl messages ends with "No such device".
+func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool {
+	if !isExitStatusHasBit(r, 1) {
+		return false
+	}
+
+	for _, msg := range r.Get("smartctl.messages").Array() {
+		if strings.HasSuffix(msg.Get("string").String(), "No such device") {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isExitStatusHasBit reports whether the given bit is set in the
+// "smartctl.exit_status" value of the response.
+func isExitStatusHasBit(r *gjson.Result, bit int) bool {
+	// https://manpages.debian.org/bullseye/smartmontools/smartctl.8.en.html#EXIT_STATUS
+	exitStatus := int(r.Get("smartctl.exit_status").Int())
+	return exitStatus&(1<<bit) != 0
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json
new file mode 100644
index 0000000000..273899cd98
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/config_schema.json
@@ -0,0 +1,67 @@
+{
+ "jsonSchema": {
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Smartctl collector configuration.",
+ "type": "object",
+ "properties": {
+ "update_every": {
+ "title": "Update every",
+ "description": "Interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**.",
+ "type": "integer",
+ "minimum": 1,
+ "default": 10
+ },
+ "timeout": {
+ "title": "Timeout",
+ "description": "Timeout for executing the `smartctl` binary, specified in seconds.",
+ "type": "number",
+ "minimum": 0.5,
+ "default": 5
+ },
+ "scan_every": {
+ "title": "Scan interval",
+ "description": "Interval for discovering new devices using `smartctl --scan`, measured in seconds.",
+ "type": "number",
+ "minimum": 1,
+ "default": 900
+ },
+ "poll_devices_every": {
+ "title": "Devices poll interval",
+ "description": "Interval for gathering data for every device, measured in seconds. Data is cached for this interval.",
+ "type": "number",
+ "minimum": 1,
+ "default": 300
+ },
+ "no_check_power_mode": {
+ "title": "No check power mode",
+ "description": "ATA only. Skip data collection when the device is in a low-power mode. Prevents unnecessary disk spin-up.",
+ "type": "string",
+ "enum": [
+ "standby",
+ "never",
+ "sleep",
+ "idle"
+ ],
+ "default": "standby"
+ }
+ },
+ "additionalProperties": false,
+ "patternProperties": {
+ "^name$": {}
+ }
+ },
+ "uiSchema": {
+ "uiOptions": {
+ "fullPage": true
+ },
+ "timeout": {
+ "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)."
+ },
+ "no_check_power_mode": {
+ "ui:widget": "radio",
+ "ui:options": {
+ "inline": true
+ }
+ }
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/exec.go b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go
new file mode 100644
index 0000000000..a90e1b529b
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/exec.go
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "os/exec"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/logger"
+
+ "github.com/tidwall/gjson"
+)
+
+// newSmartctlCliExec returns an executor that runs commands through the ndsudo
+// binary at ndsudoPath, using timeout for every execution and log for logging.
+func newSmartctlCliExec(ndsudoPath string, timeout time.Duration, log *logger.Logger) *smartctlCliExec {
+	e := new(smartctlCliExec)
+	e.Logger = log
+	e.ndsudoPath = ndsudoPath
+	e.timeout = timeout
+	return e
+}
+
+// smartctlCliExec runs the smartctl helper commands through the ndsudo binary
+// and returns their JSON output.
+type smartctlCliExec struct {
+ *logger.Logger
+
+ // ndsudoPath is the executable passed to exec.CommandContext.
+ ndsudoPath string
+ // timeout bounds every single command execution.
+ timeout time.Duration
+}
+
+func (e *smartctlCliExec) scan() (*gjson.Result, error) {
+ return e.execute("smartctl-json-scan")
+}
+
+// deviceInfo runs the "smartctl-json-device-info" ndsudo command for the given
+// device name, device type and power mode and returns its parsed output.
+func (e *smartctlCliExec) deviceInfo(deviceName, deviceType, powerMode string) (*gjson.Result, error) {
+	args := []string{
+		"smartctl-json-device-info",
+		"--deviceName", deviceName,
+		"--deviceType", deviceType,
+		"--powerMode", powerMode,
+	}
+	return e.execute(args...)
+}
+
+// execute runs ndsudo with args under the configured timeout and parses its
+// standard output as JSON.
+//
+// It returns an error when the command times out, exits with code 1, produces
+// no output, produces invalid JSON, or produces JSON without
+// "smartctl.exit_status". When smartctl itself reports a message of severity
+// "error", both the parsed result and an error are returned so that callers can
+// inspect the response.
+func (e *smartctlCliExec) execute(args ...string) (*gjson.Result, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, e.ndsudoPath, args...)
+	e.Debugf("executing '%s'", cmd)
+
+	bs, err := cmd.Output()
+	if err != nil {
+		// When the timeout fires the process is killed and Output usually reports
+		// the resulting exit status instead of the context error, so the context
+		// itself has to be consulted to detect the timeout reliably.
+		timedOut := errors.Is(err, context.DeadlineExceeded) ||
+			errors.Is(ctx.Err(), context.DeadlineExceeded)
+
+		if timedOut || isExecExitCode(err, 1) || len(bs) == 0 {
+			return nil, fmt.Errorf("'%s' execution failed: %v", cmd, err)
+		}
+		// Other non-zero exit codes are tolerated as long as there is output:
+		// the outcome is then judged from the JSON below.
+	}
+	if len(bs) == 0 {
+		return nil, fmt.Errorf("'%s' returned no output", cmd)
+	}
+
+	if !gjson.ValidBytes(bs) {
+		return nil, fmt.Errorf("'%s' returned invalid JSON output", cmd)
+	}
+
+	res := gjson.ParseBytes(bs)
+	if !res.Get("smartctl.exit_status").Exists() {
+		return nil, fmt.Errorf("'%s' returned unexpected data", cmd)
+	}
+
+	for _, msg := range res.Get("smartctl.messages").Array() {
+		if msg.Get("severity").String() == "error" {
+			return &res, fmt.Errorf("'%s' reported an error: %s", cmd, msg.Get("string"))
+		}
+	}
+
+	return &res, nil
+}
+
+// isExecExitCode reports whether err wraps an *exec.ExitError whose exit code
+// equals exitCode.
+func isExecExitCode(err error, exitCode int) bool {
+	var exitErr *exec.ExitError
+	if !errors.As(err, &exitErr) {
+		return false
+	}
+	return exitErr.ExitCode() == exitCode
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/init.go b/src/go/collectors/go.d.plugin/modules/smartctl/init.go
new file mode 100644
index 0000000000..5c6ede5316
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/init.go
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package smartctl
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/executable"
+)
+
+// validateConfig returns an error unless NoCheckPowerMode is one of "never",
+// "sleep", "standby" or "idle".
+func (s *Smartctl) validateConfig() error {
+	mode := s.NoCheckPowerMode
+	if mode == "never" || mode == "sleep" || mode == "standby" || mode == "idle" {
+		return nil
+	}
+	return fmt.Errorf("invalid power mode '%s'", mode)
+}
+
+// initSmartctlCli locates the ndsudo binary in the plugin's executable directory
+// and returns a smartctl executor that uses it. It fails when the binary cannot
+// be stat'ed.
+func (s *Smartctl) initSmartctlCli() (smartctlCli, error) {
+	ndsudoPath := filepath.Join(executable.Directory, "ndsudo")
+
+	_, err := os.Stat(ndsudoPath)
+	if err != nil {
+		return nil, fmt.Errorf("ndsudo executable not found: %v", err)
+	}
+
+	return newSmartctlCliExec(ndsudoPath, s.Timeout.Duration(), s.Logger), nil
+}
diff --git a/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml
new file mode 100644
index 0000000000..0ef0843f99
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/smartctl/metadata.yaml
@@ -0,0 +1,154 @@
+plugin_name: go.d.plugin
+modules:
+ - meta:
+ id: collector-go.d.plugin-smartctl
+ plugin_name: go.d.plugin
+ module_name: smartctl
+ monitored_instance:
+ name: S.M.A.R.T.
+ link: "https://linux.die.net/man/8/smartd"
+ icon_filename: "smart.png"
+ categories:
+ - data-collection.hardware-devices-and-sensors
+ keywords:
+ - smart
+ - S.M.A.R.T.
+ - SCSI devices
+ - ATA devices
+ related_resources:
+ integrations:
+ list: []
+ info_provided_to_referring_integrations:
+ description: ""
+ most_popular: false
+ overview:
+ data_collection:
+ metrics_description: |
+ This collector monitors the health status of storage devices by analyzing S.M.A.R.T. (Self-Monitoring, Analysis, and Reporting Technology) counters.
+ It relies on the [`smartctl`](https://linux.die.net/man/8/smartctl) CLI tool but avoids directly executing the binary.
+ Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.
+ This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.
+
+ Executed commands:
+ - `smartctl --json --scan`
+ - `smartctl --json --all {deviceName} --device {deviceType} --nocheck {powerMode}`
+ method_description: ""
+ supported_platforms:
+ include: []
+ exclude: []
+ multi_instance: false
+ additional_permissions:
+ description: ""
+ default_behavior:
+ auto_detection:
+ description: ""
+ limits:
+ description: ""
+ performance_impact:
+ description: ""
+ setup:
+ prerequisites:
+ list: []
+ configuration:
+ file:
+ name: go.d/smartctl.conf
+ options:
+ description: |
+ The following options can be defined globally: update_every.
+ folding:
+ title: Config options
+ enabled: true
+ list:
+ - name: update_every
+ description: interval for updating Netdata charts, measured in seconds. Collector might use cached data if less than **Devices poll interval**.
+ default_value: 10
+ required: false
+ - name: timeout
+ description: smartctl binary execution timeout.
+ default_value: 5
+ required: false
+ - name: scan_every
+ description: interval for discovering new devices using `smartctl --scan`, measured in seconds.
+ default_value: 900
+ required: false
+ - name: poll_devices_every
+ description: interval for gathering data for every device, measured in seconds. Data is cached for this interval.
+ default_value: 300
+ required: false
+ examples:
+ folding:
+ title: Config
+ enabled: true
+ list:
+ - name: Custom devices poll interval
+ description: Allows you to override the default devices poll interval (data collection).
+ config: |