summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: lgz <ilyamaschenko@gmail.com> 2017-11-29 19:04:57 +0900
committer: lgz <ilyamaschenko@gmail.com> 2017-11-29 19:04:57 +0900
commit: 1ef00cd9f49410a69b9a6b6cfe59378bb9a34d42 (patch)
tree: 84f5fc69e80a962a6b2a30f86ab8a754e6ac33ab
parent: 28dc234eefe792a63c359db69bfbc423175cdb4b (diff)
smartd_log improvements
-rw-r--r-- python.d/smartd_log.chart.py 428
1 files changed, 299 insertions, 129 deletions
diff --git a/python.d/smartd_log.chart.py b/python.d/smartd_log.chart.py
index 14ed7d6cf8..1f7932ad65 100644
--- a/python.d/smartd_log.chart.py
+++ b/python.d/smartd_log.chart.py
@@ -3,174 +3,344 @@
# Author: l2isbad, vorph1
import os
-from re import compile as r_compile
+import re
+
+from collections import namedtuple
+from time import time
from bases.collection import read_last_line
from bases.FrameworkServices.SimpleService import SimpleService
# charts order (can be overridden if you want less charts, or different order)
-ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
+# ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
+ORDER = ['194']
SMART_ATTR = {
- '1': 'Read Error Rate',
- '2': 'Throughput Performance',
- '3': 'Spin-Up Time',
- '4': 'Start/Stop Count',
- '5': 'Reallocated Sectors Count',
- '6': 'Read Channel Margin',
- '7': 'Seek Error Rate',
- '8': 'Seek Time Performance',
- '9': 'Power-On Hours Count',
- '10': 'Spin-up Retries',
- '11': 'Calibration Retries',
- '12': 'Power Cycle Count',
- '13': 'Soft Read Error Rate',
- '100': 'Erase/Program Cycles',
- '103': 'Translation Table Rebuild',
- '108': 'Unknown (108)',
- '170': 'Reserved Block Count',
- '171': 'Program Fail Count',
- '172': 'Erase Fail Count',
- '173': 'Wear Leveller Worst Case Erase Count',
- '174': 'Unexpected Power Loss',
- '175': 'Program Fail Count',
- '176': 'Erase Fail Count',
- '177': 'Wear Leveling Count',
- '178': 'Used Reserved Block Count',
- '179': 'Used Reserved Block Count',
- '180': 'Unused Reserved Block Count',
- '181': 'Program Fail Count',
- '182': 'Erase Fail Count',
- '183': 'SATA Downshifts',
- '184': 'End-to-End error',
- '185': 'Head Stability',
- '186': 'Induced Op-Vibration Detection',
- '187': 'Reported Uncorrectable Errors',
- '188': 'Command Timeout',
- '189': 'High Fly Writes',
- '190': 'Temperature',
- '191': 'G-Sense Errors',
- '192': 'Power-Off Retract Cycles',
- '193': 'Load/Unload Cycles',
- '194': 'Temperature',
- '195': 'Hardware ECC Recovered',
- '196': 'Reallocation Events',
- '197': 'Current Pending Sectors',
- '198': 'Off-line Uncorrectable',
- '199': 'UDMA CRC Error Rate',
- '200': 'Write Error Rate',
- '201': 'Soft Read Errors',
- '202': 'Data Address Mark Errors',
- '203': 'Run Out Cancel',
- '204': 'Soft ECC Corrections',
- '205': 'Thermal Asperity Rate',
- '206': 'Flying Height',
- '207': 'Spin High Current',
- '209': 'Offline Seek Performance',
- '220': 'Disk Shift',
- '221': 'G-Sense Error Rate',
- '222': 'Loaded Hours',
- '223': 'Load/Unload Retries',
- '224': 'Load Friction',
- '225': 'Load/Unload Cycles',
- '226': 'Load-in Time',
- '227': 'Torque Amplification Count',
- '228': 'Power-Off Retracts',
- '230': 'GMR Head Amplitude',
- '231': 'Temperature',
- '232': 'Available Reserved Space',
- '233': 'Media Wearout Indicator',
- '240': 'Head Flying Hours',
- '241': 'Total LBAs Written',
- '242': 'Total LBAs Read',
- '250': 'Read Error Retry Rate'
+ '1': 'Read Error Rate',
+ '2': 'Throughput Performance',
+ '3': 'Spin-Up Time',
+ '4': 'Start/Stop Count',
+ '5': 'Reallocated Sectors Count',
+ '6': 'Read Channel Margin',
+ '7': 'Seek Error Rate',
+ '8': 'Seek Time Performance',
+ '9': 'Power-On Hours Count',
+ '10': 'Spin-up Retries',
+ '11': 'Calibration Retries',
+ '12': 'Power Cycle Count',
+ '13': 'Soft Read Error Rate',
+ '100': 'Erase/Program Cycles',
+ '103': 'Translation Table Rebuild',
+ '108': 'Unknown (108)',
+ '170': 'Reserved Block Count',
+ '171': 'Program Fail Count',
+ '172': 'Erase Fail Count',
+ '173': 'Wear Leveller Worst Case Erase Count',
+ '174': 'Unexpected Power Loss',
+ '175': 'Program Fail Count',
+ '176': 'Erase Fail Count',
+ '177': 'Wear Leveling Count',
+ '178': 'Used Reserved Block Count',
+ '179': 'Used Reserved Block Count',
+ '180': 'Unused Reserved Block Count',
+ '181': 'Program Fail Count',
+ '182': 'Erase Fail Count',
+ '183': 'SATA Downshifts',
+ '184': 'End-to-End error',
+ '185': 'Head Stability',
+ '186': 'Induced Op-Vibration Detection',
+ '187': 'Reported Uncorrectable Errors',
+ '188': 'Command Timeout',
+ '189': 'High Fly Writes',
+ '190': 'Temperature',
+ '191': 'G-Sense Errors',
+ '192': 'Power-Off Retract Cycles',
+ '193': 'Load/Unload Cycles',
+ '194': 'Temperature',
+ '195': 'Hardware ECC Recovered',
+ '196': 'Reallocation Events',
+ '197': 'Current Pending Sectors',
+ '198': 'Off-line Uncorrectable',
+ '199': 'UDMA CRC Error Rate',
+ '200': 'Write Error Rate',
+ '201': 'Soft Read Errors',
+ '202': 'Data Address Mark Errors',
+ '203': 'Run Out Cancel',
+ '204': 'Soft ECC Corrections',
+ '205': 'Thermal Asperity Rate',
+ '206': 'Flying Height',
+ '207': 'Spin High Current',
+ '209': 'Offline Seek Performance',
+ '220': 'Disk Shift',
+ '221': 'G-Sense Error Rate',
+ '222': 'Loaded Hours',
+ '223': 'Load/Unload Retries',
+ '224': 'Load Friction',
+ '225': 'Load/Unload Cycles',
+ '226': 'Load-in Time',
+ '227': 'Torque Amplification Count',
+ '228': 'Power-Off Retracts',
+ '230': 'GMR Head Amplitude',
+ '231': 'Temperature',
+ '232': 'Available Reserved Space',
+ '233': 'Media Wearout Indicator',
+ '240': 'Head Flying Hours',
+ '241': 'Total LBAs Written',
+ '242': 'Total LBAs Read',
+ '250': 'Read Error Retry Rate'
}
+LIMIT = namedtuple('LIMIT', ['min', 'max'])
-class Disk:
- def __init__(self, name, path):
- self.name = name
+# Accepted raw-value range per SMART attribute id. SmartAttribute.raw
+# returns None for a listed attribute whose raw value is outside [min, max].
+LIMITS = {
+ '194': LIMIT(0, 200)
+}
+
+# Service.get_data() calls cleanup_and_rescan() once every RESCAN_INTERVAL runs.
+RESCAN_INTERVAL = 60
+
+# Matches one "attribute;normalized;raw" triplet of digits in a log line.
+# Raw string literals are used so that "\d" reaches the regex engine as
+# written instead of being an invalid Python string escape.
+REGEX = re.compile(
+    r'(\d+);'  # attribute
+    r'(\d+);'  # normalized value
+    r'(\d+)',  # raw value
+    re.X
+)
+
+
+def chart_template(attr, raw):
+    """
+    Build the chart definition for one SMART attribute.
+
+    :param attr: SMART attribute id as a string, e.g. '194'
+    :param raw: truthy to label the chart units 'raw', otherwise 'normalized'
+    :return: dict mapping the chart name ('attr_id' + attr) to its
+             'options' list and an empty 'lines' list for the caller to fill
+    """
+    chart_name = 'attr_id' + attr
+    # An id missing from SMART_ATTR gets a placeholder title (same wording as
+    # the table's own '108' entry) instead of raising KeyError.
+    title = SMART_ATTR.get(attr, 'Unknown ({0})'.format(attr))
+    units = 'raw' if raw else 'normalized'
+
+    return {
+        chart_name: {
+            'options': [None, title, units, title.lower(), 'smartd_log.' + chart_name, 'line'],
+            'lines': []
+        }
+    }
+
+
+def handle_os_error(method):
+    """
+    Decorator: call `method` and return None instead of raising OSError.
+
+    Any other exception propagates unchanged. Callers must therefore treat
+    None as "the call failed", distinct from a falsy result such as False or 0.
+
+    :param method: callable to wrap
+    :return: wrapper accepting the same positional and keyword arguments
+    """
+    # Imported locally so the block does not depend on the file's import list.
+    from functools import wraps
+
+    @wraps(method)  # keep the wrapped callable's name and docstring
+    def on_call(*args, **kwargs):
+        try:
+            return method(*args, **kwargs)
+        except OSError:
+            return None
+    return on_call
+
+
+class SmartAttribute(object):
+    """
+    One SMART attribute: its id, normalized value and raw value.
+    """
+    def __init__(self, idx, normalized, raw):
+        self.id = idx
+        self.normalized = normalized
+        self._raw = raw
+
+    @property
+    def raw(self):
+        """
+        Raw value, or None when the attribute has an entry in LIMITS and
+        the value converted to int lies outside that entry's [min, max].
+        """
+        bounds = LIMITS.get(self.id)
+        if bounds is None:
+            # no limits configured for this attribute: report as is
+            return self._raw
+        if bounds.min <= int(self._raw) <= bounds.max:
+            return self._raw
+        return None
+
+    @raw.setter
+    def raw(self, value):
+        self._raw = value
+
+
+class DiskLogFile(object):
+    """
+    A log file tracked by its path and the size seen on the last check.
+    """
+    def __init__(self, path):
         self.path = path
+        self.size = os.path.getsize(path)
+
+    @handle_os_error
+    def is_changed(self):
+        """
+        Compare the current file size with the remembered one and remember
+        the new size.
+
+        :return: truthy if the size changed and the file is not empty,
+                 falsy if not, None if the size could not be read (OSError)
+        """
+        new_size = os.path.getsize(self.path)
+        old_size, self.size = self.size, new_size
+
+        return new_size != old_size and new_size
+
+    @staticmethod
+    @handle_os_error
+    def is_valid(log_file, exclude):
+        """
+        Tell whether `log_file` is a usable log: a readable, non-empty
+        '.csv' file whose name contains none of the `exclude` patterns.
+
+        :param log_file: full path to the candidate file
+        :param exclude: iterable of substrings; a match excludes the file
+        :return: True or False, None if the file could not be inspected (OSError)
+        """
+        # Patterns are matched against the file name only, so a pattern that
+        # happens to occur in a directory component cannot exclude every disk.
+        file_name = os.path.basename(log_file)
+        # Cheap name checks come first and short-circuit the filesystem calls.
+        return bool(file_name.endswith('.csv')
+                    and not any(p in file_name for p in exclude)
+                    and os.access(log_file, os.R_OK)
+                    and os.path.getsize(log_file))
+
+
+class Disk(object):
+    """
+    A disk identified by its log file, holding the SMART attributes parsed
+    from the last line of that file.
+    """
+    def __init__(self, full_path, age):
+        self.log_file = DiskLogFile(full_path)
+        # The disk name is the third-from-last dot-separated part of the file
+        # name; names with fewer parts fall back to the first part instead of
+        # raising IndexError.
+        parts = os.path.basename(full_path).split('.')
+        self.name = parts[-3] if len(parts) >= 3 else parts[0]
+        self.age = int(age)
         self.status = True
+        self.attributes = dict()
+
+        self.get_attributes()
+
+    def __eq__(self, other):
+        # Disks compare by name; a disk also equals a plain name string.
+        if isinstance(other, Disk):
+            return self.name == other.name
+        return self.name == other
+
+    def __ne__(self, other):
+        # Python 2 does not derive != from __eq__.
+        return not self == other
+
+    def __hash__(self):
+        # Kept consistent with __eq__: equal disks (and their names) hash alike.
+        return hash(self.name)
+
+    @handle_os_error
+    def is_active(self):
+        """
+        :return: True if the log file was modified less than `age` minutes
+                 ago, False if not, None if its mtime could not be read
+        """
+        return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age
+
+    @handle_os_error
+    def get_attributes(self):
+        """
+        Re-read the last line of the log file and rebuild `attributes`
+        as {attribute id: SmartAttribute}.
+
+        :return: True on success, None if reading raised OSError
+        """
+        last_line = read_last_line(self.log_file.path)
+        self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
+                               in REGEX.findall(last_line))
+        return True
+
+    def data(self, raw=None):
+        """
+        :param raw: truthy to report raw values, otherwise normalized ones
+        :return: dict {'<disk name>_<attribute id>': value}; attributes whose
+                 selected value is None are left out
+        """
+        data = dict()
+        for attr in self.attributes.values():
+            value = attr.raw if raw else attr.normalized
+            if value is None:
+                continue
+            key = '_'.join([self.name, attr.id])
+            data[key] = value
+        return data
class Service(SimpleService):
def __init__(self, configuration=None, name=None):
SimpleService.__init__(self, configuration=configuration, name=name)
- self.regex = r_compile(r'(\d+);(\d+);(\d+)')
self.log_path = self.configuration.get('log_path', '/var/log/smartd')
- self.raw_values = self.configuration.get('raw_values')
- self.attr = self.configuration.get('smart_attributes', [])
- self.exclude_disks = self.configuration.get('exclude_disks', str()).split()
+ self.raw = self.configuration.get('raw_values', True)
+ self.exclude = self.configuration.get('exclude_disks', str()).split()
+ self.age = self.configuration.get('age', 60)
+
+ self.runs = 0
+ self.disks = list()
self.order = list()
self.definitions = dict()
- self.disks = list()
-
- for path_to_disk in find_disks_in_log_path(self.log_path):
- disk_name = os.path.basename(path_to_disk).split('.')[-3]
- for pattern in self.exclude_disks:
- if pattern in disk_name:
- break
- else:
- self.disks.append(Disk(name=disk_name, path=path_to_disk))
def check(self):
+ self.disks = self.scan()
+
if not self.disks:
- self.error('Can\'t locate any smartd log files in {0}'.format(self.log_path))
- return False
+ return None
+
+ user_defined_sa = self.configuration.get('smart_attributes')
+
+ if user_defined_sa:
+ order = user_defined_sa.split() or ORDER
+ else:
+ order = ORDER
+
+ self.create_charts(order)
- self.create_charts()
return True
def get_data(self):
+ self.runs += 1
+
+ if self.runs % RESCAN_INTERVAL == 0:
+ self.cleanup_and_rescan()
+
data = dict()
+
for disk in self.disks:
+
if not disk.status:
continue
- try:
- last_line = read_last_line(disk.path)
- except OSError:
+ changed = disk.log_file.is_changed()
+
+ # True = changed, False = unchanged, None = Exception
+ if changed is None:
disk.status = False
continue
- result = self.regex.findall(last_line)
- if not result:
- continue
- for a, n, r in result:
- data.update({'_'.join([disk.name, a]): r if self.raw_values else n})
+ if changed:
+ success = disk.get_attributes()
+ if not success:
+ disk.status = False
+ continue
+
+ data.update(disk.data(self.raw))
return data or None
- def create_charts(self):
+ def create_charts(self, order):
+ for attr in order:
+ chart_id = 'attr_id' + attr
+ chart = chart_template(attr, self.raw)
- def create_lines(attr_id):
- result = list()
for disk in self.disks:
- result.append(['_'.join([disk.name, attr_id]), disk.name, 'absolute'])
- return result
+ if attr not in disk.attributes:
+ self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
+ attr_id=attr))
+ continue
- try:
- order = [attr for attr in self.attr.split() if attr in SMART_ATTR.keys()] or ORDER
- except AttributeError:
- order = ORDER
+ if self.raw and disk.attributes[attr].raw is None:
+ self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
+ attr_id=attr,
+ limits=LIMITS[attr]))
+ continue
+ chart[chart_id]['lines'].append(['_'.join([disk.name, attr]), disk.name])
+
+ self.order.append(chart_id)
+ self.definitions.update(chart)
+
+    def scan(self, only_new=None):
+        """
+        Look through `log_path` for valid, recently updated disk log files.
+
+        :param only_new: truthy to return only disks not already in
+                         `self.disks` and to skip the "not updated" debug message
+        :return: list of Disk objects; empty if the directory cannot be listed
+        """
+        new_disks = list()
+        try:
+            file_names = os.listdir(self.log_path)
+        except OSError as error:
+            # Missing or unreadable directory: report "no disks" rather than crash.
+            self.error("Can't list {0}: {1}".format(self.log_path, error))
+            return new_disks
+
+        for f in file_names:
+            full_path = os.path.join(self.log_path, f)
+
+            if not DiskLogFile.is_valid(full_path, self.exclude):
+                continue
+
+            try:
+                disk = Disk(full_path, self.age)
+            except OSError:
+                # The file disappeared between validation and construction.
+                continue
+
+            active = disk.is_active()
+            if active is None:
+                continue
+
+            if not active:
+                if not only_new:
+                    self.debug("'{disk}' not updated in the last {age} minutes, "
+                               "skipping it.".format(disk=disk.name, age=self.age))
+                continue
+
+            if not only_new or disk not in self.disks:
+                new_disks.append(disk)
+        return new_disks
+
+    def cleanup_and_rescan(self):
+        """
+        Run cleanup(), then attach newly found disks to the existing charts.
+
+        A new disk is added to `self.disks` only if at least one chart
+        covers an attribute that the disk reports.
+        """
+        self.cleanup()
+
+        for disk in self.scan(only_new=True):
+            charted = False
+
+            for chart in self.charts:
+                # chart ids are 'attr_id' + attribute id; drop the 7-char prefix
+                attr_id = chart.id[7:]
+                if attr_id not in disk.attributes:
+                    continue
+
+                charted = True
+                dim = '_'.join((disk.name, attr_id))
+
+                if dim in chart:
+                    # dimension already exists; presumably reverse=True un-hides it
+                    chart.hide_dimension(dimension_id=dim, reverse=True)
+                else:
+                    chart.add_dimension([dim, disk.name])
+
+            if charted:
+                self.disks.append(disk)
+
+ def cleanup(self):
+ for disk in self.disks:
+
+ if not disk.is_active():
+ disk.status = False
+
+ if not disk.status:
+ for chart in self.charts:
+ dimension_id = '_'.join([disk.name, chart.id[7:]])
+ chart.hide_dimension(dimension_id=dimension_id)
- self.order = [''.join(['attr_id', i]) for i in order]
- units = 'raw' if self.raw_values else 'normalized'
-
- for k, v in dict([(k, v) for k, v in SMART_ATTR.items() if k in order]).items():
- self.definitions[''.join(['attr_id', k])] =\
- {'options': [None, v, units, v.lower(), 'smartd.attr_id' + k, 'line'],
- 'lines': create_lines(k)}
-
-
-def find_disks_in_log_path(log_path):
- if not os.path.isdir(log_path):
- raise StopIteration
- for f in os.listdir(log_path):
- f = os.path.join(log_path, f)
- if all([os.path.isfile(f),
- os.access(f, os.R_OK),
- f.endswith('.csv'),
- os.path.getsize(f)]):
- yield f
+ self.disks = [disk for disk in self.disks if disk.status]