diff options
Diffstat (limited to 'glances/plugins/alert/__init__.py')
-rw-r--r-- | glances/plugins/alert/__init__.py | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/glances/plugins/alert/__init__.py b/glances/plugins/alert/__init__.py index e69de29b..0d3908a9 100644 --- a/glances/plugins/alert/__init__.py +++ b/glances/plugins/alert/__init__.py @@ -0,0 +1,250 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Glances. +# +# SPDX-FileCopyrightText: 2023 Nicolas Hennion <nicolas@nicolargo.com> +# +# SPDX-License-Identifier: LGPL-3.0-only +# + +"""Alert plugin.""" + +from datetime import datetime +from time import tzname +import pytz + +from glances.events import glances_events +from glances.thresholds import glances_thresholds + +# from glances.logger import logger +from glances.plugins.plugin.model import GlancesPluginModel + +# Static decision tree for the global alert message +# - msg: Message to be displayed (result of the decision tree) +# - thresholds: a list of stats to take into account +# - thresholds_min: minimal value of the thresholds sum +# - 0: OK +# - 1: CAREFUL +# - 2: WARNING +# - 3: CRITICAL +tree = [ + {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0}, + {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2}, + {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2}, + {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2}, + { + 'msg': 'Large CPU stolen time. System running the hypervisor is too busy.', + 'thresholds': ['cpu_steal'], + 'thresholds_min': 2, + }, + {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2}, + {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2}, + {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2}, + {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2}, +] + +# TODO: change the algo to use the following decision tree +# Source: Inspire by https://scoutapm.com/blog/slow_server_flow_chart +# _yes means threshold >= 2 +# _no means threshold < 2 +# With threshold: +# - 0: OK +# - 1: CAREFUL +# - 2: WARNING +# - 3: CRITICAL +tree_new = { + 'cpu_iowait': { + '_yes': { + 'memswap': { + '_yes': { + 'mem': { + '_yes': { + # Once you've identified the offenders, the resolution will again + # depend on whether their memory usage seems business-as-usual or not. + # For example, a memory leak can be satisfactorily addressed by a one-time + # or periodic restart of the process. + # - if memory usage seems anomalous: kill the offending processes. + # - if memory usage seems business-as-usual: add RAM to the server, + # or split high-memory using services to other servers. + '_msg': "Memory issue" + }, + '_no': { + # ??? + '_msg': "Swap issue" + }, + } + }, + '_no': { + # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO. + # iotop is an awesome tool for identifying io offenders. Two things to note: + # unless you've already installed iotop, it's probably not already on your system. + # Recommendation: install it before you need it - - it's no fun trying to install a troubleshooting + # tool on an overloaded machine (iotop requires a Linux of 2.62 or above) + '_msg': "I/O issue" + }, + } + }, + '_no': { + 'cpu_total': { + '_yes': { + 'cpu_user': { + '_yes': { + # We expect the user-time percentage to be high. + # There's most likely a program or service you've configured on you server that's + # hogging CPU. + # Checking the % user time just confirms this. When you see that the % user-time is high, + # it's time to see what executable is monopolizing the CPU + # Once you've confirmed that the % usertime is high, check the process list(also provided + # by top). + # Be default, top sorts the process list by % CPU, so you can just look at the top process + # or processes. + # If there's a single process hogging the CPU in a way that seems abnormal, it's an + # anomalous situation + # that a service restart can fix. If there are are multiple processes taking up CPU + # resources, or it + # there's one process that takes lots of resources while otherwise functioning normally, + # than your setup + # may just be underpowered. You'll need to upgrade your server(add more cores), + # or split services out onto + # other boxes. In either case, you have a resolution: + # - if situation seems anomalous: kill the offending processes. + # - if situation seems typical given history: upgrade server or add more servers. + '_msg': "CPU issue with user process(es)" + }, + '_no': { + 'cpu_steal': { + '_yes': { + '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy." + }, + '_no': {'_msg': "CPU issue with system process(es)"}, + } + }, + } + }, + '_no': { + '_yes': { + # ??? + '_msg': "Memory issue" + }, + '_no': { + # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue. + # It's also possible that the slowness is being caused by another server in your cluster, or + # by an external service you rely on. + # start by checking important applications for uncharacteristic slowness(the DB is a good place + # to start), think through which parts of your infrastructure could be slowed down externally. + # For example, do you use an externally hosted email service that could slow down critical + # parts of your application ? + # If you suspect another server in your cluster, strace and lsof can provide information on + # what the process is doing or waiting on. Strace will show you which file descriptors are + # being read or written to (or being attempted to be read from) and lsof can give you a + # mapping of those file descriptors to network connections. + '_msg': "External issue" + }, + }, + } + }, + } +} + + +def global_message(): + """Parse the decision tree and return the message. + + Note: message corresponding to the current thresholds values + """ + # Compute the weight for each item in the tree + current_thresholds = glances_thresholds.get() + for i in tree: + i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds]) + themax = max(tree, key=lambda d: d['weight']) + if themax['weight'] >= themax['thresholds_min']: + # Check if the weight is > to the minimal threshold value + return themax['msg'] + else: + return tree[0]['msg'] + + +class PluginModel(GlancesPluginModel): + """Glances alert plugin. + + Only for display. + """ + + def __init__(self, args=None, config=None): + """Init the plugin.""" + super(PluginModel, self).__init__(args=args, config=config, stats_init_value=[]) + + # We want to display the stat in the curse interface + self.display_curse = True + + # Set the message position + self.align = 'bottom' + + # Set the maximum number of events to display + if config is not None and (config.has_section('alert') or config.has_section('alerts')): + glances_events.set_max_events(config.get_int_value('alert', 'max_events', default=10)) + + def update(self): + """Nothing to do here. Just return the global glances_log.""" + # Set the stats to the glances_events + self.stats = glances_events.get() + # Define the global message thanks to the current thresholds + # and the decision tree + # !!! Call directly in the msg_curse function + # global_message() + + def msg_curse(self, args=None, max_width=None): + """Return the dict to display in the curse interface.""" + # Init the return message + ret = [] + + # Only process if display plugin enable... + if not self.stats or self.is_disabled(): + return ret + + # Build the string message + # Header + ret.append(self.curse_add_line(global_message(), "TITLE")) + # Loop over alerts + for alert in self.stats: + # New line + ret.append(self.curse_new_line()) + # Start + msg = str(datetime.fromtimestamp(alert[0], tz=pytz.timezone(tzname[0] if tzname[0] else 'UTC'))) + ret.append(self.curse_add_line(msg)) + # Duration + if alert[1] > 0: + # If finished display duration + msg = ' ({})'.format(datetime.fromtimestamp(alert[1]) - datetime.fromtimestamp(alert[0])) + else: + msg = ' (ongoing)' + ret.append(self.curse_add_line(msg)) + ret.append(self.curse_add_line(" - ")) + # Infos + if alert[1] > 0: + # If finished do not display status + msg = '{} on {}'.format(alert[2], alert[3]) + ret.append(self.curse_add_line(msg)) + else: + msg = str(alert[3]) + ret.append(self.curse_add_line(msg, decoration=alert[2])) + # Min / Mean / Max + if self.approx_equal(alert[6], alert[4], tolerance=0.1): + msg = ' ({:.1f})'.format(alert[5]) + else: + msg = ' (Min:{:.1f} Mean:{:.1f} Max:{:.1f})'.format(alert[6], alert[5], alert[4]) + ret.append(self.curse_add_line(msg)) + # Top processes + top_process = ', '.join([p['name'] for p in alert[9]]) + if top_process != '': + msg = ': {}'.format(top_process) + ret.append(self.curse_add_line(msg)) + + return ret + + def approx_equal(self, a, b, tolerance=0.0): + """Compare a with b using the tolerance (if numerical).""" + if str(int(a)).isdigit() and str(int(b)).isdigit(): + return abs(a - b) <= max(abs(a), abs(b)) * tolerance + else: + return a == b |