summaryrefslogtreecommitdiffstats
path: root/glances/plugins/alert/model.py
diff options
context:
space:
mode:
Diffstat (limited to 'glances/plugins/alert/model.py')
-rw-r--r--glances/plugins/alert/model.py251
1 files changed, 0 insertions, 251 deletions
diff --git a/glances/plugins/alert/model.py b/glances/plugins/alert/model.py
deleted file mode 100644
index 950acec5..00000000
--- a/glances/plugins/alert/model.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# This file is part of Glances.
-#
-# SPDX-FileCopyrightText: 2023 Nicolas Hennion <nicolas@nicolargo.com>
-#
-# SPDX-License-Identifier: LGPL-3.0-only
-#
-
-"""Alert plugin."""
-
-from datetime import datetime
-
-from glances.logger import logger
-from glances.events import glances_events
-from glances.thresholds import glances_thresholds
-
-# from glances.logger import logger
-from glances.plugins.plugin.model import GlancesPluginModel
-
# Flat decision table used to pick the global alert message.
# Every entry carries:
# - msg: the message to display when the entry is selected
# - thresholds: the names of the stats taken into account
# - thresholds_min: the minimal value the sum of those thresholds must reach
# Threshold levels:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree = [
    # Default entry: no stat involved, always reachable
    dict(msg='No warning or critical alert detected', thresholds=[], thresholds_min=0),
    # CPU related entries
    dict(msg='High CPU user mode', thresholds=['cpu_user'], thresholds_min=2),
    dict(msg='High CPU kernel usage', thresholds=['cpu_system'], thresholds_min=2),
    dict(msg='High CPU I/O waiting', thresholds=['cpu_iowait'], thresholds_min=2),
    dict(
        msg='Large CPU stolen time. System running the hypervisor is too busy.',
        thresholds=['cpu_steal'],
        thresholds_min=2,
    ),
    dict(msg='High CPU niced value', thresholds=['cpu_niced'], thresholds_min=2),
    # Load and memory related entries
    dict(msg='System overloaded in the last 5 minutes', thresholds=['load'], thresholds_min=2),
    dict(msg='High swap (paging) usage', thresholds=['memswap'], thresholds_min=2),
    dict(msg='High memory consumption', thresholds=['mem'], thresholds_min=2),
]
-
# TODO: change the algorithm to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# Each stat name maps to two branches:
# - "_yes": taken when the stat threshold is >= 2
# - "_no": taken when the stat threshold is < 2
# A leaf is a dict holding the final message under the "_msg" key.
# Threshold levels:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
    "cpu_iowait": {
        "_yes": {
            "memswap": {
                "_yes": {
                    "mem": {
                        # Once the offenders are identified, the resolution depends on
                        # whether their memory usage looks business-as-usual or not.
                        # For example, a memory leak can be satisfactorily addressed by
                        # a one-time or periodic restart of the process.
                        # - if memory usage seems anomalous: kill the offending processes.
                        # - if memory usage seems business-as-usual: add RAM to the server,
                        #   or split the high-memory services out to other servers.
                        "_yes": {"_msg": "Memory issue"},
                        # ???
                        "_no": {"_msg": "Swap issue"},
                    }
                },
                # Low swap means there is a "real" I/O wait problem. The next step is to
                # find out what is hogging the I/O.
                # iotop is an awesome tool for identifying I/O offenders. Note that unless
                # it was installed beforehand, it is probably not on the system yet.
                # Recommendation: install it before you need it; it is no fun trying to
                # install a troubleshooting tool on an overloaded machine
                # (iotop requires a Linux of 2.62 or above).
                "_no": {"_msg": "I/O issue"},
            }
        },
        "_no": {
            "cpu_total": {
                "_yes": {
                    "cpu_user": {
                        # The user-time percentage is expected to be high here: most likely
                        # a program or service configured on your server is hogging the CPU.
                        # Checking the % user time just confirms this. Once it is confirmed
                        # to be high, look at the process list (also provided by top) to see
                        # which executable is monopolizing the CPU.
                        # By default, top sorts the process list by % CPU, so just look at
                        # the top process or processes.
                        # If a single process hogs the CPU in a way that seems abnormal, it
                        # is an anomalous situation that a service restart can fix.
                        # If multiple processes take up CPU resources, or if one process
                        # takes lots of resources while otherwise functioning normally,
                        # then the setup may just be underpowered: upgrade the server
                        # (add more cores) or split services out onto other boxes.
                        # In either case, there is a resolution:
                        # - if the situation seems anomalous: kill the offending processes.
                        # - if the situation seems typical given history: upgrade the server
                        #   or add more servers.
                        "_yes": {"_msg": "CPU issue with user process(es)"},
                        "_no": {
                            "cpu_steal": {
                                "_yes": {
                                    "_msg": "CPU issue with stolen time. System running the hypervisor may be too busy."
                                },
                                "_no": {"_msg": "CPU issue with system process(es)"},
                            }
                        },
                    }
                },
                "_no": {
                    # ???
                    "_yes": {"_msg": "Memory issue"},
                    # The slowness is not due to CPU or I/O problems, so it is likely an
                    # application-specific issue. It may also be caused by another server
                    # in the cluster, or by an external service you rely on.
                    # Start by checking important applications for uncharacteristic
                    # slowness (the DB is a good place to start), then think through which
                    # parts of the infrastructure could be slowed down externally.
                    # For example, do you use an externally hosted email service that
                    # could slow down critical parts of your application?
                    # If another server in the cluster is suspected, strace and lsof can
                    # provide information on what the process is doing or waiting on.
                    # strace shows which file descriptors are being read or written to
                    # (or being attempted to be read from) and lsof gives a mapping of
                    # those file descriptors to network connections.
                    "_no": {"_msg": "External issue"},
                },
            }
        },
    }
}
-
-
def global_message():
    """Walk the flat decision table and return the global alert message.

    The weight of each `tree` entry is the sum of the `value()` of its
    thresholds that are present in the mapping returned by
    `glances_thresholds.get()`. The weight is stored in the entry itself
    under the 'weight' key.

    Return the message of the heaviest entry when its weight reaches the
    entry 'thresholds_min', else the message of the first entry.
    """
    levels = glances_thresholds.get()

    # Refresh the weight of every entry (stats without a current threshold are ignored)
    for entry in tree:
        entry['weight'] = sum(levels[stat].value() for stat in entry['thresholds'] if stat in levels)

    # In case of a tie, max() keeps the first entry of the table
    heaviest = max(tree, key=lambda entry: entry['weight'])

    if heaviest['weight'] < heaviest['thresholds_min']:
        # Not heavy enough: fall back to the default message
        return tree[0]['msg']
    return heaviest['msg']
-
-
class PluginModel(GlancesPluginModel):
    """Glances alert plugin.

    Only for display: the stats are the events returned by
    `glances_events.get()`; the plugin renders them for the curses
    interface.
    """

    def __init__(self, args=None, config=None):
        """Init the plugin.

        :param args: forwarded as-is to the parent constructor
        :param config: forwarded to the parent constructor; when it is not
            None and has an 'alert' or 'alerts' section, it is also used to
            set the maximum number of events
        """
        super(PluginModel, self).__init__(args=args, config=config, stats_init_value=[])

        # We want to display the stat in the curse interface
        self.display_curse = True

        # Set the message position
        self.align = 'bottom'

        # Set the maximum number of events to display
        # NOTE(review): the guard accepts an 'alerts' section but the value is
        # always read from 'alert' - confirm this is the intended behavior.
        if config is not None and (config.has_section('alert') or config.has_section('alerts')):
            glances_events.set_max_events(config.get_int_value('alert', 'max_events'))

    def update(self):
        """Nothing to compute here: just expose the global glances_events."""
        # Set the stats to the glances_events
        self.stats = glances_events.get()
        # The global message (current thresholds + decision tree) is not
        # computed here: global_message() is called directly in msg_curse.

    def msg_curse(self, args=None, max_width=None):
        """Return the list of lines to display in the curse interface.

        Each alert of self.stats is read by index:
        - alert[0]: begin timestamp
        - alert[1]: end timestamp (a value > 0 means the alert is finished)
        - alert[2]: used as text for a finished alert, as decoration otherwise
        - alert[3]: displayed as the subject of the alert
        - alert[4], alert[5], alert[6]: max, mean and min values
        - alert[9]: iterable of dicts whose 'name' is listed as top processes

        :param args: not used by this method
        :param max_width: not used by this method
        :return: a list built with curse_add_line / curse_new_line (empty if
            there is no stat or if the plugin is disabled)
        """
        # Init the return message
        ret = []

        # Only process if display plugin enable...
        if not self.stats or self.is_disabled():
            return ret

        # Build the string message
        # Header
        ret.append(self.curse_add_line(global_message(), "TITLE"))
        # Loop over alerts
        for alert in self.stats:
            finished = alert[1] > 0
            # New line
            ret.append(self.curse_new_line())
            # Start
            msg = str(datetime.fromtimestamp(alert[0]))
            ret.append(self.curse_add_line(msg))
            # Duration
            if finished:
                # If finished display duration
                msg = ' ({})'.format(datetime.fromtimestamp(alert[1]) - datetime.fromtimestamp(alert[0]))
            else:
                msg = ' (ongoing)'
            ret.append(self.curse_add_line(msg))
            ret.append(self.curse_add_line(" - "))
            # Infos
            if finished:
                # If finished do not display status
                msg = '{} on {}'.format(alert[2], alert[3])
                ret.append(self.curse_add_line(msg))
            else:
                msg = str(alert[3])
                ret.append(self.curse_add_line(msg, decoration=alert[2]))
            # Min / Mean / Max (a single value is displayed when min ~= max)
            if self.approx_equal(alert[6], alert[4], tolerance=0.1):
                msg = ' ({:.1f})'.format(alert[5])
            else:
                msg = ' (Min:{:.1f} Mean:{:.1f} Max:{:.1f})'.format(alert[6], alert[5], alert[4])
            ret.append(self.curse_add_line(msg))
            # Top processes
            top_process = ', '.join([p['name'] for p in alert[9]])
            if top_process != '':
                msg = ': {}'.format(top_process)
                ret.append(self.curse_add_line(msg))

        return ret

    def approx_equal(self, a, b, tolerance=0.0):
        """Compare a with b using the tolerance (if numerical).

        When a and b support the arithmetic comparison, return True if their
        absolute difference is lower than or equal to `tolerance` times the
        largest absolute value of the two. Otherwise (None, strings, ...)
        fall back to a plain equality test.

        :param a: first value
        :param b: second value
        :param tolerance: relative tolerance (0.0 means strict equality)
        :return: True if a and b are considered equal
        """
        try:
            # Works for any sign; the former str(int(x)).isdigit() probe raised
            # on non-numerical input and skipped the tolerance for negatives.
            return abs(a - b) <= max(abs(a), abs(b)) * tolerance
        except TypeError:
            # Non-numerical operands: no tolerance can apply
            return a == b