summaryrefslogtreecommitdiffstats
path: root/glances/events.py
diff options
context:
space:
mode:
Diffstat (limited to 'glances/events.py')
-rw-r--r--glances/events.py410
1 files changed, 0 insertions, 410 deletions
diff --git a/glances/events.py b/glances/events.py
deleted file mode 100644
index 6658c842..00000000
--- a/glances/events.py
+++ /dev/null
@@ -1,410 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# This file is part of Glances.
-#
-# SPDX-FileCopyrightText: 2022 Nicolas Hennion <nicolas@nicolargo.com>
-#
-# SPDX-License-Identifier: LGPL-3.0-only
-#
-
-"""Manage Glances events (previously Glances logs in Glances < 3.1)."""
-
-import time
-from datetime import datetime
-
-from glances.logger import logger
-from glances.processes import glances_processes, sort_stats
-from glances.thresholds import glances_thresholds
-
# Static decision table used to build the global alert message.
# Every entry provides:
# - msg: message to display when the entry is selected
# - thresholds: names of the stats taken into account for the entry
# - thresholds_min: minimal value the sum of these thresholds must reach
# Threshold levels:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
# The first entry is the fallback message (no stat, minimal value of 0).
tree = [
    dict(msg='No warning or critical alert detected', thresholds=[], thresholds_min=0),
    dict(msg='High CPU user mode', thresholds=['cpu_user'], thresholds_min=2),
    dict(msg='High CPU kernel usage', thresholds=['cpu_system'], thresholds_min=2),
    dict(msg='High CPU I/O waiting', thresholds=['cpu_iowait'], thresholds_min=2),
    dict(
        msg='Large CPU stolen time. System running the hypervisor is too busy.',
        thresholds=['cpu_steal'],
        thresholds_min=2,
    ),
    dict(msg='High CPU niced value', thresholds=['cpu_niced'], thresholds_min=2),
    dict(msg='System overloaded in the last 5 minutes', thresholds=['load'], thresholds_min=2),
    dict(msg='High swap (paging) usage', thresholds=['memswap'], thresholds_min=2),
    dict(msg='High memory consumption', thresholds=['mem'], thresholds_min=2),
]
-
# TODO: change the algorithm to use the following decision tree
# Source: inspired by https://scoutapm.com/blog/slow_server_flow_chart
# A stat name maps to two branches:
# - _yes: the stat threshold is >= 2
# - _no: the stat threshold is < 2
# A leaf stores the resulting message under the _msg key.
# Threshold levels:
# - 0: OK
# - 1: CAREFUL
# - 2: WARNING
# - 3: CRITICAL
tree_new = {
    'cpu_iowait': {
        '_yes': {
            'memswap': {
                '_yes': {
                    'mem': {
                        # Once the offenders are identified, the resolution depends on
                        # whether their memory usage looks business-as-usual or not.
                        # For example, a memory leak can be satisfactorily addressed by
                        # a one-time or periodic restart of the process.
                        # - if memory usage seems anomalous: kill the offending processes.
                        # - if memory usage seems business-as-usual: add RAM to the server,
                        #   or split high-memory services onto other servers.
                        '_yes': {'_msg': 'Memory issue'},
                        # ???
                        '_no': {'_msg': 'Swap issue'},
                    }
                },
                # Low swap means a "real" I/O wait problem. The next step is to find
                # what is hogging the I/O. iotop is a good tool to identify I/O offenders,
                # but it is probably not installed by default.
                # Recommendation: install it before you need it, it is no fun trying to
                # install a troubleshooting tool on an overloaded machine
                # (iotop requires a recent Linux kernel: 2.6.20 or above - TODO confirm).
                '_no': {'_msg': 'I/O issue'},
            }
        },
        '_no': {
            'cpu_total': {
                '_yes': {
                    'cpu_user': {
                        # The user-time percentage is expected to be high: most likely a
                        # program or service configured on the server is hogging the CPU,
                        # and checking the % user time just confirms it.
                        # Once the high % user time is confirmed, check the process list
                        # (also provided by top). By default, top sorts the process list
                        # by % CPU, so just look at the top process or processes.
                        # If a single process hogs the CPU in a way that seems abnormal,
                        # it is an anomalous situation that a service restart can fix.
                        # If multiple processes take up CPU resources, or if one process
                        # takes lots of resources while otherwise functioning normally,
                        # then the setup may just be underpowered: upgrade the server
                        # (add more cores) or split services out onto other boxes.
                        # In either case, there is a resolution:
                        # - if the situation seems anomalous: kill the offending processes.
                        # - if the situation seems typical given history: upgrade the server
                        #   or add more servers.
                        '_yes': {'_msg': 'CPU issue with user process(es)'},
                        '_no': {
                            'cpu_steal': {
                                '_yes': {
                                    '_msg': 'CPU issue with stolen time. '
                                            'System running the hypervisor may be too busy.'
                                },
                                '_no': {'_msg': 'CPU issue with system process(es)'},
                            }
                        },
                    }
                },
                '_no': {
                    # NOTE(review): unlike the other levels, no stat name selects between
                    # the two branches below - confirm which stat is intended before
                    # using this tree.
                    # ???
                    '_yes': {'_msg': 'Memory issue'},
                    # The slowness is not due to CPU or I/O problems, so it is likely an
                    # application-specific issue. It is also possible that the slowness is
                    # caused by another server in the cluster, or by an external service.
                    # Start by checking important applications for uncharacteristic
                    # slowness (the DB is a good place to start), and think through which
                    # parts of the infrastructure could be slowed down externally.
                    # For example, is an externally hosted email service slowing down
                    # critical parts of the application?
                    # If another server of the cluster is suspected, strace and lsof can
                    # provide information on what the process is doing or waiting on.
                    # strace shows which file descriptors are being read or written to
                    # (or attempted to be read from) and lsof maps those file descriptors
                    # to network connections.
                    '_no': {'_msg': 'External issue'},
                },
            }
        },
    }
}
-
-
def build_global_message():
    """Return the global alert message matching the current thresholds.

    Every entry of the module-level ``tree`` is weighted with the sum of the
    current values of the thresholds it lists (names missing from the current
    thresholds are skipped). The weight is stored in the entry itself under
    the 'weight' key.

    The message of the heaviest entry is returned when its weight reaches the
    entry 'thresholds_min'; otherwise the message of the first entry of the
    tree is returned.
    """
    levels = glances_thresholds.get()

    # Weight each entry of the decision tree
    for entry in tree:
        weight = 0
        for stat in entry['thresholds']:
            if stat in levels:
                weight += levels[stat].value()
        entry['weight'] = weight

    # On equal weights, max() keeps the first entry of the tree
    heaviest = max(tree, key=lambda entry: entry['weight'])
    if heaviest['weight'] >= heaviest['thresholds_min']:
        return heaviest['msg']
    return tree[0]['msg']
-
-
class GlancesEvents(object):

    """Manage the events raised inside the Glances software.

    Events are stored in the self.events_list list, most recent first.
        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU*|LOAD|MEM|MON"
        event_value = value

    An item (or event) of the list is defined by:
        {
            "begin": "begin",
            "end": "end",
            "state": "WARNING|CRITICAL",
            "type": "CPU|LOAD|MEM",
            "max": MAX,
            "avg": AVG,
            "min": MIN,
            "sum": SUM,
            "count": COUNT,
            "top": [top 3 process name],
            "desc": "Processes description",
            "sort": "top sort key",
            "global": "global alert message"
        }

    "begin" and "end" are Epoch timestamps; "end" is -1 while the event is ongoing.
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be >= min_duration to be kept when closed (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the raw events list (the internal list itself, not a copy)."""
        return self.events_list

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the position of the first matching event in the events list.

        An event is matching if:
            its type is event_type
            and (its end is < 0 or event_time - end < min_interval)
        Return -1 if no event is matching.
        """
        for index, event in enumerate(self.events_list):
            if event['type'] != event_type:
                continue
            if event['end'] < 0 or event_time - event['end'] < self.min_interval:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key associated with the alert type."""
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort key
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc="", min_duration=None):
        """Add a new item to the events list or update the matching one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (default: the current Glances processes list)
        proc_desc = processes description
        min_duration = accepted but not used by this method: the min_duration
                       of the instance is the one applied when an event is closed

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the list.
        When finished, if the event duration is < min_duration then the event is removed.

        Return the number of events in the list.
        """
        # Current time (Epoch), truncated to the second
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value,
                               proc_desc, global_message)
        else:
            # Event exists, update it
            self._update_event(event_time, event_index, event_state, event_type, event_value,
                               proc_list, proc_desc, global_message)

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value,
                      proc_desc, global_message):
        """Add a new item at the beginning of the events list.

        The item is added only if the criticality (event_state) is WARNING or CRITICAL.
        Return True if the item has been added, False otherwise.
        """
        if event_state not in ("WARNING", "CRITICAL"):
            return False

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new item
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        item = {
            "begin": event_time,
            "end": -1,
            "state": event_state,
            "type": event_type,
            "max": event_value,
            "avg": event_value,
            "min": event_value,
            "sum": event_value,
            "count": 1,
            "top": [],
            "desc": proc_desc,
            "sort": glances_processes.sort_key,
            "global": global_message,
        }

        # Add the item to the list
        self.events_list.insert(0, item)

        # Limit the list to 'max_events' items (the last item of the list is dropped)
        if self.len() > self.max_events:
            self.events_list.pop()
        return True

    def _update_event(self, event_time, event_index, event_state, event_type, event_value,
                      proc_list, proc_desc, global_message):
        """Update the event stored at event_index in the list.

        OK or CAREFUL state: close the event if it is ongoing, else do nothing.
        WARNING or CRITICAL state: reopen the event and refresh its statistics.
        Always return True.
        """
        event = self.events_list[event_index]

        if event_state in ('OK', 'CAREFUL'):
            if event['end'] < 0:
                # Close the ongoing event
                self._close_event(event_time, event_index)
            # Else the event is already closed, nothing to do
            return True

        # event_state == "WARNING" or event_state == "CRITICAL"
        # Set process sort key
        self.set_process_sort(event_type)

        # It's an ongoing event, set the end time to -1
        event['end'] = -1

        # Min/Max/Sum/Count/Average value
        event['min'] = min(event['min'], event_value)
        event['max'] = max(event['max'], event_value)
        event['sum'] += event_value
        event['count'] += 1
        event['avg'] = event['sum'] / event['count']

        if event_state == "CRITICAL":
            # The state is only raised to CRITICAL here: an event which has
            # reached the CRITICAL state can't go back to WARNING
            event['state'] = event_state

        # Sort the current process list to retrieve the TOP 3 processes
        events_sort_key = self.get_event_sort_key(event_type)
        event['top'] = [p['name'] for p in sort_stats(proc_list, events_sort_key)[0:3]]
        event['sort'] = events_sort_key

        # Monitored processes description
        event['desc'] = proc_desc

        # Global message
        event['global'] = global_message

        return True

    def _close_event(self, event_time, event_index):
        """Close the event stored at event_index in the list.

        The event is kept (with its end time set) if it lasted at least
        min_duration seconds, else it is removed from the list.
        """
        # Reset the automatic process sort key
        self.reset_process_sort()

        event = self.events_list[event_index]
        if event_time - event['begin'] >= self.min_duration:
            # Event lasted >= min_duration seconds: set its end time
            event['end'] = event_time
        else:
            # Event lasted < min_duration seconds: ignore it.
            # Delete by position: list.remove() would drop the first item
            # *equal* to this one, which may be another entry of the list.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        Ongoing items (end < 0) are always kept.
        By default, only finished WARNING items are deleted.
        If critical = True, finished CRITICAL items are deleted as well.

        Return the number of remaining events.
        """
        self.events_list = [
            item
            for item in self.events_list
            if item['end'] < 0 or (not critical and item['state'].startswith("CRITICAL"))
        ]
        return self.len()


# Module-level events manager, created with the default limits
glances_events = GlancesEvents()