diff options
author | nicolargo <nicolas@nicolargo.com> | 2024-03-17 17:07:26 +0100 |
---|---|---|
committer | nicolargo <nicolas@nicolargo.com> | 2024-03-17 17:07:26 +0100 |
commit | 40f06b9570f6851a14cdfea8b93c9b6cffdfc2de (patch) | |
tree | aa4c5a3b0f8dfd930168e55c2d7e2a11957511c5 | |
parent | 77167b8bce794db9383e748cfa0eee8577687cf6 (diff) |
Event is now a Dataclass
-rw-r--r-- | glances/event.py | 89 | ||||
-rw-r--r-- | glances/events_list.py | 362 |
2 files changed, 451 insertions, 0 deletions
@dataclass
class GlancesEvent:
    """Data class describing one Glances event (alert).

    Fields:
        begin: start time of the event.
        state: event state, e.g. "WARNING" or "CRITICAL".
        type: event type, e.g. "CPU", "LOAD" or "MEM".
        min, max, sum, count, avg: running statistics of the observed values.
        top: names of the top processes (at most 3), refreshed on CRITICAL.
        desc: processes description.
        sort: key used to sort the top processes.
        global_msg: global alert message.
        end: end time of the event, -1 while the event is ongoing.
    """

    begin: int
    state: str
    type: str
    min: float
    max: float
    sum: float
    count: int
    avg: float
    top: list[str]
    desc: str
    sort: str
    global_msg: str
    end: int = -1  # -1 is the "still ongoing" sentinel

    def is_ongoing(self):
        """Return True if the event is ongoing (no end time recorded)."""
        return self.end == -1

    def is_finished(self):
        """Return True if the event is finished (an end time is recorded)."""
        return not self.is_ongoing()

    def update(self, state: str, value: float,
               sort_key: str = None, proc_list: list = None,
               proc_desc: str = None, global_msg: str = None):
        """Update an ongoing event with a new sample.

        state: state reported with the new sample.
        value: new sample, folded into min/max/sum/count/avg.
        sort_key: key used to sort the processes (stored on CRITICAL only).
        proc_list: list of process dicts (each with a 'name' key); when it is
            None the 'top' field is left untouched.
        proc_desc: processes description (overwrites 'desc').
        global_msg: global alert message (overwrites 'global_msg').
        """
        # A new sample means the event is (again) ongoing
        self.end = -1

        # Running statistics
        self.min = min(self.min, value)
        self.max = max(self.max, value)
        self.sum += value
        self.count += 1
        self.avg = self.sum / self.count

        if state == "CRITICAL":
            # Only escalate: an event which has reached the CRITICAL state
            # never goes back to WARNING
            self.state = state
            # TOP PROCESS LIST (only for CRITICAL ALERT)
            self.sort = sort_key
            if proc_list is not None:
                # Without a process list there is nothing to sort, and passing
                # None to sort_stats would fail when slicing the result
                self.top = [p['name'] for p in sort_stats(proc_list, sort_key)[0:3]]

        # MONITORED PROCESSES DESC
        self.desc = proc_desc

        # Global message
        self.global_msg = global_msg
System running the hypervisor is too busy.', + 'thresholds': ['cpu_steal'], + 'thresholds_min': 2, + }, + {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2}, + {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2}, + {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2}, + {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2}, +] + +# TODO: change the algo to use the following decision tree +# Source: Inspire by https://scoutapm.com/blog/slow_server_flow_chart +# _yes means threshold >= 2 +# _no means threshold < 2 +# With threshold: +# - 0: OK +# - 1: CAREFUL +# - 2: WARNING +# - 3: CRITICAL +tree_new = { + 'cpu_iowait': { + '_yes': { + 'memswap': { + '_yes': { + 'mem': { + '_yes': { + # Once you've identified the offenders, the resolution will again + # depend on whether their memory usage seems business-as-usual or not. + # For example, a memory leak can be satisfactorily addressed by a one-time + # or periodic restart of the process. + # - if memory usage seems anomalous: kill the offending processes. + # - if memory usage seems business-as-usual: add RAM to the server, + # or split high-memory using services to other servers. + '_msg': "Memory issue" + }, + '_no': { + # ??? + '_msg': "Swap issue" + }, + } + }, + '_no': { + # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO. + # iotop is an awesome tool for identifying io offenders. Two things to note: + # unless you've already installed iotop, it's probably not already on your system. + # Recommendation: install it before you need it - - it's no fun trying to install a troubleshooting + # tool on an overloaded machine (iotop requires a Linux of 2.62 or above) + '_msg': "I/O issue" + }, + } + }, + '_no': { + 'cpu_total': { + '_yes': { + 'cpu_user': { + '_yes': { + # We expect the user-time percentage to be high. 
+ # There's most likely a program or service you've configured on your server that's + # hogging CPU. + # Checking the % user time just confirms this. When you see that the % user-time is high, + # it's time to see what executable is monopolizing the CPU + # Once you've confirmed that the % usertime is high, check the process list(also provided + # by top). + # By default, top sorts the process list by % CPU, so you can just look at the top process + # or processes. + # If there's a single process hogging the CPU in a way that seems abnormal, it's an + # anomalous situation + # that a service restart can fix. If there are multiple processes taking up CPU + # resources, or if + # there's one process that takes lots of resources while otherwise functioning normally, + # then your setup + # may just be underpowered. You'll need to upgrade your server(add more cores), + # or split services out onto + # other boxes. In either case, you have a resolution: + # - if situation seems anomalous: kill the offending processes. + # - if situation seems typical given history: upgrade server or add more servers. + '_msg': "CPU issue with user process(es)" + }, + '_no': { + 'cpu_steal': { + '_yes': { + '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy." + }, + '_no': {'_msg': "CPU issue with system process(es)"}, + } + }, + } + }, + '_no': { + '_yes': { + # ??? + '_msg': "Memory issue" + }, + '_no': { + # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue. + # It's also possible that the slowness is being caused by another server in your cluster, or + # by an external service you rely on. + # Start by checking important applications for uncharacteristic slowness(the DB is a good place + # to start), think through which parts of your infrastructure could be slowed down externally. + # For example, do you use an externally hosted email service that could slow down critical + # parts of your application ? 
def build_global_message():
    """Parse the decision tree and return the global alert message.

    Each entry of the module-level 'tree' gets a 'weight' key: the sum of the
    current values of the thresholds it lists (thresholds which are not
    currently known are ignored). The message of the heaviest entry is
    returned if its weight reaches its 'thresholds_min', else the message of
    the first entry of the tree is returned.
    """
    # Compute the weight for each item in the tree
    current_thresholds = glances_thresholds.get()
    for item in tree:
        item['weight'] = sum(
            current_thresholds[t].value() for t in item['thresholds'] if t in current_thresholds
        )
    themax = max(tree, key=lambda d: d['weight'])
    if themax['weight'] >= themax['thresholds_min']:
        # The weight reaches the minimal threshold value
        return themax['msg']
    return tree[0]['msg']


class GlancesEventsList(object):

    """This class manages events inside the Glances software.
    GlancesEventsList is a list of GlancesEvent.
    GlancesEvent is defined in the event.py file
    """

    def __init__(self, max_events=10, min_duration=6, min_interval=6):
        """Init the events class.

        max_events: maximum size of the events list
        min_duration: events duration should be >= min_duration to be kept when closed (in seconds)
        min_interval: minimal interval between same kind of alert (in seconds)
        """
        # Maximum size of the events list
        self.set_max_events(max_events)

        # Minimal event duration time (in seconds)
        self.set_min_duration(min_duration)

        # Minimal interval between same kind of alert (in seconds)
        self.set_min_interval(min_interval)

        # Init the events list (most recent event first)
        self.events_list = []

    def set_max_events(self, max_events):
        """Set the maximum size of the events list."""
        self.max_events = max_events

    def set_min_duration(self, min_duration):
        """Set the minimal event duration time (in seconds)."""
        self.min_duration = min_duration

    def set_min_interval(self, min_interval):
        """Set the minimum interval between same kind of alert (in seconds)."""
        self.min_interval = min_interval

    def get(self):
        """Return the RAW events list (each event dumped through a RootModel)."""
        return [RootModel[GlancesEvent](e).model_dump() for e in self.events_list]

    def len(self):
        """Return the number of events in the events list."""
        return len(self.events_list)

    def __event_exist(self, event_time, event_type):
        """Return the event position in the events list if:
        type is matching
        and (event is ongoing or event_time - end < min_interval)
        Return -1 if the item is not found.
        """
        for index, event in enumerate(self.events_list):
            if event.type != event_type:
                continue
            if event.is_ongoing() or event_time - event.end < self.min_interval:
                return index
        return -1

    def get_event_sort_key(self, event_type):
        """Return the process sort key matching the event type."""
        # Process sort depending on alert type
        if event_type.startswith("MEM"):
            # Sort TOP process by memory_percent
            return 'memory_percent'
        if event_type.startswith("CPU_IOWAIT"):
            # Sort TOP process by io_counters (only for Linux OS)
            return 'io_counters'
        # Default sort is...
        return 'cpu_percent'

    def set_process_sort(self, event_type):
        """Define the process auto sort key from the alert type."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key(self.get_event_sort_key(event_type))

    def reset_process_sort(self):
        """Reset the process auto sort key."""
        if glances_processes.auto_sort:
            glances_processes.set_sort_key('auto')

    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc=""):
        """Add a new item to the events list, or update the matching one.

        event_state = "OK|CAREFUL|WARNING|CRITICAL"
        event_type = "CPU|LOAD|MEM|..."
        event_value = value
        proc_list = list of processes (default: the current Glances processes list)
        proc_desc = processes description

        If 'event' is a 'new one', add it at the beginning of the list.
        If 'event' is not a 'new one', update the existing item.
        When an event is closed, it is dropped if its duration < min_duration.

        Return the number of events in the list.
        """
        event_time = time.mktime(datetime.now().timetuple())
        global_message = build_global_message()
        proc_list = proc_list or glances_processes.get_list()

        # Add or update the event
        event_index = self.__event_exist(event_time, event_type)
        if event_index < 0:
            # Event did not exist, add it
            self._create_event(event_time, event_state, event_type, event_value,
                               proc_desc, global_message)
        else:
            # Event exist, update it
            self._update_event(event_time, event_index, event_state, event_type, event_value,
                               proc_list, proc_desc, global_message)

        return self.len()

    def _create_event(self, event_time, event_state, event_type, event_value,
                      proc_desc, global_message):
        """Add a new item in the events list.

        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
        """
        if event_state not in ('WARNING', 'CRITICAL'):
            return

        # Define the automatic process sort key
        self.set_process_sort(event_type)

        # Create the new event
        # Time is stored in Epoch format
        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
        event = GlancesEvent(begin=event_time,
                             state=event_state,
                             type=event_type,
                             min=event_value, max=event_value, sum=event_value, count=1, avg=event_value,
                             top=[],
                             desc=proc_desc,
                             sort=glances_processes.sort_key,
                             global_msg=global_message)

        # Add the event at the beginning of the list
        self.events_list.insert(0, event)

        # Limit the list to 'max_events' items (drop the oldest one)
        if self.len() > self.max_events:
            self.events_list.pop()

    def _update_event(self, event_time, event_index, event_state, event_type, event_value,
                      proc_list, proc_desc, global_message):
        """Update an event in the list"""
        event = self.events_list[event_index]
        if event_state in ('OK', 'CAREFUL'):
            if event.is_ongoing():
                # Close the event
                self._close_event(event_time, event_index)
            # else: event is already closed, do nothing
            return

        # event_state == "WARNING" or event_state == "CRITICAL"
        # Set process sort key
        self.set_process_sort(event_type)

        # Update an ongoing event
        event.update(
            state=event_state,
            value=event_value,
            sort_key=self.get_event_sort_key(event_type),
            proc_list=proc_list,
            proc_desc=proc_desc,
            global_msg=global_message)

    def _close_event(self, event_time, event_index):
        """Close an event in the list"""
        # Reset the automatic process sort key
        self.reset_process_sort()

        # Set the end of the event
        event = self.events_list[event_index]
        if event_time - event.begin >= self.min_duration:
            # If event is >= min_duration seconds
            event.end = event_time
        else:
            # If event < min_duration seconds, ignore it.
            # Delete by index: list.remove() is equality based and could drop
            # another event which compares equal to this one.
            del self.events_list[event_index]

    def clean(self, critical=False):
        """Clean the events list by deleting finished items.

        By default, only delete WARNING message.
        If critical = True, also delete CRITICAL message.

        Return the number of events left in the list.
        """
        # Keep ongoing events and, unless 'critical' is set, the CRITICAL ones
        # (list order is preserved)
        self.events_list = [
            event for event in self.events_list
            if event.end < 0 or (not critical and event.state.startswith("CRITICAL"))
        ]
        return self.len()


glances_events = GlancesEventsList()