Event is now a Dataclass

author: nicolargo <nicolas@nicolargo.com> 2024-03-17 17:07:26 +0100
committer: nicolargo <nicolas@nicolargo.com> 2024-03-17 17:07:26 +0100
commit: 40f06b9570f6851a14cdfea8b93c9b6cffdfc2de (patch)
tree: aa4c5a3b0f8dfd930168e55c2d7e2a11957511c5
parent: 77167b8bce794db9383e748cfa0eee8577687cf6 (diff)
2 files changed, 451 insertions, 0 deletions
diff --git a/glances/event.py b/glances/event.py
new file mode 100644
index 00000000..d7662907
--- /dev/null
+++ b/glances/event.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of Glances.
+#
+# SPDX-FileCopyrightText: 2022 Nicolas Hennion <nicolas@nicolargo.com>
+#
+# SPDX-License-Identifier: LGPL-3.0-only
+#
+
+"""Manage Glances event class
+This class is a Pydantic data class for the Glances event.
+
+event_state = "OK|CAREFUL|WARNING|CRITICAL"
+event_type = "CPU*|LOAD|MEM|MON"
+event_value = value
+
+Item (or event) is defined by:
+    {
+        "begin": begin time (Epoch),
+        "end": end time (Epoch), -1 while the event is ongoing,
+        "state": "WARNING|CRITICAL",
+        "type": "CPU|LOAD|MEM",
+        "max": MAX,
+        "avg": AVG,
+        "min": MIN,
+        "sum": SUM,
+        "count": COUNT,
+        "top": [top 3 process names],
+        "desc": "Processes description",
+        "sort": "top sort key",
+        "global_msg": "global alert message"
+    }
+"""
+
+from pydantic.dataclasses import dataclass
+
+from glances.processes import sort_stats
+
+
+@dataclass
+class GlancesEvent:
+    """A single Glances event; 'begin' and 'end' hold Epoch times and 'end' is -1 while ongoing."""
+    begin: int  # Epoch time of the beginning of the event
+    state: str  # "WARNING|CRITICAL"
+    type: str  # "CPU|LOAD|MEM|..."
+    min: float  # Lowest value seen since the beginning of the event
+    max: float  # Highest value seen since the beginning of the event
+    sum: float  # Sum of all the values (used to compute avg)
+    count: int  # Number of values (used to compute avg)
+    avg: float  # sum / count
+    top: list[str]  # Names of the top 3 processes (refreshed on CRITICAL updates only)
+    desc: str  # Processes description
+    sort: str  # Process sort key used to build 'top'
+    global_msg: str  # Global alert message
+    end: int = -1  # Epoch time of the end of the event, -1 while ongoing
+
+    def is_ongoing(self):
+        """Return True if the event is ongoing (end is still -1)"""
+        return self.end == -1
+
+    def is_finished(self):
+        """Return True if the event is finished (end is no longer -1)"""
+        return self.end != -1
+
+    def update(self, state: str, value: float,
+               sort_key: str = None, proc_list: list = None,
+               proc_desc: str = None, global_msg: str = None):
+        """Take a new value into account: (re)open the event and refresh min/max/sum/count/avg"""
+        self.end = -1
+
+        self.min = min(self.min, value)
+        self.max = max(self.max, value)
+        self.sum += value
+        self.count += 1
+        self.avg = self.sum / self.count
+
+        if state == "CRITICAL":
+            # The state is only changed by an escalation to CRITICAL:
+            # an event that has reached the CRITICAL state can't go back to WARNING
+            self.state = state
+            # Top 3 process names (only refreshed for a CRITICAL alert)
+            self.sort = sort_key
+            self.top = [p['name'] for p in sort_stats(proc_list, sort_key)[0:3]]
+
+        # Monitored processes description (overwritten on every update)
+        self.desc = proc_desc
+
+        # Global message (overwritten on every update)
+        self.global_msg = global_msg
diff --git a/glances/events_list.py b/glances/events_list.py
new file mode 100644
index 00000000..87c3b046
--- /dev/null
+++ b/glances/events_list.py
@@ -0,0 +1,362 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of Glances.
+#
+# SPDX-FileCopyrightText: 2022 Nicolas Hennion <nicolas@nicolargo.com>
+#
+# SPDX-License-Identifier: LGPL-3.0-only
+#
+
+"""Manage Glances events list (previously Glances logs in Glances < 3.1)."""
+
+import time
+from datetime import datetime
+from pydantic import RootModel
+
+from glances.processes import glances_processes
+from glances.thresholds import glances_thresholds
+from glances.event import GlancesEvent
+
+# Static decision tree for the global alert message
+# - msg: Message to be displayed (result of the decision tree)
+# - thresholds: list of the stats whose current threshold values are summed
+# - thresholds_min: minimal value of the thresholds sum to display the message
+# -                 0: OK
+# -                 1: CAREFUL
+# -                 2: WARNING
+# -                 3: CRITICAL
+tree = [
+    {'msg': 'EVENTS history', 'thresholds': [], 'thresholds_min': 0},
+    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
+    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
+    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
+    {
+        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
+        'thresholds': ['cpu_steal'],
+        'thresholds_min': 2,
+    },
+    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
+    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
+    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
+    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
+]
+
+# TODO: change the algo to use the following decision tree
+# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
+# _yes means threshold >= 2
+# _no  means threshold < 2
+# With threshold:
+# - 0: OK
+# - 1: CAREFUL
+# - 2: WARNING
+# - 3: CRITICAL
+tree_new = {
+    'cpu_iowait': {
+        '_yes': {
+            'memswap': {
+                '_yes': {
+                    'mem': {
+                        '_yes': {
+                            # Once you've identified the offenders, the resolution will again
+                            # depend on whether their memory usage seems business-as-usual or not.
+                            # For example, a memory leak can be satisfactorily addressed by a one-time
+                            # or periodic restart of the process.
+                            # - if memory usage seems anomalous: kill the offending processes.
+                            # - if memory usage seems business-as-usual: add RAM to the server,
+                            # or split high-memory using services to other servers.
+                            '_msg': "Memory issue"
+                        },
+                        '_no': {
+                            # ???
+                            '_msg': "Swap issue"
+                        },
+                    }
+                },
+                '_no': {
+                    # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO.
+                    # iotop is an awesome tool for identifying IO offenders. Two things to note:
+                    # unless you've already installed iotop, it's probably not already on your system.
+                    # Recommendation: install it before you need it - it's no fun trying to install a troubleshooting
+                    # tool on an overloaded machine (iotop requires a Linux of 2.62 or above)
+                    '_msg': "I/O issue"
+                },
+            }
+        },
+        '_no': {
+            'cpu_total': {
+                '_yes': {
+                    'cpu_user': {
+                        '_yes': {
+                            # We expect the user-time percentage to be high.
+                            # There's most likely a program or service you've configured on your server that's
+                            # hogging CPU.
+                            # Checking the % user time just confirms this. When you see that the % user-time is high,
+                            # it's time to see what executable is monopolizing the CPU.
+                            # Once you've confirmed that the % user-time is high, check the process list (also provided
+                            # by top).
+                            # By default, top sorts the process list by % CPU, so you can just look at the top process
+                            # or processes.
+                            # If there's a single process hogging the CPU in a way that seems abnormal, it's an
+                            # anomalous situation
+                            # that a service restart can fix. If there are multiple processes taking up CPU
+                            # resources, or if
+                            # there's one process that takes lots of resources while otherwise functioning normally,
+                            # then your setup
+                            # may just be underpowered. You'll need to upgrade your server (add more cores),
+                            # or split services out onto
+                            # other boxes. In either case, you have a resolution:
+                            # - if situation seems anomalous: kill the offending processes.
+                            # - if situation seems typical given history: upgrade server or add more servers.
+                            '_msg': "CPU issue with user process(es)"
+                        },
+                        '_no': {
+                            'cpu_steal': {
+                                '_yes': {
+                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
+                                },
+                                '_no': {'_msg': "CPU issue with system process(es)"},
+                            }
+                        },
+                    }
+                },
+                '_no': {
+                    '_yes': {
+                        # NOTE(review): this _yes/_no pair has no stat key above it ('mem'?) - TODO confirm
+                        '_msg': "Memory issue"
+                    },
+                    '_no': {
+                        # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue.
+                        # It's also possible that the slowness is being caused by another server in your cluster, or
+                        # by an external service you rely on.
+                        # Start by checking important applications for uncharacteristic slowness (the DB is a good place
+                        # to start), think through which parts of your infrastructure could be slowed down externally.
+                        # For example, do you use an externally hosted email service that could slow down critical
+                        # parts of your application?
+                        # If you suspect another server in your cluster, strace and lsof can provide information on
+                        # what the process is doing or waiting on. Strace will show you which file descriptors are
+                        # being read or written to (or being attempted to be read from) and lsof can give you a
+                        # mapping of those file descriptors to network connections.
+                        '_msg': "External issue"
+                    },
+                },
+            }
+        },
+    }
+}
+
+
+def build_global_message():
+    """Return the global alert message matching the current thresholds.
+
+    Each entry of the decision tree is weighted by the sum of its current threshold values.
+    """
+    # Weight every entry of the decision tree (stored under its 'weight' key)
+    levels = glances_thresholds.get()
+    for entry in tree:
+        entry['weight'] = sum(levels[name].value() for name in entry['thresholds'] if name in levels)
+    heaviest = max(tree, key=lambda entry: entry['weight'])
+    if heaviest['weight'] < heaviest['thresholds_min']:
+        # The heaviest entry is below its minimal threshold value: default message
+        return tree[0]['msg']
+    # The weight is >= the minimal threshold value of the heaviest entry
+    return heaviest['msg']
+
+
+class GlancesEventsList(object):
+
+    """This class manages events inside the Glances software.
+    GlancesEventsList is a list of GlancesEvent (most recent event first).
+    GlancesEvent is defined in the event.py file
+    """
+
+    def __init__(self, max_events=10, min_duration=6, min_interval=6):
+        """Init the events class.
+
+        max_events: maximum size of the events list
+        min_duration: events duration should be >= min_duration to be taken into account (in seconds)
+        min_interval: minimal interval between same kind of alert (in seconds)
+        """
+        # Maximum size of the events list
+        self.set_max_events(max_events)
+
+        # Minimal event duration time (in seconds)
+        self.set_min_duration(min_duration)
+
+        # Minimal interval between same kind of alert (in seconds)
+        self.set_min_interval(min_interval)
+
+        # Init the events list
+        self.events_list = []
+
+    def set_max_events(self, max_events):
+        """Set the maximum size of the events list."""
+        self.max_events = max_events
+
+    def set_min_duration(self, min_duration):
+        """Set the minimal event duration time (in seconds)."""
+        self.min_duration = min_duration
+
+    def set_min_interval(self, min_interval):
+        """Set the minimum interval between same kind of alert (in seconds)."""
+        self.min_interval = min_interval
+
+    def get(self):
+        """Return the RAW events list (each event serialized with Pydantic model_dump)."""
+        return [RootModel[GlancesEvent](e).model_dump() for e in self.events_list]
+
+    def len(self):
+        """Return the number of events in the events list."""
+        return len(self.events_list)
+
+    def __event_exist(self, event_time, event_type):
+        """Return the position of the first event in the events list if:
+        type is matching
+        and (event is ongoing or event_time - end < min_interval)
+        Return -1 if the item is not found.
+        """
+        for index, event in enumerate(self.events_list):
+            if event.type != event_type:
+                continue
+            if event.is_ongoing() or event_time - event.end < self.min_interval:
+                return index
+        return -1
+
+    def get_event_sort_key(self, event_type):
+        """Return the process sort key"""
+        # Process sort depending on alert type
+        if event_type.startswith("MEM"):
+            # Sort TOP process by memory_percent
+            ret = 'memory_percent'
+        elif event_type.startswith("CPU_IOWAIT"):
+            # Sort TOP process by io_counters (only for Linux OS)
+            ret = 'io_counters'
+        else:
+            # Default sort is by CPU usage
+            ret = 'cpu_percent'
+        return ret
+
+    def set_process_sort(self, event_type):
+        """Define the process auto sort key from the alert type."""
+        if glances_processes.auto_sort:
+            glances_processes.set_sort_key(self.get_event_sort_key(event_type))
+
+    def reset_process_sort(self):
+        """Reset the process auto sort key."""
+        if glances_processes.auto_sort:
+            glances_processes.set_sort_key('auto')
+
+    def add(self, event_state, event_type, event_value, proc_list=None, proc_desc=""):
+        """Add a new event to the events list, or update the matching one.
+
+        event_state = "OK|CAREFUL|WARNING|CRITICAL"
+        event_type = "CPU|LOAD|MEM|..."
+        event_value = value
+        proc_list = list of processes (default: the current Glances processes list)
+        proc_desc = processes description
+        The global alert message is computed by build_global_message().
+
+        If 'event' is a 'new one', add it at the beginning of the list.
+        If 'event' is not a 'new one', update it; an event closed before
+        min_duration seconds is dropped. Return the size of the events list.
+        """
+        event_time = time.mktime(datetime.now().timetuple())
+        global_message = build_global_message()
+        proc_list = proc_list or glances_processes.get_list()
+
+        # Add or update the event
+        event_index = self.__event_exist(event_time, event_type)
+        if event_index < 0:
+            # Event did not exist, add it
+            self._create_event(event_time, event_state, event_type, event_value,
+                               proc_desc, global_message)
+        else:
+            # Event exist, update it
+            self._update_event(event_time, event_index, event_state, event_type, event_value,
+                               proc_list, proc_desc, global_message)
+
+        return self.len()
+
+    def _create_event(self, event_time, event_state, event_type, event_value,
+                      proc_desc, global_message):
+        """Add a new item at the beginning of the events list.
+
+        Item is added only if the criticality (event_state) is WARNING or CRITICAL.
+        """
+        if event_state not in ('WARNING', 'CRITICAL'):
+            return
+
+        # Define the automatic process sort key
+        self.set_process_sort(event_type)
+
+        # Create the new event
+        # Time is stored in Epoch format
+        # Epoch -> DMYHMS = datetime.fromtimestamp(epoch)
+        event = GlancesEvent(begin=event_time,
+                             state=event_state,
+                             type=event_type,
+                             min=event_value, max=event_value, sum=event_value, count=1, avg=event_value,
+                             top=[],
+                             desc=proc_desc,
+                             sort=glances_processes.sort_key,
+                             global_msg=global_message)
+
+        # Add the event to the list
+        self.events_list.insert(0, event)
+
+        # Limit the list to 'max_events' items (the last, i.e. oldest inserted, is dropped)
+        if self.len() > self.max_events:
+            self.events_list.pop()
+
+    def _update_event(self, event_time, event_index, event_state, event_type, event_value,
+                      proc_list, proc_desc, global_message):
+        """Update (or close) the event located at event_index in the list"""
+        if event_state in ('OK', 'CAREFUL') and self.events_list[event_index].is_ongoing():
+            # Close the event
+            self._close_event(event_time, event_index)
+        elif event_state in ('OK', 'CAREFUL') and self.events_list[event_index].is_finished():
+            # Event is already closed, do nothing
+            pass
+        else:  # event_state == "WARNING" or event_state == "CRITICAL"
+            # Set process sort key
+            self.set_process_sort(event_type)
+
+            # Update the event (a recently closed one is reopened by update)
+            self.events_list[event_index].update(
+                state=event_state,
+                value=event_value,
+                sort_key=self.get_event_sort_key(event_type),
+                proc_list=proc_list,
+                proc_desc=proc_desc,
+                global_msg=global_message)
+
+    def _close_event(self, event_time, event_index):
+        """Close the event at event_index, or drop it if it was too short."""
+        # Reset the automatic process sort key
+        self.reset_process_sort()
+
+        # Set the end of the event
+        if event_time - self.events_list[event_index].begin >= self.min_duration:
+            # Event lasted at least min_duration seconds: keep it and set its end
+            self.events_list[event_index].end = event_time
+        else:
+            # Too short: delete by index (remove() would drop the first *equal* event)
+            del self.events_list[event_index]
+
+    def clean(self, critical=False):
+        """Clean the events list by deleting finished items.
+
+        By default, only delete WARNING message.
+        If critical = True, also delete CRITICAL message.
+        Return the number of events left in the list.
+        """
+        # Keep the ongoing events and, unless 'critical' is set, the CRITICAL ones.
+        # A single order-preserving pass replaces the pop()/insert(0) loop,
+        # which was quadratic in the size of the list.
+        self.events_list = [
+            event for event in self.events_list
+            if event.end < 0 or (not critical and event.state.startswith("CRITICAL"))
+        ]
+        return self.len()
+
+
+glances_events = GlancesEventsList()  # Module-level instance built with the default settings
author	nicolargo <nicolas@nicolargo.com>	2024-03-17 17:07:26 +0100
committer	nicolargo <nicolas@nicolargo.com>	2024-03-17 17:07:26 +0100
commit	40f06b9570f6851a14cdfea8b93c9b6cffdfc2de (patch)
tree	aa4c5a3b0f8dfd930168e55c2d7e2a11957511c5
parent	77167b8bce794db9383e748cfa0eee8577687cf6 (diff)