author    | nicolargo <nicolas@nicolargo.com> | 2024-03-16 17:54:50 +0100
committer | nicolargo <nicolas@nicolargo.com> | 2024-03-16 17:54:50 +0100
commit    | 684d1d7b942b0ad48552a1dc6c0b3ac947d5c1a6 (patch)
tree      | 42b83b37cfc07940bd4cf6f951d6f1b904493761
parent    | 6508acaba73fbfd61ed471e1204a23c028a3f9fd (diff)
Make the global event message available per event
-rw-r--r-- | glances/events.py                 | 168
-rw-r--r-- | glances/plugins/alert/__init__.py | 159
2 files changed, 171 insertions, 156 deletions
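
The diff below moves the static decision tree and its evaluation out of the alert plugin and into `glances/events.py`, where a new `build_global_message()` helper is called on every `set()` so that each event stores its own snapshot of the global message. As a minimal, self-contained sketch of that evaluation logic: the `current` dict below stands in for `glances_thresholds.get()` (which in Glances returns threshold objects exposing a `.value()` method), and the stat names and values are illustrative only:

```python
# Sketch of the decision-tree evaluation performed by build_global_message().
# 'current' mocks the thresholds: stat name -> threshold value
# (0: OK, 1: CAREFUL, 2: WARNING, 3: CRITICAL).
tree = [
    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]


def build_global_message(current):
    # Weight each branch by summing the current values of its stats
    for item in tree:
        item['weight'] = sum(current.get(t, 0) for t in item['thresholds'])
    heaviest = max(tree, key=lambda d: d['weight'])
    # Report the heaviest branch only if its weight reaches thresholds_min;
    # otherwise fall back to the "no alert" entry at the head of the tree
    if heaviest['weight'] >= heaviest['thresholds_min']:
        return heaviest['msg']
    return tree[0]['msg']


print(build_global_message({'cpu_user': 3, 'mem': 1}))  # High CPU user mode
print(build_global_message({'cpu_user': 1, 'mem': 1}))  # No warning or critical alert detected
```

The heaviest branch wins, and the `thresholds_min` guard keeps CAREFUL-level noise from displacing the default "no alert" message.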
diff --git a/glances/events.py b/glances/events.py
index b05d229b..06ddc669 100644
--- a/glances/events.py
+++ b/glances/events.py
@@ -13,6 +13,151 @@ import time
 from datetime import datetime
 
 from glances.processes import glances_processes, sort_stats
+from glances.thresholds import glances_thresholds
+
+# Static decision tree for the global alert message
+# - msg: Message to be displayed (result of the decision tree)
+# - thresholds: a list of stats to take into account
+# - thresholds_min: minimal value of the thresholds sum
+# - 0: OK
+# - 1: CAREFUL
+# - 2: WARNING
+# - 3: CRITICAL
+tree = [
+    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
+    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
+    {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
+    {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
+    {
+        'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
+        'thresholds': ['cpu_steal'],
+        'thresholds_min': 2,
+    },
+    {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
+    {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
+    {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
+    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
+]
+
+# TODO: change the algo to use the following decision tree
+# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
+# _yes means threshold >= 2
+# _no means threshold < 2
+# With threshold:
+# - 0: OK
+# - 1: CAREFUL
+# - 2: WARNING
+# - 3: CRITICAL
+tree_new = {
+    'cpu_iowait': {
+        '_yes': {
+            'memswap': {
+                '_yes': {
+                    'mem': {
+                        '_yes': {
+                            # Once you've identified the offenders, the resolution will again
+                            # depend on whether their memory usage seems business-as-usual or not.
+                            # For example, a memory leak can be satisfactorily addressed by a one-time
+                            # or periodic restart of the process.
+                            # - if memory usage seems anomalous: kill the offending processes.
+                            # - if memory usage seems business-as-usual: add RAM to the server,
+                            #   or split high-memory-using services to other servers.
+                            '_msg': "Memory issue"
+                        },
+                        '_no': {
+                            # ???
+                            '_msg': "Swap issue"
+                        },
+                    }
+                },
+                '_no': {
+                    # Low swap means you have a "real" IO wait problem. The next step is to see
+                    # what's hogging your IO.
+                    # iotop is an awesome tool for identifying I/O offenders. Two things to note:
+                    # unless you've already installed iotop, it's probably not already on your system.
+                    # Recommendation: install it before you need it -- it's no fun trying to install a
+                    # troubleshooting tool on an overloaded machine (iotop requires a Linux kernel of
+                    # 2.6.20 or above).
+                    '_msg': "I/O issue"
+                },
+            }
+        },
+        '_no': {
+            'cpu_total': {
+                '_yes': {
+                    'cpu_user': {
+                        '_yes': {
+                            # We expect the user-time percentage to be high.
+                            # There's most likely a program or service you've configured on your server
+                            # that's hogging CPU. Checking the % user time just confirms this. When you
+                            # see that the % user-time is high, it's time to see what executable is
+                            # monopolizing the CPU.
+                            # Once you've confirmed that the % user-time is high, check the process list
+                            # (also provided by top).
+                            # By default, top sorts the process list by % CPU, so you can just look at
+                            # the top process or processes.
+                            # If there's a single process hogging the CPU in a way that seems abnormal,
+                            # it's an anomalous situation that a service restart can fix. If there are
+                            # multiple processes taking up CPU resources, or if there's one process that
+                            # takes lots of resources while otherwise functioning normally, then your
+                            # setup may just be underpowered. You'll need to upgrade your server (add
+                            # more cores), or split services out onto other boxes. In either case, you
+                            # have a resolution:
+                            # - if situation seems anomalous: kill the offending processes.
+                            # - if situation seems typical given history: upgrade server or add more servers.
+                            '_msg': "CPU issue with user process(es)"
+                        },
+                        '_no': {
+                            'cpu_steal': {
+                                '_yes': {
+                                    '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
+                                },
+                                '_no': {'_msg': "CPU issue with system process(es)"},
+                            }
+                        },
+                    }
+                },
+                '_no': {
+                    '_yes': {
+                        # ???
+                        '_msg': "Memory issue"
+                    },
+                    '_no': {
+                        # Your slowness isn't due to CPU or IO problems, so it's likely an
+                        # application-specific issue. It's also possible that the slowness is being
+                        # caused by another server in your cluster, or by an external service you
+                        # rely on.
+                        # Start by checking important applications for uncharacteristic slowness (the
+                        # DB is a good place to start), and think through which parts of your
+                        # infrastructure could be slowed down externally. For example, do you use an
+                        # externally hosted email service that could slow down critical parts of your
+                        # application?
+                        # If you suspect another server in your cluster, strace and lsof can provide
+                        # information on what the process is doing or waiting on. strace will show you
+                        # which file descriptors are being read or written to (or being attempted to be
+                        # read from) and lsof can give you a mapping of those file descriptors to
+                        # network connections.
+                        '_msg': "External issue"
+                    },
+                },
+            }
+        },
+    }
+}
+
+
+def build_global_message():
+    """Parse the decision tree and return the message.
+
+    Note: the message corresponds to the current threshold values
+    """
+    # Compute the weight for each item in the tree
+    current_thresholds = glances_thresholds.get()
+    for i in tree:
+        i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
+    themax = max(tree, key=lambda d: d['weight'])
+    if themax['weight'] >= themax['thresholds_min']:
+        # The weight of the heaviest branch reaches its minimal threshold value
+        return themax['msg']
+    else:
+        return tree[0]['msg']
 
 
 class GlancesEvents(object):
@@ -37,7 +182,8 @@ class GlancesEvents(object):
         "count": COUNT,
         "top": [top 3 process name],
         "desc": "Processes description",
-        "sort": "top sort key"
+        "sort": "top sort key",
+        "global": "global alert message"
     }
     """
@@ -125,26 +271,31 @@ class GlancesEvents(object):
         event_value = value
         proc_list = list of processes
         proc_desc = processes description
+        global_message = global alert message
 
         If 'event' is a 'new one', add it at the beginning of the list.
         If 'event' is not a 'new one', update the list.
         When finished if event duration < peak_time then the alert is not set.
""" event_time = time.mktime(datetime.now().timetuple()) + global_message = build_global_message() proc_list = proc_list or glances_processes.get_list() # Add or update the log event_index = self.__event_exist(event_time, event_type) if event_index < 0: # Event did not exist, add it - self._create_event(event_time, event_state, event_type, event_value, proc_desc) + self._create_event(event_time, event_state, event_type, event_value, + proc_desc, global_message) else: # Event exist, update it - self._update_event(event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc) + self._update_event(event_time, event_index, event_state, event_type, event_value, + proc_list, proc_desc, global_message) return self.len() - def _create_event(self, event_time, event_state, event_type, event_value, proc_desc): + def _create_event(self, event_time, event_state, event_type, event_value, + proc_desc, global_message): """Add a new item in the log list. Item is added only if the criticality (event_state) is WARNING or CRITICAL. @@ -169,6 +320,7 @@ class GlancesEvents(object): "top": [], "desc": proc_desc, "sort": glances_processes.sort_key, + "global": global_message, } # Add the item to the list @@ -181,7 +333,8 @@ class GlancesEvents(object): else: return False - def _update_event(self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc): + def _update_event(self, event_time, event_index, event_state, event_type, event_value, + proc_list, proc_desc, global_message): """Update an event in the list""" if event_state == "OK" or event_state == "CAREFUL": # Reset the automatic process sort key @@ -198,7 +351,7 @@ class GlancesEvents(object): else: # Update the item - # It's an ogoing event, update the end time + # It's an ongoing event, update the end time self.events_list[event_index]['end'] = -1 # Set process sort key @@ -226,6 +379,9 @@ class GlancesEvents(object): # MONITORED PROCESSES DESC self.events_list[event_index]['desc'] = proc_desc + # Global message: + self.events_list[event_index]['global'] = global_message + return True def clean(self, critical=False): diff --git a/glances/plugins/alert/__init__.py b/glances/plugins/alert/__init__.py index c787f899..e26369a0 100644 --- a/glances/plugins/alert/__init__.py +++ b/glances/plugins/alert/__init__.py @@ -14,7 +14,6 @@ from time import tzname import pytz from glances.events import glances_events -from glances.thresholds import glances_thresholds # from glances.logger import logger from glances.plugins.plugin.model import GlancesPluginModel @@ -32,6 +31,7 @@ from glances.plugins.plugin.model import GlancesPluginModel # "top": [top3 process list], # "desc": "Processes description", # "sort": "top sort key" +# "global": "global alert message" # } # Fields description # description: human readable description @@ -88,153 +88,13 @@ fields_description = { 'description': 'Sort key of the top processes', 'unit': 'string', }, -} - -# Static decision tree for the global alert message -# - msg: Message to be displayed (result of the decision tree) -# - thresholds: a list of stats to take into account -# - thresholds_min: minimal value of the thresholds sum -# - 0: OK -# - 1: CAREFUL -# - 2: WARNING -# - 3: CRITICAL -tree = [ - {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0}, - {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2}, - {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2}, - {'msg': 'High CPU I/O 
waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2}, - { - 'msg': 'Large CPU stolen time. System running the hypervisor is too busy.', - 'thresholds': ['cpu_steal'], - 'thresholds_min': 2, - }, - {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2}, - {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2}, - {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2}, - {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2}, -] - -# TODO: change the algo to use the following decision tree -# Source: Inspire by https://scoutapm.com/blog/slow_server_flow_chart -# _yes means threshold >= 2 -# _no means threshold < 2 -# With threshold: -# - 0: OK -# - 1: CAREFUL -# - 2: WARNING -# - 3: CRITICAL -tree_new = { - 'cpu_iowait': { - '_yes': { - 'memswap': { - '_yes': { - 'mem': { - '_yes': { - # Once you've identified the offenders, the resolution will again - # depend on whether their memory usage seems business-as-usual or not. - # For example, a memory leak can be satisfactorily addressed by a one-time - # or periodic restart of the process. - # - if memory usage seems anomalous: kill the offending processes. - # - if memory usage seems business-as-usual: add RAM to the server, - # or split high-memory using services to other servers. - '_msg': "Memory issue" - }, - '_no': { - # ??? - '_msg': "Swap issue" - }, - } - }, - '_no': { - # Low swap means you have a "real" IO wait problem. The next step is to see what's hogging your IO. - # iotop is an awesome tool for identifying io offenders. Two things to note: - # unless you've already installed iotop, it's probably not already on your system. - # Recommendation: install it before you need it - - it's no fun trying to install a troubleshooting - # tool on an overloaded machine (iotop requires a Linux of 2.62 or above) - '_msg': "I/O issue" - }, - } - }, - '_no': { - 'cpu_total': { - '_yes': { - 'cpu_user': { - '_yes': { - # We expect the user-time percentage to be high. - # There's most likely a program or service you've configured on you server that's - # hogging CPU. - # Checking the % user time just confirms this. When you see that the % user-time is high, - # it's time to see what executable is monopolizing the CPU - # Once you've confirmed that the % usertime is high, check the process list(also provided - # by top). - # Be default, top sorts the process list by % CPU, so you can just look at the top process - # or processes. - # If there's a single process hogging the CPU in a way that seems abnormal, it's an - # anomalous situation - # that a service restart can fix. If there are are multiple processes taking up CPU - # resources, or it - # there's one process that takes lots of resources while otherwise functioning normally, - # than your setup - # may just be underpowered. You'll need to upgrade your server(add more cores), - # or split services out onto - # other boxes. In either case, you have a resolution: - # - if situation seems anomalous: kill the offending processes. - # - if situation seems typical given history: upgrade server or add more servers. - '_msg': "CPU issue with user process(es)" - }, - '_no': { - 'cpu_steal': { - '_yes': { - '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy." - }, - '_no': {'_msg': "CPU issue with system process(es)"}, - } - }, - } - }, - '_no': { - '_yes': { - # ??? 
- '_msg': "Memory issue" - }, - '_no': { - # Your slowness isn't due to CPU or IO problems, so it's likely an application-specific issue. - # It's also possible that the slowness is being caused by another server in your cluster, or - # by an external service you rely on. - # start by checking important applications for uncharacteristic slowness(the DB is a good place - # to start), think through which parts of your infrastructure could be slowed down externally. - # For example, do you use an externally hosted email service that could slow down critical - # parts of your application ? - # If you suspect another server in your cluster, strace and lsof can provide information on - # what the process is doing or waiting on. Strace will show you which file descriptors are - # being read or written to (or being attempted to be read from) and lsof can give you a - # mapping of those file descriptors to network connections. - '_msg': "External issue" - }, - }, - } - }, + 'global': { + 'description': 'Global alert message', + 'unit': 'string', } } -def global_message(): - """Parse the decision tree and return the message. - - Note: message corresponding to the current thresholds values - """ - # Compute the weight for each item in the tree - current_thresholds = glances_thresholds.get() - for i in tree: - i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds]) - themax = max(tree, key=lambda d: d['weight']) - if themax['weight'] >= themax['thresholds_min']: - # Check if the weight is > to the minimal threshold value - return themax['msg'] - else: - return tree[0]['msg'] - - class PluginModel(GlancesPluginModel): """Glances alert plugin. @@ -265,10 +125,6 @@ class PluginModel(GlancesPluginModel): """Nothing to do here. Just return the global glances_log.""" # Set the stats to the glances_events self.stats = glances_events.get() - # Define the global message thanks to the current thresholds - # and the decision tree - # !!! Call directly in the msg_curse function - # global_message() def msg_curse(self, args=None, max_width=None): """Return the dict to display in the curse interface.""" @@ -280,8 +136,11 @@ class PluginModel(GlancesPluginModel): return ret # Build the string message - # Header - ret.append(self.curse_add_line(global_message(), "TITLE")) + # Header with the global message + if len(self.stats) > 0 and self.stats[0]['end'] < 0 and 'global' in self.stats[0]: + ret.append(self.curse_add_line(self.stats[0]['global'], "TITLE")) + else: + ret.append(self.curse_add_line("ALERTS", "TITLE")) # Loop over alerts for alert in self.stats: # New line |