author    nicolargo <nicolas@nicolargo.com>  2024-03-16 17:54:50 +0100
committer nicolargo <nicolas@nicolargo.com>  2024-03-16 17:54:50 +0100
commit    684d1d7b942b0ad48552a1dc6c0b3ac947d5c1a6 (patch)
tree      42b83b37cfc07940bd4cf6f951d6f1b904493761
parent    6508acaba73fbfd61ed471e1204a23c028a3f9fd (diff)
Make the global event message available per event
-rw-r--r--  glances/events.py                  168
-rw-r--r--  glances/plugins/alert/__init__.py  159
2 files changed, 171 insertions(+), 156 deletions(-)
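The patch moves the global alert message logic into glances/events.py and attaches the resulting message to every event. As an illustration of the scoring rule that the new build_global_message() applies to the flat decision tree, here is a minimal, self-contained sketch; the dictionary of current threshold values is a hypothetical stand-in for the real glances_thresholds object, and only a few tree entries are reproduced.

# Sketch of the decision-tree scoring used by build_global_message() (see the
# hunk below). Threshold values: 0=OK, 1=CAREFUL, 2=WARNING, 3=CRITICAL.
# 'current_thresholds' is a hypothetical stand-in for glances_thresholds.
tree = [
    {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
    {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
    {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
]

def build_message(current_thresholds):
    # Weight of an entry = sum of the current threshold values of its stats
    for item in tree:
        item['weight'] = sum(current_thresholds.get(t, 0) for t in item['thresholds'])
    best = max(tree, key=lambda d: d['weight'])
    # The entry with the highest weight wins, but only if it reaches thresholds_min
    if best['weight'] >= best['thresholds_min']:
        return best['msg']
    return tree[0]['msg']

print(build_message({'cpu_user': 3, 'mem': 1}))  # High CPU user mode
print(build_message({'cpu_user': 1, 'mem': 1}))  # No warning or critical alert detected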
diff --git a/glances/events.py b/glances/events.py
index b05d229b..06ddc669 100644
--- a/glances/events.py
+++ b/glances/events.py
@@ -13,6 +13,151 @@ import time
from datetime import datetime
from glances.processes import glances_processes, sort_stats
+from glances.thresholds import glances_thresholds
+
+# Static decision tree for the global alert message
+# - msg: Message to be displayed (result of the decision tree)
+# - thresholds: a list of stats to take into account
+# - thresholds_min: minimal value of the thresholds sum
+# - 0: OK
+# - 1: CAREFUL
+# - 2: WARNING
+# - 3: CRITICAL
+tree = [
+ {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
+ {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
+ {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
+ {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
+ {
+ 'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
+ 'thresholds': ['cpu_steal'],
+ 'thresholds_min': 2,
+ },
+ {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
+ {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
+ {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
+ {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
+]
+
+# TODO: change the algo to use the following decision tree
+# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
+# _yes means threshold >= 2
+# _no means threshold < 2
+# With threshold:
+# - 0: OK
+# - 1: CAREFUL
+# - 2: WARNING
+# - 3: CRITICAL
+tree_new = {
+ 'cpu_iowait': {
+ '_yes': {
+ 'memswap': {
+ '_yes': {
+ 'mem': {
+ '_yes': {
+ # Once you've identified the offenders, the resolution will again
+ # depend on whether their memory usage seems business-as-usual or not.
+ # For example, a memory leak can be satisfactorily addressed by a one-time
+ # or periodic restart of the process.
+ # - if memory usage seems anomalous: kill the offending processes.
+ # - if memory usage seems business-as-usual: add RAM to the server,
+ # or split high-memory services out to other servers.
+ '_msg': "Memory issue"
+ },
+ '_no': {
+ # ???
+ '_msg': "Swap issue"
+ },
+ }
+ },
+ '_no': {
+ # Low swap means you have a "real" I/O wait problem. The next step is to see what's hogging your I/O.
+ # iotop is an awesome tool for identifying I/O offenders. Two things to note:
+ # unless you've already installed iotop, it's probably not already on your system.
+ # Recommendation: install it before you need it -- it's no fun trying to install a troubleshooting
+ # tool on an overloaded machine (iotop requires Linux kernel 2.6.20 or above).
+ '_msg': "I/O issue"
+ },
+ }
+ },
+ '_no': {
+ 'cpu_total': {
+ '_yes': {
+ 'cpu_user': {
+ '_yes': {
+ # We expect the user-time percentage to be high.
+ # There's most likely a program or service you've configured on your server that's
+ # hogging CPU.
+ # Checking the % user time just confirms this. When you see that the % user time is high,
+ # it's time to see which executable is monopolizing the CPU.
+ # Once you've confirmed that the % user time is high, check the process list (also
+ # provided by top).
+ # By default, top sorts the process list by % CPU, so you can just look at the top
+ # process or processes.
+ # If there's a single process hogging the CPU in a way that seems abnormal, it's an
+ # anomalous situation that a service restart can fix. If there are multiple processes
+ # taking up CPU resources, or if there's one process that takes lots of resources
+ # while otherwise functioning normally, then your setup may just be underpowered.
+ # You'll need to upgrade your server (add more cores), or split services out onto
+ # other boxes. In either case, you have a resolution:
+ # - if the situation seems anomalous: kill the offending processes.
+ # - if the situation seems typical given history: upgrade the server or add more servers.
+ '_msg': "CPU issue with user process(es)"
+ },
+ '_no': {
+ 'cpu_steal': {
+ '_yes': {
+ '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
+ },
+ '_no': {'_msg': "CPU issue with system process(es)"},
+ }
+ },
+ }
+ },
+ '_no': {
+ '_yes': {
+ # ???
+ '_msg': "Memory issue"
+ },
+ '_no': {
+ # Your slowness isn't due to CPU or I/O problems, so it's likely an application-specific issue.
+ # It's also possible that the slowness is being caused by another server in your cluster, or
+ # by an external service you rely on.
+ # Start by checking important applications for uncharacteristic slowness (the DB is a good place
+ # to start), and think through which parts of your infrastructure could be slowed down externally.
+ # For example, do you use an externally hosted email service that could slow down critical
+ # parts of your application?
+ # If you suspect another server in your cluster, strace and lsof can provide information on
+ # what the process is doing or waiting on. strace will show you which file descriptors are
+ # being read or written to (or being attempted to be read from) and lsof can give you a
+ # mapping of those file descriptors to network connections.
+ '_msg': "External issue"
+ },
+ },
+ }
+ },
+ }
+}
+
+
+def build_global_message():
+ """Parse the decision tree and return the message.
+
+ Note: the message corresponds to the current threshold values.
+ """
+ # Compute the weight for each item in the tree
+ current_thresholds = glances_thresholds.get()
+ for i in tree:
+ i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
+ themax = max(tree, key=lambda d: d['weight'])
+ if themax['weight'] >= themax['thresholds_min']:
+ # The weight reaches the minimal threshold value: return the matching message
+ return themax['msg']
+ else:
+ return tree[0]['msg']
class GlancesEvents(object):
@@ -37,7 +182,8 @@ class GlancesEvents(object):
"count": COUNT,
"top": [top 3 process name],
"desc": "Processes description",
- "sort": "top sort key"
+ "sort": "top sort key",
+ "global": "global alert message"
}
"""
@@ -125,26 +271,31 @@ class GlancesEvents(object):
event_value = value
proc_list = list of processes
proc_desc = processes description
+ global_message = global alert message
If 'event' is a 'new one', add it at the beginning of the list.
If 'event' is not a 'new one', update the list.
When finished, if the event duration is < peak_time, then the alert is not set.
"""
event_time = time.mktime(datetime.now().timetuple())
+ global_message = build_global_message()
proc_list = proc_list or glances_processes.get_list()
# Add or update the log
event_index = self.__event_exist(event_time, event_type)
if event_index < 0:
# Event did not exist, add it
- self._create_event(event_time, event_state, event_type, event_value, proc_desc)
+ self._create_event(event_time, event_state, event_type, event_value,
+ proc_desc, global_message)
else:
# Event exists, update it
- self._update_event(event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc)
+ self._update_event(event_time, event_index, event_state, event_type, event_value,
+ proc_list, proc_desc, global_message)
return self.len()
- def _create_event(self, event_time, event_state, event_type, event_value, proc_desc):
+ def _create_event(self, event_time, event_state, event_type, event_value,
+ proc_desc, global_message):
"""Add a new item in the log list.
Item is added only if the criticality (event_state) is WARNING or CRITICAL.
@@ -169,6 +320,7 @@ class GlancesEvents(object):
"top": [],
"desc": proc_desc,
"sort": glances_processes.sort_key,
+ "global": global_message,
}
# Add the item to the list
@@ -181,7 +333,8 @@ class GlancesEvents(object):
else:
return False
- def _update_event(self, event_time, event_index, event_state, event_type, event_value, proc_list, proc_desc):
+ def _update_event(self, event_time, event_index, event_state, event_type, event_value,
+ proc_list, proc_desc, global_message):
"""Update an event in the list"""
if event_state == "OK" or event_state == "CAREFUL":
# Reset the automatic process sort key
@@ -198,7 +351,7 @@ class GlancesEvents(object):
else:
# Update the item
- # It's an ogoing event, update the end time
+ # It's an ongoing event, update the end time
self.events_list[event_index]['end'] = -1
# Set process sort key
@@ -226,6 +379,9 @@ class GlancesEvents(object):
# MONITORED PROCESSES DESC
self.events_list[event_index]['desc'] = proc_desc
+ # Global message:
+ self.events_list[event_index]['global'] = global_message
+
return True
def clean(self, critical=False):
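The commit defines tree_new but leaves its traversal as a TODO. The snippet below is one possible, purely hypothetical way such a nested yes/no tree could be walked against the current threshold values; example_tree is a simplified stand-in, since tree_new as committed still contains an unkeyed branch (marked '# ???') that a generic walker would need resolved first.

# Hypothetical walker for a tree_new-style structure (not part of the patch).
# A node is either a leaf {'_msg': ...} or {stat: {'_yes': node, '_no': node}};
# '_yes' is followed when the stat's threshold is >= 2 (WARNING or CRITICAL).
example_tree = {
    'cpu_iowait': {
        '_yes': {'memswap': {'_yes': {'_msg': "Memory or swap issue"},
                             '_no': {'_msg': "I/O issue"}}},
        '_no': {'cpu_total': {'_yes': {'_msg': "CPU issue"},
                              '_no': {'_msg': "External issue"}}},
    }
}

def walk_tree(node, thresholds):
    # Return the leaf message selected by the current threshold values
    if '_msg' in node:
        return node['_msg']
    stat, branches = next(iter(node.items()))  # one stat key per decision node
    branch = '_yes' if thresholds.get(stat, 0) >= 2 else '_no'
    return walk_tree(branches[branch], thresholds)

print(walk_tree(example_tree, {'cpu_iowait': 3, 'memswap': 0}))  # I/O issue
print(walk_tree(example_tree, {'cpu_iowait': 0, 'cpu_total': 2}))  # CPU issue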
diff --git a/glances/plugins/alert/__init__.py b/glances/plugins/alert/__init__.py
index c787f899..e26369a0 100644
--- a/glances/plugins/alert/__init__.py
+++ b/glances/plugins/alert/__init__.py
@@ -14,7 +14,6 @@ from time import tzname
import pytz
from glances.events import glances_events
-from glances.thresholds import glances_thresholds
# from glances.logger import logger
from glances.plugins.plugin.model import GlancesPluginModel
@@ -32,6 +31,7 @@ from glances.plugins.plugin.model import GlancesPluginModel
# "top": [top3 process list],
# "desc": "Processes description",
# "sort": "top sort key"
+# "global": "global alert message"
# }
# Fields description
# description: human readable description
@@ -88,153 +88,13 @@ fields_description = {
'description': 'Sort key of the top processes',
'unit': 'string',
},
-}
-
-# Static decision tree for the global alert message
-# - msg: Message to be displayed (result of the decision tree)
-# - thresholds: a list of stats to take into account
-# - thresholds_min: minimal value of the thresholds sum
-# - 0: OK
-# - 1: CAREFUL
-# - 2: WARNING
-# - 3: CRITICAL
-tree = [
- {'msg': 'No warning or critical alert detected', 'thresholds': [], 'thresholds_min': 0},
- {'msg': 'High CPU user mode', 'thresholds': ['cpu_user'], 'thresholds_min': 2},
- {'msg': 'High CPU kernel usage', 'thresholds': ['cpu_system'], 'thresholds_min': 2},
- {'msg': 'High CPU I/O waiting', 'thresholds': ['cpu_iowait'], 'thresholds_min': 2},
- {
- 'msg': 'Large CPU stolen time. System running the hypervisor is too busy.',
- 'thresholds': ['cpu_steal'],
- 'thresholds_min': 2,
- },
- {'msg': 'High CPU niced value', 'thresholds': ['cpu_niced'], 'thresholds_min': 2},
- {'msg': 'System overloaded in the last 5 minutes', 'thresholds': ['load'], 'thresholds_min': 2},
- {'msg': 'High swap (paging) usage', 'thresholds': ['memswap'], 'thresholds_min': 2},
- {'msg': 'High memory consumption', 'thresholds': ['mem'], 'thresholds_min': 2},
-]
-
-# TODO: change the algo to use the following decision tree
-# Source: Inspired by https://scoutapm.com/blog/slow_server_flow_chart
-# _yes means threshold >= 2
-# _no means threshold < 2
-# With threshold:
-# - 0: OK
-# - 1: CAREFUL
-# - 2: WARNING
-# - 3: CRITICAL
-tree_new = {
- 'cpu_iowait': {
- '_yes': {
- 'memswap': {
- '_yes': {
- 'mem': {
- '_yes': {
- # Once you've identified the offenders, the resolution will again
- # depend on whether their memory usage seems business-as-usual or not.
- # For example, a memory leak can be satisfactorily addressed by a one-time
- # or periodic restart of the process.
- # - if memory usage seems anomalous: kill the offending processes.
- # - if memory usage seems business-as-usual: add RAM to the server,
- # or split high-memory services out to other servers.
- '_msg': "Memory issue"
- },
- '_no': {
- # ???
- '_msg': "Swap issue"
- },
- }
- },
- '_no': {
- # Low swap means you have a "real" I/O wait problem. The next step is to see what's hogging your I/O.
- # iotop is an awesome tool for identifying I/O offenders. Two things to note:
- # unless you've already installed iotop, it's probably not already on your system.
- # Recommendation: install it before you need it -- it's no fun trying to install a troubleshooting
- # tool on an overloaded machine (iotop requires Linux kernel 2.6.20 or above).
- '_msg': "I/O issue"
- },
- }
- },
- '_no': {
- 'cpu_total': {
- '_yes': {
- 'cpu_user': {
- '_yes': {
- # We expect the user-time percentage to be high.
- # There's most likely a program or service you've configured on your server that's
- # hogging CPU.
- # Checking the % user time just confirms this. When you see that the % user time is high,
- # it's time to see which executable is monopolizing the CPU.
- # Once you've confirmed that the % user time is high, check the process list (also
- # provided by top).
- # By default, top sorts the process list by % CPU, so you can just look at the top
- # process or processes.
- # If there's a single process hogging the CPU in a way that seems abnormal, it's an
- # anomalous situation that a service restart can fix. If there are multiple processes
- # taking up CPU resources, or if there's one process that takes lots of resources
- # while otherwise functioning normally, then your setup may just be underpowered.
- # You'll need to upgrade your server (add more cores), or split services out onto
- # other boxes. In either case, you have a resolution:
- # - if the situation seems anomalous: kill the offending processes.
- # - if the situation seems typical given history: upgrade the server or add more servers.
- '_msg': "CPU issue with user process(es)"
- },
- '_no': {
- 'cpu_steal': {
- '_yes': {
- '_msg': "CPU issue with stolen time. System running the hypervisor may be too busy."
- },
- '_no': {'_msg': "CPU issue with system process(es)"},
- }
- },
- }
- },
- '_no': {
- '_yes': {
- # ???
- '_msg': "Memory issue"
- },
- '_no': {
- # Your slowness isn't due to CPU or I/O problems, so it's likely an application-specific issue.
- # It's also possible that the slowness is being caused by another server in your cluster, or
- # by an external service you rely on.
- # Start by checking important applications for uncharacteristic slowness (the DB is a good place
- # to start), and think through which parts of your infrastructure could be slowed down externally.
- # For example, do you use an externally hosted email service that could slow down critical
- # parts of your application?
- # If you suspect another server in your cluster, strace and lsof can provide information on
- # what the process is doing or waiting on. strace will show you which file descriptors are
- # being read or written to (or being attempted to be read from) and lsof can give you a
- # mapping of those file descriptors to network connections.
- '_msg': "External issue"
- },
- },
- }
- },
+ 'global': {
+ 'description': 'Global alert message',
+ 'unit': 'string',
}
}
-def global_message():
- """Parse the decision tree and return the message.
-
- Note: the message corresponds to the current threshold values.
- """
- # Compute the weight for each item in the tree
- current_thresholds = glances_thresholds.get()
- for i in tree:
- i['weight'] = sum([current_thresholds[t].value() for t in i['thresholds'] if t in current_thresholds])
- themax = max(tree, key=lambda d: d['weight'])
- if themax['weight'] >= themax['thresholds_min']:
- # The weight reaches the minimal threshold value: return the matching message
- return themax['msg']
- else:
- return tree[0]['msg']
-
-
class PluginModel(GlancesPluginModel):
"""Glances alert plugin.
@@ -265,10 +125,6 @@ class PluginModel(GlancesPluginModel):
"""Nothing to do here. Just return the global glances_log."""
# Set the stats to the glances_events
self.stats = glances_events.get()
- # Define the global message thanks to the current thresholds
- # and the decision tree
- # !!! Call directly in the msg_curse function
- # global_message()
def msg_curse(self, args=None, max_width=None):
"""Return the dict to display in the curse interface."""
@@ -280,8 +136,11 @@ class PluginModel(GlancesPluginModel):
return ret
# Build the string message
- # Header
- ret.append(self.curse_add_line(global_message(), "TITLE"))
+ # Header with the global message
+ if len(self.stats) > 0 and self.stats[0]['end'] < 0 and 'global' in self.stats[0]:
+ ret.append(self.curse_add_line(self.stats[0]['global'], "TITLE"))
+ else:
+ ret.append(self.curse_add_line("ALERTS", "TITLE"))
# Loop over alerts
for alert in self.stats:
# New line