summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEmmanuel Vasilakis <mrzammler@mm.st>2023-07-12 11:24:16 +0300
committerGitHub <noreply@github.com>2023-07-12 11:24:16 +0300
commit38b38993a6547aa33a0591a7ce3e7461c197e893 (patch)
tree76d483877e6770f11f429d14a49f21af51503d39
parentb1bb7bd449af0567e2edabcefd325fffaea0a3fe (diff)
Keep health log history in seconds (#15314)
* rebase * changes queries to delete based on when * readme changes * no need to do migration * wip, protect un-updated events from cleanup * remove index on when_key * fix query for claimed cleanup * if set less than minimum, set minimum * fix query * correct config assign
-rw-r--r--daemon/config/README.md2
-rw-r--r--database/rrd.h1
-rw-r--r--database/sqlite/sqlite_functions.c2
-rw-r--r--database/sqlite/sqlite_health.c66
-rw-r--r--health/health.c22
-rw-r--r--health/health.h8
-rw-r--r--streaming/README.md1
-rw-r--r--streaming/receiver.c6
-rw-r--r--streaming/rrdpush.h1
-rw-r--r--streaming/stream.conf6
-rw-r--r--web/api/health/README.md4
11 files changed, 84 insertions, 35 deletions
diff --git a/daemon/config/README.md b/daemon/config/README.md
index 418b12cf9e..bc5a5885c1 100644
--- a/daemon/config/README.md
+++ b/daemon/config/README.md
@@ -175,7 +175,7 @@ monitoring](https://github.com/netdata/netdata/blob/master/health/README.md).
| script to execute on alarm | `/usr/libexec/netdata/plugins.d/alarm-notify.sh` | The script that sends alarm notifications. Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). |
| run at least every seconds | `10` | Controls how often all alarm conditions should be evaluated. |
| postpone alarms during hibernation for seconds | `60` | Prevents false alarms. May need to be increased if you get alarms during hibernation. |
-| rotate log every lines | 2000 | Controls the number of alarm log entries stored in `<lib directory>/health-log.db`, where `<lib directory>` is the one configured in the [\[global\] section](#global-section-options) |
+| health log history | `432000` | How long (in seconds) alarm events are kept in the agent's SQLite database. The default is 5 days; values below `86400` (1 day) are raised to `86400`. |
| enabled alarms | * | Defines which alarms to load from both user and stock directories. This is a [simple pattern](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) list of alarm or template names. Can be used to disable specific alarms. For example, `enabled alarms = !oom_kill *` will load all alarms except `oom_kill`. |
### [web] section options
diff --git a/database/rrd.h b/database/rrd.h
index 2f697c81df..95da17c82b 100644
--- a/database/rrd.h
+++ b/database/rrd.h
@@ -1105,6 +1105,7 @@ typedef struct alarm_log {
uint32_t next_alarm_id;
unsigned int count;
unsigned int max;
+ uint32_t health_log_history; // the health log history in seconds to be kept in db
ALARM_ENTRY *alarms;
RW_SPINLOCK spinlock;
} ALARM_LOG;
diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c
index d501214017..4200c15901 100644
--- a/database/sqlite/sqlite_functions.c
+++ b/database/sqlite/sqlite_functions.c
@@ -49,7 +49,6 @@ const char *database_config[] = {
"config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, "
"chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id)) ;",
- //TODO indexes
"CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);",
"CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, "
@@ -62,7 +61,6 @@ const char *database_config[] = {
"CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);",
"CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);",
"CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id);",
- //TODO more indexes
NULL
};
diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c
index c7a794f885..acaa9748e3 100644
--- a/database/sqlite/sqlite_health.c
+++ b/database/sqlite/sqlite_health.c
@@ -393,8 +393,8 @@ void sql_health_alarm_log_count(RRDHOST *host) {
/* Health related SQL queries
Cleans up the health_log_detail table on a non-claimed host
*/
-#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED(limit) "DELETE FROM health_log_detail where health_log_id in (select health_log_id from health_log where host_id = @host_id) ORDER BY unique_id ASC LIMIT %lu;", limit
-void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every) {
+#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED "DELETE FROM health_log_detail WHERE health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?1) AND when_key + ?2 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?3);"
+void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host) {
sqlite3_stmt *res = NULL;
int rc;
char command[MAX_HEALTH_SQL_SIZE + 1];
@@ -408,9 +408,7 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every
char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED((unsigned long int) (host->health.health_log_entries_written - rotate_every)));
-
- rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
+ rc = sqlite3_prepare_v2(db_meta, SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED, -1, &res, 0);
if (unlikely(rc != SQLITE_OK)) {
error_report("Failed to prepare statement to cleanup health log detail table (un-claimed)");
return;
@@ -423,15 +421,29 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every
return;
}
+ rc = sqlite3_bind_int64(res, 2, (sqlite3_int64)host->health_log.health_log_history);
+ if (unlikely(rc != SQLITE_OK)) {
+ error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED.");
+ sqlite3_finalize(res);
+ return;
+ }
+
+ rc = sqlite3_bind_blob(res, 3, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+ if (unlikely(rc != SQLITE_OK)) {
+ error_report("Failed to bind host_id for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED.");
+ sqlite3_finalize(res);
+ return;
+ }
+
rc = sqlite3_step_monitored(res);
if (unlikely(rc != SQLITE_DONE))
- error_report("Failed to cleanup health log table, rc = %d", rc);
+ error_report("Failed to cleanup health log detail table, rc = %d", rc);
rc = sqlite3_finalize(res);
if (unlikely(rc != SQLITE_OK))
error_report("Failed to finalize the prepared statement to cleanup health log detail table (un-claimed)");
- host->health.health_log_entries_written = rotate_every;
+ sql_health_alarm_log_count(host);
snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str);
if (unlikely(table_exists_in_database(command))) {
@@ -442,8 +454,8 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every
/* Health related SQL queries
Cleans up the health_log_detail table on a claimed host
*/
-#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid, limit) "DELETE from health_log_detail WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE hl.host_id = ?1 AND hl.health_log_id = hld.health_log_id) and health_log_id in (SELECT health_log_id FROM health_log WHERE host_id = ?2) ORDER BY unique_id asc LIMIT %lu;", guid, limit
-void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) {
+#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid) "DELETE from health_log_detail WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE hl.host_id = ?1 AND hl.health_log_id = hld.health_log_id) AND health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?2) AND when_key + ?3 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?4);", guid
+void sql_health_alarm_log_cleanup_claimed(RRDHOST *host) {
sqlite3_stmt *res = NULL;
int rc;
char command[MAX_HEALTH_SQL_SIZE + 1];
@@ -459,11 +471,11 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) {
snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str);
if (!table_exists_in_database(command)) {
- sql_health_alarm_log_cleanup_not_claimed(host, rotate_every);
+ sql_health_alarm_log_cleanup_not_claimed(host);
return;
}
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every)));
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str));
rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
if (unlikely(rc != SQLITE_OK)) {
@@ -485,9 +497,23 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) {
return;
}
+ rc = sqlite3_bind_int64(res, 3, (sqlite3_int64)host->health_log.health_log_history);
+ if (unlikely(rc != SQLITE_OK)) {
+ error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_CLAIMED.");
+ sqlite3_finalize(res);
+ return;
+ }
+
+ rc = sqlite3_bind_blob(res, 4, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC);
+ if (unlikely(rc != SQLITE_OK)) {
+ error_report("Failed to bind second host_id for SQL_CLEANUP_HEALTH_LOG_CLAIMED.");
+ sqlite3_finalize(res);
+ return;
+ }
+
rc = sqlite3_step_monitored(res);
if (unlikely(rc != SQLITE_DONE))
- error_report("Failed to cleanup health log table, rc = %d", rc);
+ error_report("Failed to cleanup health log detail table, rc = %d", rc);
rc = sqlite3_finalize(res);
if (unlikely(rc != SQLITE_OK))
@@ -496,27 +522,17 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) {
sql_health_alarm_log_count(host);
sql_aclk_alert_clean_dead_entries(host);
+
}
/* Health related SQL queries
Cleans up the health_log table.
*/
void sql_health_alarm_log_cleanup(RRDHOST *host) {
- static size_t rotate_every = 0;
-
- if(unlikely(rotate_every == 0)) {
- rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000);
- if(rotate_every < 100) rotate_every = 100;
- }
-
- if(likely(host->health.health_log_entries_written < rotate_every)) {
- return;
- }
-
if (!claimed()) {
- sql_health_alarm_log_cleanup_not_claimed(host, rotate_every);
+ sql_health_alarm_log_cleanup_not_claimed(host);
} else
- sql_health_alarm_log_cleanup_claimed(host, rotate_every);
+ sql_health_alarm_log_cleanup_claimed(host);
}
#define SQL_INJECT_REMOVED "insert into health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id) select health_log_id, ?1, ?2, ?3, 0, ?4, unixepoch(), 0, 0, flags, exec_run_timestamp, unixepoch(), info, exec_code, -2, new_status, delay, NULL, new_value, 0, ?5, now_usec(0) from health_log_detail where unique_id = ?6 and transition_id = ?7;"
diff --git a/health/health.c b/health/health.c
index 45a9761d84..4ae7e99974 100644
--- a/health/health.c
+++ b/health/health.c
@@ -22,7 +22,6 @@ char *silencers_filename;
SIMPLE_PATTERN *conf_enabled_alarms = NULL;
DICTIONARY *health_rrdvars;
-
void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
buffer_json_member_add_array(wb, key);
@@ -803,15 +802,28 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- netdata_log_error("Host '%s': health configuration has invalid max log entries %ld. Using default %u",
- rrdhost_hostname(host),
- n,
- host->health_log.max);
+ log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
host->health_log.max = (unsigned int)n;
+ uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
+ if (m < HEALTH_LOG_MINIMUM_HISTORY) {
+ log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
+ m = HEALTH_LOG_MINIMUM_HISTORY;
+ }
+
+ //default health log history is 5 days and not less than a day
+ if (host->health_log.health_log_history) {
+ if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY)
+ host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY;
+ } else
+ host->health_log.health_log_history = m;
+
+ log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
diff --git a/health/health.h b/health/health.h
index 74af753858..543bc56a14 100644
--- a/health/health.h
+++ b/health/health.h
@@ -31,6 +31,14 @@ void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_
#define HEALTH_LISTEN_BACKLOG 4096
#endif
+#ifndef HEALTH_LOG_DEFAULT_HISTORY
+#define HEALTH_LOG_DEFAULT_HISTORY 432000
+#endif
+
+#ifndef HEALTH_LOG_MINIMUM_HISTORY
+#define HEALTH_LOG_MINIMUM_HISTORY 86400
+#endif
+
#define HEALTH_SILENCERS_MAX_FILE_LEN 10000
extern char *silencers_filename;
diff --git a/streaming/README.md b/streaming/README.md
index bf11f32e47..370186acc2 100644
--- a/streaming/README.md
+++ b/streaming/README.md
@@ -55,6 +55,7 @@ node**. This file is automatically generated by Netdata the first time it is sta
| [`default memory mode`](#default-memory-mode) | `ram` | The [database](https://github.com/netdata/netdata/blob/master/database/README.md) to use for all nodes using this `API_KEY`. Valid settings are `dbengine`, `map`, `save`, `ram`, or `none`. [Read more &rarr;](#default-memory-mode) |
| `health enabled by default` | `auto` | Whether alarms and notifications should be enabled for nodes using this `API_KEY`. `auto` enables alarms when the child is connected. `yes` enables alarms always, and `no` disables alarms. |
| `default postpone alarms on connect seconds` | `60` | Postpone alarms and notifications for a period of time after the child connects. |
+| `default health log history` | `432000` | How long (in seconds) health log events are kept in the database for nodes using this `API_KEY`. |
| `default proxy enabled` | ` ` | Route metrics through a proxy. |
| `default proxy destination` | ` ` | Space-separated list of `IP:PORT` for proxies. |
| `default proxy api key` | ` ` | The `API_KEY` of the proxy. |
diff --git a/streaming/receiver.c b/streaming/receiver.c
index 2a81b824c9..c827ca7a1d 100644
--- a/streaming/receiver.c
+++ b/streaming/receiver.c
@@ -411,6 +411,8 @@ static bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) {
}
}
+ host->health_log.health_log_history = rpt->config.alarms_history;
+
// this is a test
// if(rpt->hops <= host->sender->hops)
// rrdpush_sender_thread_stop(host, "HOPS MISMATCH", false);
@@ -552,6 +554,7 @@ static void rrdpush_receive(struct receiver_state *rpt)
rpt->config.health_enabled = (int)default_health_enabled;
rpt->config.alarms_delay = 60;
+ rpt->config.alarms_history = HEALTH_LOG_DEFAULT_HISTORY;
rpt->config.rrdpush_enabled = (int)default_rrdpush_enabled;
rpt->config.rrdpush_destination = default_rrdpush_destination;
@@ -588,6 +591,9 @@ static void rrdpush_receive(struct receiver_state *rpt)
rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", rpt->config.alarms_delay);
rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", rpt->config.alarms_delay);
+ rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->key, "default health log history", rpt->config.alarms_history);
+ rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->machine_guid, "health log history", rpt->config.alarms_history);
+
rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rpt->config.rrdpush_enabled);
rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rpt->config.rrdpush_enabled);
diff --git a/streaming/rrdpush.h b/streaming/rrdpush.h
index 6781f1cfe3..73bd438c9d 100644
--- a/streaming/rrdpush.h
+++ b/streaming/rrdpush.h
@@ -400,6 +400,7 @@ struct receiver_state {
int update_every;
int health_enabled; // CONFIG_BOOLEAN_YES, CONFIG_BOOLEAN_NO, CONFIG_BOOLEAN_AUTO
time_t alarms_delay;
+ uint32_t alarms_history;
int rrdpush_enabled;
char *rrdpush_api_key; // DONT FREE - it is allocated in appconfig
char *rrdpush_send_charts_matching; // DONT FREE - it is allocated in appconfig
diff --git a/streaming/stream.conf b/streaming/stream.conf
index aa38e6a626..94e94cab7e 100644
--- a/streaming/stream.conf
+++ b/streaming/stream.conf
@@ -155,6 +155,9 @@
# postpone alarms for a short period after the sender is connected
default postpone alarms on connect seconds = 60
+ # seconds of health log events to keep
+ #default health log history = 432000
+
# need to route metrics differently? set these.
# the defaults are the ones at the [stream] section (above)
#default proxy enabled = yes | no
@@ -223,6 +226,9 @@
# postpone alarms when the sender connects
postpone alarms on connect seconds = 60
+ # seconds of health log events to keep
+ #health log history = 432000
+
# need to route metrics differently?
# the defaults are the ones at the [API KEY] section
#proxy enabled = yes | no
diff --git a/web/api/health/README.md b/web/api/health/README.md
index dd46854a1b..f820263b1e 100644
--- a/web/api/health/README.md
+++ b/web/api/health/README.md
@@ -28,12 +28,12 @@ This API call will return the alarms currently in WARNING or CRITICAL state.
### Event Log
-The size of the alarm log is configured in `netdata.conf`. There are 2 settings: the rotation of the alarm log file and the in memory size of the alarm log.
+The size of the alarm log is configured in `netdata.conf`. There are 2 settings: how long (in seconds) alarm events are kept in the database, and the in-memory size of the alarm log.
```
[health]
in memory max health log entries = 1000
- rotate log every lines = 2000
+ health log history = 432000
```
The API call retrieves all entries of the alarm log: