summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Emmanuel Vasilakis <mrzammler@mm.st>, 2023-05-23 15:56:56 +0300
committer: GitHub <noreply@github.com>, 2023-05-23 15:56:56 +0300
commit: c0c1e0e85a627d0509a37ea4e7ef00c2cf4aa29f (patch)
tree: 6c1208d26ad8fb47fcb78c242cfb1b6ada5a5907
parent: a53850f534f45bd86971160415befd6957ddb6d6 (diff)
Better cleanup of health log table (#15045)
-rw-r--r-- database/sqlite/sqlite_aclk_alert.c 18
-rw-r--r-- database/sqlite/sqlite_db_migration.c 2
-rw-r--r-- database/sqlite/sqlite_db_migration.h 1
-rw-r--r-- database/sqlite/sqlite_health.c 316
-rw-r--r-- database/sqlite/sqlite_health.h 2
-rw-r--r-- health/health.c 81
-rw-r--r-- health/health.h 4
-rw-r--r-- health/health_json.c 170
-rw-r--r-- web/api/web_api_v1.c 2
9 files changed, 348 insertions, 248 deletions
diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c
index 7f6ea65c5b..f22e3eb03b 100644
--- a/database/sqlite/sqlite_aclk_alert.c
+++ b/database/sqlite/sqlite_aclk_alert.c
@@ -75,7 +75,7 @@ static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char
return ret;
}
-#define MAX_REMOVED_PERIOD 86400
+#define MAX_REMOVED_PERIOD 604800 //a week
//decide if some events should be sent or not
#define SQL_SELECT_ALERT_BY_ID "SELECT hl.new_status, hl.config_hash_id, hl.unique_id FROM health_log_%s hl, aclk_alert_%s aa " \
@@ -321,7 +321,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc)
}
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uint64_t first_sequence_id = 0;
uint64_t last_sequence_id = 0;
static __thread uint64_t log_first_sequence_id = 0;
@@ -463,7 +463,7 @@ void aclk_push_alert_events_for_all_hosts(void)
void sql_queue_existing_alerts_to_aclk(RRDHOST *host)
{
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite);
@@ -747,7 +747,7 @@ void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id __maybe_unus
void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_ENTRY *ae, RRDHOST *host)
{
char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
- char config_hash_id[GUID_LEN + 1];
+ char config_hash_id[UUID_STR_LEN];
uuid_unparse_lower(ae->config_hash_id, config_hash_id);
alarm_log->chart = strdupz(ae_chart_name(ae));
@@ -939,18 +939,14 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused)
#endif
}
-#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE filtered_alert_unique_id NOT IN (SELECT unique_id FROM health_log_%s);"
-
+#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE filtered_alert_unique_id + %d < UNIXEPOCH();"
void sql_aclk_alert_clean_dead_entries(RRDHOST *host)
{
- if (!claimed())
- return;
-
char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
- char sql[512];
- snprintfz(sql,511,SQL_DELETE_ALERT_ENTRIES, uuid_str, uuid_str);
+ char sql[ACLK_SYNC_QUERY_SIZE];
+ snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_DELETE_ALERT_ENTRIES, uuid_str, MAX_REMOVED_PERIOD);
char *err_msg = NULL;
int rc = sqlite3_exec_monitored(db_meta, sql, NULL, NULL, &err_msg);
diff --git a/database/sqlite/sqlite_db_migration.c b/database/sqlite/sqlite_db_migration.c
index 3132ae2d05..beea37627e 100644
--- a/database/sqlite/sqlite_db_migration.c
+++ b/database/sqlite/sqlite_db_migration.c
@@ -12,7 +12,7 @@ static int return_int_cb(void *data, int argc, char **argv, char **column)
}
-static int table_exists_in_database(const char *table)
+int table_exists_in_database(const char *table)
{
char *err_msg = NULL;
char sql[128];
diff --git a/database/sqlite/sqlite_db_migration.h b/database/sqlite/sqlite_db_migration.h
index 138643a491..edaac52698 100644
--- a/database/sqlite/sqlite_db_migration.h
+++ b/database/sqlite/sqlite_db_migration.h
@@ -8,5 +8,6 @@
int perform_database_migration(sqlite3 *database, int target_version);
int perform_context_database_migration(sqlite3 *database, int target_version);
+int table_exists_in_database(const char *table);
#endif //NETDATA_SQLITE_DB_MIGRATION_H
diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c
index 36a29d2e47..eeaba3ed6f 100644
--- a/database/sqlite/sqlite_health.c
+++ b/database/sqlite/sqlite_health.c
@@ -2,6 +2,7 @@
#include "sqlite_health.h"
#include "sqlite_functions.h"
+#include "sqlite_db_migration.h"
#define MAX_HEALTH_SQL_SIZE 2048
#define sqlite3_bind_string_or_null(res,key,param) ((key) ? sqlite3_bind_text(res, param, string2str(key), -1, SQLITE_STATIC) : sqlite3_bind_null(res, param))
@@ -20,7 +21,7 @@ int sql_create_health_log_table(RRDHOST *host) {
return 1;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CREATE_HEALTH_LOG_TABLE(uuid_str));
@@ -53,7 +54,7 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) {
return;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_UPDATE_HEALTH_LOG(uuid_str));
@@ -128,7 +129,7 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) {
return;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INSERT_HEALTH_LOG(uuid_str));
@@ -358,34 +359,61 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae)
}
/* Health related SQL queries
- Cleans up the health_log table.
+ Get a count of rows from health log table
*/
-#define SQL_CLEANUP_HEALTH_LOG(guid,guid2,limit) "DELETE from health_log_%s where unique_id in (SELECT unique_id from health_log_%s order by unique_id asc LIMIT %lu);", guid, guid2, limit
-void sql_health_alarm_log_cleanup(RRDHOST *host) {
+#define SQL_COUNT_HEALTH_LOG(guid) "SELECT count(1) FROM health_log_%s;", guid
+void sql_health_alarm_log_count(RRDHOST *host) {
sqlite3_stmt *res = NULL;
- static size_t rotate_every = 0;
int rc;
char command[MAX_HEALTH_SQL_SIZE + 1];
- if(unlikely(rotate_every == 0)) {
- rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000);
- if(rotate_every < 100) rotate_every = 100;
+ if (unlikely(!db_meta)) {
+ if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE)
+ error_report("Database has not been initialized");
+ return;
}
- if(likely(host->health.health_log_entries_written < rotate_every)) {
+ char uuid_str[UUID_STR_LEN];
+ uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
+
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COUNT_HEALTH_LOG(uuid_str));
+
+ rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
+ if (unlikely(rc != SQLITE_OK)) {
+ error_report("Failed to prepare statement to count health log entries from db");
return;
}
+ rc = sqlite3_step_monitored(res);
+ if (likely(rc == SQLITE_ROW))
+ host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0);
+
+ rc = sqlite3_finalize(res);
+ if (unlikely(rc != SQLITE_OK))
+ error_report("Failed to finalize the prepared statement to count health log entries from db");
+
+ info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written);
+}
+
+/* Health related SQL queries
+ Cleans up the health_log table on a non-claimed host
+*/
+#define SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(guid,limit) "DELETE FROM health_log_%s ORDER BY unique_id ASC LIMIT %lu;", guid, limit
+void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every) {
+ sqlite3_stmt *res = NULL;
+ int rc;
+ char command[MAX_HEALTH_SQL_SIZE + 1];
+
if (unlikely(!db_meta)) {
if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE)
error_report("Database has not been initialized");
return;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every)));
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every)));
rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
if (unlikely(rc != SQLITE_OK)) {
@@ -403,14 +431,17 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) {
host->health.health_log_entries_written = rotate_every;
- sql_aclk_alert_clean_dead_entries(host);
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str);
+ if (unlikely(table_exists_in_database(command))) {
+ sql_aclk_alert_clean_dead_entries(host);
+ }
}
/* Health related SQL queries
- Get a count of rows from health log table
+ Cleans up the health_log table on a claimed host
*/
-#define SQL_COUNT_HEALTH_LOG(guid) "SELECT count(1) FROM health_log_%s;", guid
-void sql_health_alarm_log_count(RRDHOST *host) {
+#define SQL_CLEANUP_HEALTH_LOG_CLAIMED(guid, guid2, guid3, limit) "DELETE from health_log_%s WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT unique_id FROM health_log_%s ORDER BY unique_id asc LIMIT %lu);", guid, guid2, guid3, limit
+void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) {
sqlite3_stmt *res = NULL;
int rc;
char command[MAX_HEALTH_SQL_SIZE + 1];
@@ -421,26 +452,55 @@ void sql_health_alarm_log_count(RRDHOST *host) {
return;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str);
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COUNT_HEALTH_LOG(uuid_str));
+ if (!table_exists_in_database(command)) {
+ sql_health_alarm_log_cleanup_not_claimed(host, rotate_every);
+ return;
+ }
+
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_CLAIMED(uuid_str, uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every)));
rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
if (unlikely(rc != SQLITE_OK)) {
- error_report("Failed to prepare statement to count health log entries from db");
+ error_report("Failed to prepare statement to cleanup health log table");
return;
}
rc = sqlite3_step_monitored(res);
- if (likely(rc == SQLITE_ROW))
- host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0);
+ if (unlikely(rc != SQLITE_DONE))
+ error_report("Failed to cleanup health log table, rc = %d", rc);
rc = sqlite3_finalize(res);
if (unlikely(rc != SQLITE_OK))
- error_report("Failed to finalize the prepared statement to count health log entries from db");
+ error_report("Failed to finalize the prepared statement to cleanup health log table");
- info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written);
+ sql_health_alarm_log_count(host);
+
+ sql_aclk_alert_clean_dead_entries(host);
+}
+
+/* Health related SQL queries
+   Entry point for cleaning up the health_log table of a host.
+   Does nothing until host->health.health_log_entries_written reaches the
+   "rotate log every lines" setting of the health config section
+   (default 2000, never less than 100), then hands over to the claimed or
+   not-claimed cleanup variant.
+*/
+void sql_health_alarm_log_cleanup(RRDHOST *host) {
+    // threshold is read from the configuration once and cached for later calls
+    static size_t rotate_every = 0;
+
+    if (unlikely(!rotate_every)) {
+        size_t configured = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000);
+        rotate_every = (configured < 100) ? 100 : configured;
+    }
+
+    // below the threshold there is nothing to rotate
+    if (likely(host->health.health_log_entries_written < rotate_every))
+        return;
+
+    if (claimed())
+        sql_health_alarm_log_cleanup_claimed(host, rotate_every);
+    else
+        sql_health_alarm_log_cleanup_not_claimed(host, rotate_every);
 }
#define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \
@@ -608,7 +668,7 @@ void sql_check_removed_alerts_state(char *uuid_str)
/* Health related SQL queries
Load from the health log table
*/
-#define SQL_LOAD_HEALTH_LOG(guid,limit) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM (SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s order by unique_id desc limit %u) order by unique_id asc;", guid, limit
+#define SQL_LOAD_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s group by alarm_id having max(alarm_event_id);", guid
void sql_health_alarm_log_load(RRDHOST *host) {
sqlite3_stmt *res = NULL;
int ret;
@@ -623,12 +683,12 @@ void sql_health_alarm_log_load(RRDHOST *host) {
return;
}
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
sql_check_removed_alerts_state(uuid_str);
- snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str, host->health_log.max));
+ snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str));
ret = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
if (unlikely(ret != SQLITE_OK)) {
@@ -1076,7 +1136,7 @@ int alert_hash_and_store_config(
EVP_MD_CTX_destroy(evpctx);
fatal_assert(hash_len > sizeof(uuid_t));
- char uuid_str[GUID_LEN + 1];
+ char uuid_str[UUID_STR_LEN];
uuid_unparse_lower(*((uuid_t *)&hash_value), uuid_str);
uuid_copy(hash_id, *((uuid_t *)&hash_value));
@@ -1091,3 +1151,203 @@ int alert_hash_and_store_config(
return 1;
}
+
+#define SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT "SELECT new_status FROM health_log_%s WHERE alarm_id = %u AND unique_id != %u AND flags & %d ORDER BY unique_id DESC LIMIT 1"
+/*
+ * Find the status recorded by the latest other event of the same alarm that
+ * has HEALTH_ENTRY_FLAG_EXEC_RUN set in its flags.
+ *
+ * The query looks in the host's health_log table for rows with ae->alarm_id,
+ * excluding ae->unique_id itself, and takes the one with the highest unique_id.
+ *
+ * Returns:
+ *    1  a row was found; *last_executed_status holds its new_status
+ *    0  no matching row; *last_executed_status is left untouched
+ *   -1  the metadata database is not initialized or the statement could not be prepared
+ */
+int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status)
+{
+    int rc = 0, ret = -1;
+    char command[MAX_HEALTH_SQL_SIZE + 1];
+
+    // same guard the other health log queries in this file use
+    if (unlikely(!db_meta)) {
+        if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE)
+            error_report("Database has not been initialized");
+        return ret;
+    }
+
+    char uuid_str[UUID_STR_LEN];
+    uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
+
+    sqlite3_stmt *res = NULL;
+
+    snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT, uuid_str, ae->alarm_id, ae->unique_id, HEALTH_ENTRY_FLAG_EXEC_RUN);
+
+    rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0);
+    if (rc != SQLITE_OK) {
+        error_report("Failed to prepare statement when trying to get last executed status");
+        return ret;
+    }
+
+    // the statement is LIMIT 1, so the loop body runs at most once
+    ret = 0;
+    while (sqlite3_step_monitored(res) == SQLITE_ROW) {
+        *last_executed_status = (RRDCALC_STATUS) sqlite3_column_int(res, 0);
+        ret = 1;
+    }
+
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize the statement.");
+
+    return ret;
+}
+
+#define SQL_SELECT_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s WHERE 1=1 ", guid
+/*
+ * Append the host's health log, read from its health_log table, to wb as a JSON array.
+ *
+ *   host   host whose health_log table is queried
+ *   wb     output buffer; always receives a terminated array, "[\n]" when
+ *          there are no rows or the query cannot be run
+ *   after  when non-zero, only rows with unique_id greater than this are returned
+ *   chart  when non-NULL, only rows of this chart are returned
+ *
+ * Rows are emitted newest first (ORDER BY unique_id DESC), at most
+ * host->health_log.max of them.
+ */
+void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
+
+    buffer_strcat(wb, "[");
+
+    unsigned int max = host->health_log.max;
+    unsigned int count = 0;
+
+    sqlite3_stmt *res = NULL;
+    int rc;
+
+    BUFFER *command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL);
+    char uuid_str[UUID_STR_LEN];
+    uuid_unparse_lower_fix(&host->host_uuid, uuid_str);
+
+    buffer_sprintf(command, SQL_SELECT_HEALTH_LOG(uuid_str));
+
+    // the chart name is bound as a parameter, never pasted into the SQL text,
+    // so quotes in it can neither break nor alter the statement
+    if (chart)
+        buffer_strcat(command, "AND chart = ?1 ");
+
+    if (after)
+        buffer_sprintf(command, "AND unique_id > %u ", after);
+
+    buffer_sprintf(command, "ORDER BY unique_id DESC LIMIT %u ", max);
+
+    rc = sqlite3_prepare_v2(db_meta, buffer_tostring(command), -1, &res, 0);
+    if (unlikely(rc != SQLITE_OK)) {
+        error_report("Failed to prepare statement SQL_SELECT_HEALTH_LOG");
+        goto done;
+    }
+
+    if (chart) {
+        // chart outlives the statement, which is finalized before returning
+        rc = sqlite3_bind_text(res, 1, chart, -1, SQLITE_STATIC);
+        if (unlikely(rc != SQLITE_OK)) {
+            error_report("Failed to bind chart parameter for SQL_SELECT_HEALTH_LOG");
+            goto done;
+        }
+    }
+
+    while (sqlite3_step(res) == SQLITE_ROW) {
+
+        char old_value_string[100 + 1];
+        char new_value_string[100 + 1];
+
+        // a NULL config_hash_id column must not be dereferenced;
+        // print the all-zero UUID for such rows instead
+        char config_hash_id[UUID_STR_LEN];
+        const void *hash_blob = sqlite3_column_blob(res, 4);
+        if (likely(hash_blob))
+            uuid_unparse_lower(*((uuid_t *) hash_blob), config_hash_id);
+        else {
+            uuid_t zero_uuid = { 0 };
+            uuid_unparse_lower(zero_uuid, config_hash_id);
+        }
+
+        // same fallback the in-memory alarm entry code uses when there is no source
+        const char *source = (const char *) sqlite3_column_text(res, 18);
+        char *edit_command = source ? health_edit_command_from_source(source) : strdupz("UNKNOWN=0=UNKNOWN");
+
+        sqlite3_int64 flags = sqlite3_column_int64(res, 10);
+
+        if (count)
+            buffer_strcat(wb, ",");
+
+        count++;
+
+        buffer_sprintf(
+            wb,
+            "\n\t{\n"
+            "\t\t\"hostname\": \"%s\",\n"
+            "\t\t\"utc_offset\": %d,\n"
+            "\t\t\"timezone\": \"%s\",\n"
+            "\t\t\"unique_id\": %u,\n"
+            "\t\t\"alarm_id\": %u,\n"
+            "\t\t\"alarm_event_id\": %u,\n"
+            "\t\t\"config_hash_id\": \"%s\",\n"
+            "\t\t\"name\": \"%s\",\n"
+            "\t\t\"chart\": \"%s\",\n"
+            "\t\t\"context\": \"%s\",\n"
+            "\t\t\"family\": \"%s\",\n"
+            "\t\t\"class\": \"%s\",\n"
+            "\t\t\"component\": \"%s\",\n"
+            "\t\t\"type\": \"%s\",\n"
+            "\t\t\"processed\": %s,\n"
+            "\t\t\"updated\": %s,\n"
+            "\t\t\"exec_run\": %lu,\n"
+            "\t\t\"exec_failed\": %s,\n"
+            "\t\t\"exec\": \"%s\",\n"
+            "\t\t\"recipient\": \"%s\",\n"
+            "\t\t\"exec_code\": %d,\n"
+            "\t\t\"source\": \"%s\",\n"
+            "\t\t\"command\": \"%s\",\n"
+            "\t\t\"units\": \"%s\",\n"
+            "\t\t\"when\": %lu,\n"
+            "\t\t\"duration\": %lu,\n"
+            "\t\t\"non_clear_duration\": %lu,\n"
+            "\t\t\"status\": \"%s\",\n"
+            "\t\t\"old_status\": \"%s\",\n"
+            "\t\t\"delay\": %d,\n"
+            "\t\t\"delay_up_to_timestamp\": %lu,\n"
+            "\t\t\"updated_by_id\": %u,\n"
+            "\t\t\"updates_id\": %u,\n"
+            "\t\t\"value_string\": \"%s\",\n"
+            "\t\t\"old_value_string\": \"%s\",\n"
+            "\t\t\"last_repeat\": \"%lu\",\n"
+            "\t\t\"silenced\": \"%s\",\n",
+            sqlite3_column_text(res, 0),
+            host->utc_offset,
+            rrdhost_abbrev_timezone(host),
+            (unsigned int) sqlite3_column_int64(res, 1),
+            (unsigned int) sqlite3_column_int64(res, 2),
+            (unsigned int) sqlite3_column_int64(res, 3),
+            config_hash_id,
+            sqlite3_column_text(res, 13),
+            sqlite3_column_text(res, 14),
+            sqlite3_column_text(res, 31),
+            sqlite3_column_text(res, 15),
+            sqlite3_column_text(res, 28) ? (const char *) sqlite3_column_text(res, 28) : (char *) "Unknown",
+            sqlite3_column_text(res, 29) ? (const char *) sqlite3_column_text(res, 29) : (char *) "Unknown",
+            sqlite3_column_text(res, 30) ? (const char *) sqlite3_column_text(res, 30) : (char *) "Unknown",
+            (flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false",
+            (flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false",
+            (long unsigned int)sqlite3_column_int64(res, 11),
+            (flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false",
+            sqlite3_column_text(res, 16) ? (const char *) sqlite3_column_text(res, 16) : string2str(host->health.health_default_exec),
+            sqlite3_column_text(res, 17) ? (const char *) sqlite3_column_text(res, 17) : string2str(host->health.health_default_recipient),
+            sqlite3_column_int(res, 21),
+            sqlite3_column_text(res, 18),
+            edit_command,
+            sqlite3_column_text(res, 19),
+            (long unsigned int)sqlite3_column_int64(res, 7),
+            (long unsigned int)sqlite3_column_int64(res, 8),
+            (long unsigned int)sqlite3_column_int64(res, 9),
+            rrdcalc_status2string(sqlite3_column_int(res, 22)),
+            rrdcalc_status2string(sqlite3_column_int(res, 23)),
+            sqlite3_column_int(res, 24),
+            (long unsigned int)sqlite3_column_int64(res, 12),
+            (unsigned int)sqlite3_column_int64(res, 5),
+            (unsigned int)sqlite3_column_int64(res, 6),
+            sqlite3_column_type(res, 25) == SQLITE_NULL ? "-" : format_value_and_unit(new_value_string, 100, sqlite3_column_double(res, 25), (char *) sqlite3_column_text(res, 19), -1),
+            sqlite3_column_type(res, 26) == SQLITE_NULL ? "-" : format_value_and_unit(old_value_string, 100, sqlite3_column_double(res, 26), (char *) sqlite3_column_text(res, 19), -1),
+            (long unsigned int)sqlite3_column_int64(res, 27),
+            (flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false");
+
+        health_string2json(wb, "\t\t", "info", (char *) sqlite3_column_text(res, 20), ",\n");
+
+        if(unlikely(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
+            buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
+        }
+
+        buffer_strcat(wb, "\t\t\"value\":");
+        if (sqlite3_column_type(res, 25) == SQLITE_NULL)
+            buffer_strcat(wb, "null");
+        else
+            buffer_print_netdata_double(wb, sqlite3_column_double(res, 25));
+        buffer_strcat(wb, ",\n");
+
+        buffer_strcat(wb, "\t\t\"old_value\":");
+        if (sqlite3_column_type(res, 26) == SQLITE_NULL)
+            buffer_strcat(wb, "null");
+        else
+            buffer_print_netdata_double(wb, sqlite3_column_double(res, 26));
+        buffer_strcat(wb, "\n");
+
+        buffer_strcat(wb, "\t}");
+
+        freez(edit_command);
+    }
+
+done:
+    // reached on success and on every failure path, so the array opened at
+    // the top is always closed and the query buffer is always released
+    buffer_strcat(wb, "\n]");
+
+    // res is NULL when prepare failed; finalizing a NULL statement is a no-op
+    rc = sqlite3_finalize(res);
+    if (unlikely(rc != SQLITE_OK))
+        error_report("Failed to finalize statement for SQL_SELECT_HEALTH_LOG");
+
+    buffer_free(command);
+}
diff --git a/database/sqlite/sqlite_health.h b/database/sqlite/sqlite_health.h
index 87060dacc5..96d090b549 100644
--- a/database/sqlite/sqlite_health.h
+++ b/database/sqlite/sqlite_health.h
@@ -14,4 +14,6 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
void sql_health_alarm_log_cleanup(RRDHOST *host);
int alert_hash_and_store_config(uuid_t hash_id, struct alert_config *cfg, int store_hash);
void sql_aclk_alert_clean_dead_entries(RRDHOST *host);
+int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status);
+void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
#endif //NETDATA_SQLITE_HEALTH_H
diff --git a/health/health.c b/health/health.c
index 5c2b85bc5a..df4798a204 100644
--- a/health/health.c
+++ b/health/health.c
@@ -412,17 +412,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// find the previous notification for the same alarm
// which we have run the exec script
// exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
+ RRDCALC_STATUS last_executed_status = -3;
if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
- uint32_t id = ae->alarm_id;
- ALARM_ENTRY *t;
- for(t = ae->next; t ; t = t->next) {
- if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
- break;
- }
+ int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);
- if(likely(t)) {
+ if (likely(ret == 1)) {
// we have executed this alarm notification in the past
- if(t && t->new_status == ae->new_status) {
+ if(last_executed_status == ae->new_status) {
// don't send the notification for the same status again
debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
@@ -561,6 +557,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
+ health_alarm_log_save(host, ae);
} else {
error("Failed to format command arguments");
}
@@ -628,35 +625,32 @@ static inline void health_alarm_log_process(RRDHOST *host) {
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
- bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
-
- if (!cleanup_excess_log_entries)
- return;
-
- // cleanup excess entries in the log
+ //delete those that are updated, no in progress execution, and is not repeating
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
- ALARM_ENTRY *last = NULL;
- unsigned int count = host->health_log.max * 2 / 3;
- for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
-
- if(ae && last && last->next == ae)
- last->next = NULL;
- else
- ae = NULL;
-
- while(ae) {
- debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
-
- ALARM_ENTRY *t = ae->next;
-
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
- health_alarm_wait_for_execution(ae);
+    // prev is the last entry kept so far; it stays NULL while entries are
+    // still being dropped from the head of the list
+    ALARM_ENTRY *prev = NULL, *next = NULL;
+    for(ae = host->health_log.alarms; ae ; ae = next) {
+        // read the successor first: ae may be freed below, and the loop
+        // must neither touch freed memory nor skip the entry that follows
+        next = ae->next;
+
+        if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+            !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
+           ||
+           ((ae->new_status == RRDCALC_STATUS_REMOVED) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+            (ae->when + 3600 < now_realtime_sec())))
+        {
+
+            if (ae == host->health_log.alarms) {
+                // unlinking the head: the list now starts at its successor
+                host->health_log.alarms = next;
+            } else {
+                // ae is not the head, so an earlier entry was kept and prev is set
+                prev->next = next;
+            }
             health_alarm_log_free_one_nochecks_nounlink(ae);
-            host->health_log.count--;
-        }
-
-        ae = t;
+        } else
+            prev = ae;
     }
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
@@ -904,8 +898,24 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
return 0;
}
+/*
+ * When built with ENABLE_ACLK and netdata_cloud_setting is set, add 6 to the
+ * host's alert_queue_removed counter, but only if it is already at least 1.
+ * Compiles to a no-op without ENABLE_ACLK.
+ * NOTE(review): the unit of alert_queue_removed is not visible here - confirm
+ * what an increment of 6 amounts to.
+ */
+static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
+#ifdef ENABLE_ACLK
+    if (!netdata_cloud_setting)
+        return;
+
+    struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+    if (unlikely(!wc))
+        return;
+
+    // a counter below 1 is left as it is
+    if (wc->alert_queue_removed >= 1)
+        wc->alert_queue_removed += 6;
+#endif
+}
+
static void health_execute_delayed_initializations(RRDHOST *host) {
RRDSET *st;
+ bool must_postpone = false;
if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
@@ -941,8 +951,11 @@ static void health_execute_delayed_initializations(RRDHOST *host) {
rrdvar_store_for_chart(host, st);
}
rrddim_foreach_done(rd);
+ must_postpone = true;
}
rrdset_foreach_done(st);
+ if (must_postpone)
+ sql_health_postpone_queue_removed(host);
}
/**
diff --git a/health/health.h b/health/health.h
index 902e36c622..c36aabac7e 100644
--- a/health/health.h
+++ b/health/health.h
@@ -41,7 +41,6 @@ void health_reload(void);
void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
-void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf);
@@ -87,11 +86,10 @@ void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae);
void *health_cmdapi_thread(void *ptr);
-void health_label_log_save(RRDHOST *host);
-
char *health_edit_command_from_source(const char *source);
void sql_refresh_hashes(void);
void health_add_host_labels(void);
+void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix);
#endif //NETDATA_HEALTH_H
diff --git a/health/health_json.c b/health/health_json.c
index ba18bddba9..4f81998f07 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -13,136 +13,6 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const
buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
}
-void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
- char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
- char config_hash_id[GUID_LEN + 1];
- uuid_unparse_lower(ae->config_hash_id, config_hash_id);
-
- buffer_sprintf(wb,
- "\n\t{\n"
- "\t\t\"hostname\": \"%s\",\n"
- "\t\t\"utc_offset\": %d,\n"
- "\t\t\"timezone\": \"%s\",\n"
- "\t\t\"unique_id\": %u,\n"
- "\t\t\"alarm_id\": %u,\n"
- "\t\t\"alarm_event_id\": %u,\n"
- "\t\t\"config_hash_id\": \"%s\",\n"
- "\t\t\"name\": \"%s\",\n"
- "\t\t\"chart\": \"%s\",\n"
- "\t\t\"context\": \"%s\",\n"
- "\t\t\"family\": \"%s\",\n"
- "\t\t\"class\": \"%s\",\n"
- "\t\t\"component\": \"%s\",\n"
- "\t\t\"type\": \"%s\",\n"
- "\t\t\"processed\": %s,\n"
- "\t\t\"updated\": %s,\n"
- "\t\t\"exec_run\": %lu,\n"
- "\t\t\"exec_failed\": %s,\n"
- "\t\t\"exec\": \"%s\",\n"
- "\t\t\"recipient\": \"%s\",\n"
- "\t\t\"exec_code\": %d,\n"
- "\t\t\"source\": \"%s\",\n"
- "\t\t\"command\": \"%s\",\n"
- "\t\t\"units\": \"%s\",\n"
- "\t\t\"when\": %lu,\n"
- "\t\t\"duration\": %lu,\n"
- "\t\t\"non_clear_duration\": %lu,\n"
- "\t\t\"status\": \"%s\",\n"
- "\t\t\"old_status\": \"%s\",\n"
- "\t\t\"delay\": %d,\n"
- "\t\t\"delay_up_to_timestamp\": %lu,\n"
- "\t\t\"updated_by_id\": %u,\n"
- "\t\t\"updates_id\": %u,\n"
- "\t\t\"value_string\": \"%s\",\n"
- "\t\t\"old_value_string\": \"%s\",\n"
- "\t\t\"last_repeat\": \"%lu\",\n"
- "\t\t\"silenced\": \"%s\",\n"
- , rrdhost_hostname(host)
- , host->utc_offset
- , rrdhost_abbrev_timezone(host)
- , ae->unique_id
- , ae->alarm_id
- , ae->alarm_event_id
- , config_hash_id
- , ae_name(ae)
- , ae_chart_name(ae)
- , ae_chart_context(ae)
- , ae_family(ae)
- , ae->classification?ae_classification(ae):"Unknown"
- , ae->component?ae_component(ae):"Unknown"
- , ae->type?ae_type(ae):"Unknown"
- , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
- , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
- , (unsigned long)ae->exec_run_timestamp
- , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
- , ae->exec?ae_exec(ae):string2str(host->health.health_default_exec)
- , ae->recipient?ae_recipient(ae):string2str(host->health.health_default_recipient)
- , ae->exec_code
- , ae_source(ae)
- , edit_command
- , ae_units(ae)
- , (unsigned long)ae->when
- , (unsigned long)ae->duration
- , (unsigned long)ae->non_clear_duration
- , rrdcalc_status2string(ae->new_status)
- , rrdcalc_status2string(ae->old_status)
- , ae->delay
- , (unsigned long)ae->delay_up_to_timestamp
- , ae->updated_by_id
- , ae->updates_id
- , ae_new_value_string(ae)
- , ae_old_value_string(ae)
- , (unsigned long)ae->last_repeat
- , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
- );
-
- health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n");
-
- if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
- buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
- }
-