diff options
-rw-r--r-- | database/sqlite/sqlite_aclk_alert.c | 18 | ||||
-rw-r--r-- | database/sqlite/sqlite_db_migration.c | 2 | ||||
-rw-r--r-- | database/sqlite/sqlite_db_migration.h | 1 | ||||
-rw-r--r-- | database/sqlite/sqlite_health.c | 316 | ||||
-rw-r--r-- | database/sqlite/sqlite_health.h | 2 | ||||
-rw-r--r-- | health/health.c | 81 | ||||
-rw-r--r-- | health/health.h | 4 | ||||
-rw-r--r-- | health/health_json.c | 170 | ||||
-rw-r--r-- | web/api/web_api_v1.c | 2 |
9 files changed, 348 insertions, 248 deletions
diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index 7f6ea65c5b..f22e3eb03b 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -75,7 +75,7 @@ static inline bool is_event_from_alert_variable_config(uint32_t unique_id, char return ret; } -#define MAX_REMOVED_PERIOD 86400 +#define MAX_REMOVED_PERIOD 604800 //a week //decide if some events should be sent or not #define SQL_SELECT_ALERT_BY_ID "SELECT hl.new_status, hl.config_hash_id, hl.unique_id FROM health_log_%s hl, aclk_alert_%s aa " \ @@ -321,7 +321,7 @@ void aclk_push_alert_event(struct aclk_sync_host_config *wc) } } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uint64_t first_sequence_id = 0; uint64_t last_sequence_id = 0; static __thread uint64_t log_first_sequence_id = 0; @@ -463,7 +463,7 @@ void aclk_push_alert_events_for_all_hosts(void) void sql_queue_existing_alerts_to_aclk(RRDHOST *host) { - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); BUFFER *sql = buffer_create(1024, &netdata_buffers_statistics.buffers_sqlite); @@ -747,7 +747,7 @@ void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id __maybe_unus void health_alarm_entry2proto_nolock(struct alarm_log_entry *alarm_log, ALARM_ENTRY *ae, RRDHOST *host) { char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); - char config_hash_id[GUID_LEN + 1]; + char config_hash_id[UUID_STR_LEN]; uuid_unparse_lower(ae->config_hash_id, config_hash_id); alarm_log->chart = strdupz(ae_chart_name(ae)); @@ -939,18 +939,14 @@ void aclk_push_alert_snapshot_event(char *node_id __maybe_unused) #endif } -#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE filtered_alert_unique_id NOT IN (SELECT unique_id FROM health_log_%s);" - +#define SQL_DELETE_ALERT_ENTRIES "DELETE FROM aclk_alert_%s WHERE filtered_alert_unique_id + %d < UNIXEPOCH();" void sql_aclk_alert_clean_dead_entries(RRDHOST *host) { - if (!claimed()) - return; - char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - char sql[512]; - snprintfz(sql,511,SQL_DELETE_ALERT_ENTRIES, uuid_str, uuid_str); + char sql[ACLK_SYNC_QUERY_SIZE]; + snprintfz(sql, ACLK_SYNC_QUERY_SIZE - 1, SQL_DELETE_ALERT_ENTRIES, uuid_str, MAX_REMOVED_PERIOD); char *err_msg = NULL; int rc = sqlite3_exec_monitored(db_meta, sql, NULL, NULL, &err_msg); diff --git a/database/sqlite/sqlite_db_migration.c b/database/sqlite/sqlite_db_migration.c index 3132ae2d05..beea37627e 100644 --- a/database/sqlite/sqlite_db_migration.c +++ b/database/sqlite/sqlite_db_migration.c @@ -12,7 +12,7 @@ static int return_int_cb(void *data, int argc, char **argv, char **column) } -static int table_exists_in_database(const char *table) +int table_exists_in_database(const char *table) { char *err_msg = NULL; char sql[128]; diff --git a/database/sqlite/sqlite_db_migration.h b/database/sqlite/sqlite_db_migration.h index 138643a491..edaac52698 100644 --- a/database/sqlite/sqlite_db_migration.h +++ b/database/sqlite/sqlite_db_migration.h @@ -8,5 +8,6 @@ int perform_database_migration(sqlite3 *database, int target_version); int perform_context_database_migration(sqlite3 *database, int target_version); +int table_exists_in_database(const char *table); #endif //NETDATA_SQLITE_DB_MIGRATION_H diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index 36a29d2e47..eeaba3ed6f 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -2,6 +2,7 @@ #include "sqlite_health.h" #include "sqlite_functions.h" +#include "sqlite_db_migration.h" #define MAX_HEALTH_SQL_SIZE 2048 #define sqlite3_bind_string_or_null(res,key,param) ((key) ? sqlite3_bind_text(res, param, string2str(key), -1, SQLITE_STATIC) : sqlite3_bind_null(res, param)) @@ -20,7 +21,7 @@ int sql_create_health_log_table(RRDHOST *host) { return 1; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CREATE_HEALTH_LOG_TABLE(uuid_str)); @@ -53,7 +54,7 @@ void sql_health_alarm_log_update(RRDHOST *host, ALARM_ENTRY *ae) { return; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_UPDATE_HEALTH_LOG(uuid_str)); @@ -128,7 +129,7 @@ void sql_health_alarm_log_insert(RRDHOST *host, ALARM_ENTRY *ae) { return; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INSERT_HEALTH_LOG(uuid_str)); @@ -358,34 +359,61 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) } /* Health related SQL queries - Cleans up the health_log table. + Get a count of rows from health log table */ -#define SQL_CLEANUP_HEALTH_LOG(guid,guid2,limit) "DELETE from health_log_%s where unique_id in (SELECT unique_id from health_log_%s order by unique_id asc LIMIT %lu);", guid, guid2, limit -void sql_health_alarm_log_cleanup(RRDHOST *host) { +#define SQL_COUNT_HEALTH_LOG(guid) "SELECT count(1) FROM health_log_%s;", guid +void sql_health_alarm_log_count(RRDHOST *host) { sqlite3_stmt *res = NULL; - static size_t rotate_every = 0; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; - if(unlikely(rotate_every == 0)) { - rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000); - if(rotate_every < 100) rotate_every = 100; + if (unlikely(!db_meta)) { + if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) + error_report("Database has not been initialized"); + return; } - if(likely(host->health.health_log_entries_written < rotate_every)) { + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COUNT_HEALTH_LOG(uuid_str)); + + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to count health log entries from db"); return; } + rc = sqlite3_step_monitored(res); + if (likely(rc == SQLITE_ROW)) + host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize the prepared statement to count health log entries from db"); + + info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written); +} + +/* Health related SQL queries + Cleans up the health_log table on a non-claimed host +*/ +#define SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(guid,limit) "DELETE FROM health_log_%s ORDER BY unique_id ASC LIMIT %lu;", guid, limit +void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every) { + sqlite3_stmt *res = NULL; + int rc; + char command[MAX_HEALTH_SQL_SIZE + 1]; + if (unlikely(!db_meta)) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) error_report("Database has not been initialized"); return; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG(uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED(uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { @@ -403,14 +431,17 @@ void sql_health_alarm_log_cleanup(RRDHOST *host) { host->health.health_log_entries_written = rotate_every; - sql_aclk_alert_clean_dead_entries(host); + snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); + if (unlikely(table_exists_in_database(command))) { + sql_aclk_alert_clean_dead_entries(host); + } } /* Health related SQL queries - Get a count of rows from health log table + Cleans up the health_log table on a claimed host */ -#define SQL_COUNT_HEALTH_LOG(guid) "SELECT count(1) FROM health_log_%s;", guid -void sql_health_alarm_log_count(RRDHOST *host) { +#define SQL_CLEANUP_HEALTH_LOG_CLAIMED(guid, guid2, guid3, limit) "DELETE from health_log_%s WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT unique_id FROM health_log_%s ORDER BY unique_id asc LIMIT %lu);", guid, guid2, guid3, limit +void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { sqlite3_stmt *res = NULL; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; @@ -421,26 +452,55 @@ void sql_health_alarm_log_count(RRDHOST *host) { return; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_COUNT_HEALTH_LOG(uuid_str)); + if (!table_exists_in_database(command)) { + sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + return; + } + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_CLAIMED(uuid_str, uuid_str, uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { - error_report("Failed to prepare statement to count health log entries from db"); + error_report("Failed to prepare statement to cleanup health log table"); return; } rc = sqlite3_step_monitored(res); - if (likely(rc == SQLITE_ROW)) - host->health.health_log_entries_written = (size_t) sqlite3_column_int64(res, 0); + if (unlikely(rc != SQLITE_DONE)) + error_report("Failed to cleanup health log table, rc = %d", rc); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) - error_report("Failed to finalize the prepared statement to count health log entries from db"); + error_report("Failed to finalize the prepared statement to cleanup health log table"); - info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", rrdhost_hostname(host), uuid_str, (unsigned long int) host->health.health_log_entries_written); + sql_health_alarm_log_count(host); + + sql_aclk_alert_clean_dead_entries(host); +} + +/* Health related SQL queries + Cleans up the health_log table. +*/ +void sql_health_alarm_log_cleanup(RRDHOST *host) { + static size_t rotate_every = 0; + + if(unlikely(rotate_every == 0)) { + rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000); + if(rotate_every < 100) rotate_every = 100; + } + + if(likely(host->health.health_log_entries_written < rotate_every)) { + return; + } + + if (!claimed()) { + sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + } else + sql_health_alarm_log_cleanup_claimed(host, rotate_every); } #define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \ @@ -608,7 +668,7 @@ void sql_check_removed_alerts_state(char *uuid_str) /* Health related SQL queries Load from the health log table */ -#define SQL_LOAD_HEALTH_LOG(guid,limit) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM (SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s order by unique_id desc limit %u) order by unique_id asc;", guid, limit +#define SQL_LOAD_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s group by alarm_id having max(alarm_event_id);", guid void sql_health_alarm_log_load(RRDHOST *host) { sqlite3_stmt *res = NULL; int ret; @@ -623,12 +683,12 @@ void sql_health_alarm_log_load(RRDHOST *host) { return; } - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); sql_check_removed_alerts_state(uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str, host->health_log.max)); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str)); ret = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(ret != SQLITE_OK)) { @@ -1076,7 +1136,7 @@ int alert_hash_and_store_config( EVP_MD_CTX_destroy(evpctx); fatal_assert(hash_len > sizeof(uuid_t)); - char uuid_str[GUID_LEN + 1]; + char uuid_str[UUID_STR_LEN]; uuid_unparse_lower(*((uuid_t *)&hash_value), uuid_str); uuid_copy(hash_id, *((uuid_t *)&hash_value)); @@ -1091,3 +1151,203 @@ int alert_hash_and_store_config( return 1; } + +#define SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT "SELECT new_status FROM health_log_%s WHERE alarm_id = %u AND unique_id != %u AND flags & %d ORDER BY unique_id DESC LIMIT 1" +int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status) +{ + int rc = 0, ret = -1; + char command[MAX_HEALTH_SQL_SIZE + 1]; + + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + + sqlite3_stmt *res = NULL; + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_HEALTH_LAST_EXECUTED_EVENT, uuid_str, ae->alarm_id, ae->unique_id, HEALTH_ENTRY_FLAG_EXEC_RUN); + + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to get last executed status"); + return ret; + } + + ret = 0; + while (sqlite3_step_monitored(res) == SQLITE_ROW) { + *last_executed_status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); + ret = 1; + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize the statement."); + + return ret; +} + +#define SQL_SELECT_HEALTH_LOG(guid) "SELECT hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type, chart_context FROM health_log_%s WHERE 1=1 ", guid +void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { + + buffer_strcat(wb, "["); + + unsigned int max = host->health_log.max; + unsigned int count = 0; + + sqlite3_stmt *res = NULL; + int rc; + + BUFFER *command = buffer_create(MAX_HEALTH_SQL_SIZE, NULL); + char uuid_str[UUID_STR_LEN]; + uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + + buffer_sprintf(command, SQL_SELECT_HEALTH_LOG(uuid_str)); + + if (chart) { + char chart_sql[MAX_HEALTH_SQL_SIZE + 1]; + snprintfz(chart_sql, MAX_HEALTH_SQL_SIZE, "AND chart = '%s' ", chart); + buffer_strcat(command, chart_sql); + } + + if (after) { + char after_sql[MAX_HEALTH_SQL_SIZE + 1]; + snprintfz(after_sql, MAX_HEALTH_SQL_SIZE, "AND unique_id > %u ", after); + buffer_strcat(command, after_sql); + } + + { + char limit_sql[MAX_HEALTH_SQL_SIZE + 1]; + snprintfz(limit_sql, MAX_HEALTH_SQL_SIZE, "ORDER BY unique_id DESC LIMIT %u ", max); + buffer_strcat(command, limit_sql); + } + + rc = sqlite3_prepare_v2(db_meta, buffer_tostring(command), -1, &res, 0); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement SQL_SELECT_HEALTH_LOG"); + return; + } + + while (sqlite3_step(res) == SQLITE_ROW) { + + char old_value_string[100 + 1]; + char new_value_string[100 + 1]; + + char config_hash_id[UUID_STR_LEN]; + uuid_unparse_lower(*((uuid_t *) sqlite3_column_blob(res, 4)), config_hash_id); + + char *edit_command = health_edit_command_from_source((char *)sqlite3_column_text(res, 18)); + + if (count) + buffer_sprintf(wb, ","); + + count++; + + buffer_sprintf( + wb, + "\n\t{\n" + "\t\t\"hostname\": \"%s\",\n" + "\t\t\"utc_offset\": %d,\n" + "\t\t\"timezone\": \"%s\",\n" + "\t\t\"unique_id\": %u,\n" + "\t\t\"alarm_id\": %u,\n" + "\t\t\"alarm_event_id\": %u,\n" + "\t\t\"config_hash_id\": \"%s\",\n" + "\t\t\"name\": \"%s\",\n" + "\t\t\"chart\": \"%s\",\n" + "\t\t\"context\": \"%s\",\n" + "\t\t\"family\": \"%s\",\n" + "\t\t\"class\": \"%s\",\n" + "\t\t\"component\": \"%s\",\n" + "\t\t\"type\": \"%s\",\n" + "\t\t\"processed\": %s,\n" + "\t\t\"updated\": %s,\n" + "\t\t\"exec_run\": %lu,\n" + "\t\t\"exec_failed\": %s,\n" + "\t\t\"exec\": \"%s\",\n" + "\t\t\"recipient\": \"%s\",\n" + "\t\t\"exec_code\": %d,\n" + "\t\t\"source\": \"%s\",\n" + "\t\t\"command\": \"%s\",\n" + "\t\t\"units\": \"%s\",\n" + "\t\t\"when\": %lu,\n" + "\t\t\"duration\": %lu,\n" + "\t\t\"non_clear_duration\": %lu,\n" + "\t\t\"status\": \"%s\",\n" + "\t\t\"old_status\": \"%s\",\n" + "\t\t\"delay\": %d,\n" + "\t\t\"delay_up_to_timestamp\": %lu,\n" + "\t\t\"updated_by_id\": %u,\n" + "\t\t\"updates_id\": %u,\n" + "\t\t\"value_string\": \"%s\",\n" + "\t\t\"old_value_string\": \"%s\",\n" + "\t\t\"last_repeat\": \"%lu\",\n" + "\t\t\"silenced\": \"%s\",\n", + sqlite3_column_text(res, 0), + host->utc_offset, + rrdhost_abbrev_timezone(host), + (unsigned int) sqlite3_column_int64(res, 1), + (unsigned int) sqlite3_column_int64(res, 2), + (unsigned int) sqlite3_column_int64(res, 3), + config_hash_id, + sqlite3_column_text(res, 13), + sqlite3_column_text(res, 14), + sqlite3_column_text(res, 31), + sqlite3_column_text(res, 15), + sqlite3_column_text(res, 28) ? (const char *) sqlite3_column_text(res, 28) : (char *) "Unknown", + sqlite3_column_text(res, 29) ? (const char *) sqlite3_column_text(res, 29) : (char *) "Unknown", + sqlite3_column_text(res, 30) ? (const char *) sqlite3_column_text(res, 30) : (char *) "Unknown", + (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false", + (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false", + (long unsigned int)sqlite3_column_int64(res, 11), + (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false", + sqlite3_column_text(res, 16) ? (const char *) sqlite3_column_text(res, 16) : string2str(host->health.health_default_exec), + sqlite3_column_text(res, 17) ? (const char *) sqlite3_column_text(res, 17) : string2str(host->health.health_default_recipient), + sqlite3_column_int(res, 21), + sqlite3_column_text(res, 18), + edit_command, + sqlite3_column_text(res, 19), + (long unsigned int)sqlite3_column_int64(res, 7), + (long unsigned int)sqlite3_column_int64(res, 8), + (long unsigned int)sqlite3_column_int64(res, 9), + rrdcalc_status2string(sqlite3_column_int(res, 22)), + rrdcalc_status2string(sqlite3_column_int(res, 23)), + sqlite3_column_int(res, 24), + (long unsigned int)sqlite3_column_int64(res, 12), + (unsigned int)sqlite3_column_int64(res, 5), + (unsigned int)sqlite3_column_int64(res, 6), + sqlite3_column_type(res, 25) == SQLITE_NULL ? "-" : format_value_and_unit(new_value_string, 100, sqlite3_column_double(res, 25), (char *) sqlite3_column_text(res, 19), -1), + sqlite3_column_type(res, 26) == SQLITE_NULL ? "-" : format_value_and_unit(old_value_string, 100, sqlite3_column_double(res, 26), (char *) sqlite3_column_text(res, 19), -1), + (long unsigned int)sqlite3_column_int64(res, 27), + (sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"); + + health_string2json(wb, "\t\t", "info", (char *) sqlite3_column_text(res, 20), ",\n"); + + if(unlikely(sqlite3_column_int64(res, 10) & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { + buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); + } + + buffer_strcat(wb, "\t\t\"value\":"); + if (sqlite3_column_type(res, 25) == SQLITE_NULL) + buffer_strcat(wb, "null"); + else + buffer_print_netdata_double(wb, sqlite3_column_double(res, 25)); + buffer_strcat(wb, ",\n"); + + buffer_strcat(wb, "\t\t\"old_value\":"); + if (sqlite3_column_type(res, 26) == SQLITE_NULL) + buffer_strcat(wb, "null"); + else + buffer_print_netdata_double(wb, sqlite3_column_double(res, 26)); + buffer_strcat(wb, "\n"); + + buffer_strcat(wb, "\t}"); + + freez(edit_command); + } + + buffer_strcat(wb, "\n]"); + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement for SQL_SELECT_HEALTH_LOG"); + + buffer_free(command); +} diff --git a/database/sqlite/sqlite_health.h b/database/sqlite/sqlite_health.h index 87060dacc5..96d090b549 100644 --- a/database/sqlite/sqlite_health.h +++ b/database/sqlite/sqlite_health.h @@ -14,4 +14,6 @@ void sql_health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); void sql_health_alarm_log_cleanup(RRDHOST *host); int alert_hash_and_store_config(uuid_t hash_id, struct alert_config *cfg, int store_hash); void sql_aclk_alert_clean_dead_entries(RRDHOST *host); +int sql_health_get_last_executed_event(RRDHOST *host, ALARM_ENTRY *ae, RRDCALC_STATUS *last_executed_status); +void sql_health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); #endif //NETDATA_SQLITE_HEALTH_H diff --git a/health/health.c b/health/health.c index 5c2b85bc5a..df4798a204 100644 --- a/health/health.c +++ b/health/health.c @@ -412,17 +412,13 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // find the previous notification for the same alarm // which we have run the exec script // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set + RRDCALC_STATUS last_executed_status = -3; if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { - uint32_t id = ae->alarm_id; - ALARM_ENTRY *t; - for(t = ae->next; t ; t = t->next) { - if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN) - break; - } + int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status); - if(likely(t)) { + if (likely(ret == 1)) { // we have executed this alarm notification in the past - if(t && t->new_status == ae->new_status) { + if(last_executed_status == ae->new_status) { // don't send the notification for the same status again debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); @@ -561,6 +557,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); + health_alarm_log_save(host, ae); } else { error("Failed to format command arguments"); } @@ -628,35 +625,32 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration host->health_last_processed_id = first_waiting; - bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; - - if (!cleanup_excess_log_entries) - return; - - // cleanup excess entries in the log + //delete those that are updated, no in progress execution, and is not repeating netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); - ALARM_ENTRY *last = NULL; - unsigned int count = host->health_log.max * 2 / 3; - for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ; - - if(ae && last && last->next == ae) - last->next = NULL; - else - ae = NULL; - - while(ae) { - debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id); - - ALARM_ENTRY *t = ae->next; - - if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { - health_alarm_wait_for_execution(ae); + ALARM_ENTRY *prev = host->health_log.alarms; + for(ae = host->health_log.alarms; ae ; ae = ae->next) { + + if((likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) && + (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + || + ((ae->new_status == RRDCALC_STATUS_REMOVED) && + (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && + (ae->when + 3600 < now_realtime_sec()))) + { + + if (ae == host->health_log.alarms) { + host->health_log.alarms = ae->next; + prev = ae->next; + } else { + prev->next = ae->next; + } health_alarm_log_free_one_nochecks_nounlink(ae); - host->health_log.count--; - } - - ae = t; + ae = prev; + } else + prev = ae; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -904,8 +898,24 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + return; + } + + if (wc->alert_queue_removed >= 1) { + wc->alert_queue_removed+=6; + } + } +#endif +} + static void health_execute_delayed_initializations(RRDHOST *host) { RRDSET *st; + bool must_postpone = false; if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); @@ -941,8 +951,11 @@ static void health_execute_delayed_initializations(RRDHOST *host) { rrdvar_store_for_chart(host, st); } rrddim_foreach_done(rd); + must_postpone = true; } rrdset_foreach_done(st); + if (must_postpone) + sql_health_postpone_queue_removed(host); } /** diff --git a/health/health.h b/health/health.h index 902e36c622..c36aabac7e 100644 --- a/health/health.h +++ b/health/health.h @@ -41,7 +41,6 @@ void health_reload(void); void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); @@ -87,11 +86,10 @@ void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); void *health_cmdapi_thread(void *ptr); -void health_label_log_save(RRDHOST *host); - char *health_edit_command_from_source(const char *source); void sql_refresh_hashes(void); void health_add_host_labels(void); +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix); #endif //NETDATA_HEALTH_H diff --git a/health/health_json.c b/health/health_json.c index ba18bddba9..4f81998f07 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -13,136 +13,6 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { - char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); - char config_hash_id[GUID_LEN + 1]; - uuid_unparse_lower(ae->config_hash_id, config_hash_id); - - buffer_sprintf(wb, - "\n\t{\n" - "\t\t\"hostname\": \"%s\",\n" - "\t\t\"utc_offset\": %d,\n" - "\t\t\"timezone\": \"%s\",\n" - "\t\t\"unique_id\": %u,\n" - "\t\t\"alarm_id\": %u,\n" - "\t\t\"alarm_event_id\": %u,\n" - "\t\t\"config_hash_id\": \"%s\",\n" - "\t\t\"name\": \"%s\",\n" - "\t\t\"chart\": \"%s\",\n" - "\t\t\"context\": \"%s\",\n" - "\t\t\"family\": \"%s\",\n" - "\t\t\"class\": \"%s\",\n" - "\t\t\"component\": \"%s\",\n" - "\t\t\"type\": \"%s\",\n" - "\t\t\"processed\": %s,\n" - "\t\t\"updated\": %s,\n" - "\t\t\"exec_run\": %lu,\n" - "\t\t\"exec_failed\": %s,\n" - "\t\t\"exec\": \"%s\",\n" - "\t\t\"recipient\": \"%s\",\n" - "\t\t\"exec_code\": %d,\n" - "\t\t\"source\": \"%s\",\n" - "\t\t\"command\": \"%s\",\n" - "\t\t\"units\": \"%s\",\n" - "\t\t\"when\": %lu,\n" - "\t\t\"duration\": %lu,\n" - "\t\t\"non_clear_duration\": %lu,\n" - "\t\t\"status\": \"%s\",\n" - "\t\t\"old_status\": \"%s\",\n" - "\t\t\"delay\": %d,\n" - "\t\t\"delay_up_to_timestamp\": %lu,\n" - "\t\t\"updated_by_id\": %u,\n" - "\t\t\"updates_id\": %u,\n" - "\t\t\"value_string\": \"%s\",\n" - "\t\t\"old_value_string\": \"%s\",\n" - "\t\t\"last_repeat\": \"%lu\",\n" - "\t\t\"silenced\": \"%s\",\n" - , rrdhost_hostname(host) - , host->utc_offset - , rrdhost_abbrev_timezone(host) - , ae->unique_id - , ae->alarm_id - , ae->alarm_event_id - , config_hash_id - , ae_name(ae) - , ae_chart_name(ae) - , ae_chart_context(ae) - , ae_family(ae) - , ae->classification?ae_classification(ae):"Unknown" - , ae->component?ae_component(ae):"Unknown" - , ae->type?ae_type(ae):"Unknown" - , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false" - , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false" - , (unsigned long)ae->exec_run_timestamp - , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false" - , ae->exec?ae_exec(ae):string2str(host->health.health_default_exec) - , ae->recipient?ae_recipient(ae):string2str(host->health.health_default_recipient) - , ae->exec_code - , ae_source(ae) - , edit_command - , ae_units(ae) - , (unsigned long)ae->when - , (unsigned long)ae->duration - , (unsigned long)ae->non_clear_duration - , rrdcalc_status2string(ae->new_status) - , rrdcalc_status2string(ae->old_status) - , ae->delay - , (unsigned long)ae->delay_up_to_timestamp - , ae->updated_by_id - , ae->updates_id - , ae_new_value_string(ae) - , ae_old_value_string(ae) - , (unsigned long)ae->last_repeat - , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" - ); - - health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n"); - - if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { - buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); - } - - buffer_strcat(wb, "\t\t\"value\":"); - buffer_print_netdata_double(wb, ae->new_value); - buffer_strcat(wb, ",\n"); - |