diff options
-rw-r--r-- | daemon/config/README.md | 2 | ||||
-rw-r--r-- | database/rrd.h | 1 | ||||
-rw-r--r-- | database/sqlite/sqlite_functions.c | 2 | ||||
-rw-r--r-- | database/sqlite/sqlite_health.c | 66 | ||||
-rw-r--r-- | health/health.c | 22 | ||||
-rw-r--r-- | health/health.h | 8 | ||||
-rw-r--r-- | streaming/README.md | 1 | ||||
-rw-r--r-- | streaming/receiver.c | 6 | ||||
-rw-r--r-- | streaming/rrdpush.h | 1 | ||||
-rw-r--r-- | streaming/stream.conf | 6 | ||||
-rw-r--r-- | web/api/health/README.md | 4 |
11 files changed, 84 insertions, 35 deletions
diff --git a/daemon/config/README.md b/daemon/config/README.md index 418b12cf9e..bc5a5885c1 100644 --- a/daemon/config/README.md +++ b/daemon/config/README.md @@ -175,7 +175,7 @@ monitoring](https://github.com/netdata/netdata/blob/master/health/README.md). | script to execute on alarm | `/usr/libexec/netdata/plugins.d/alarm-notify.sh` | The script that sends alarm notifications. Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). | | run at least every seconds | `10` | Controls how often all alarm conditions should be evaluated. | | postpone alarms during hibernation for seconds | `60` | Prevents false alarms. May need to be increased if you get alarms during hibernation. | -| rotate log every lines | 2000 | Controls the number of alarm log entries stored in `<lib directory>/health-log.db`, where `<lib directory>` is the one configured in the [\[global\] section](#global-section-options) | +| health log history | `432000` | Specifies the history of alarm events (in seconds) kept in the agent's sqlite database. | | enabled alarms | * | Defines which alarms to load from both user and stock directories. This is a [simple pattern](https://github.com/netdata/netdata/blob/master/libnetdata/simple_pattern/README.md) list of alarm or template names. Can be used to disable specific alarms. For example, `enabled alarms = !oom_kill *` will load all alarms except `oom_kill`. | ### [web] section options diff --git a/database/rrd.h b/database/rrd.h index 2f697c81df..95da17c82b 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -1105,6 +1105,7 @@ typedef struct alarm_log { uint32_t next_alarm_id; unsigned int count; unsigned int max; + uint32_t health_log_history; // the health log history in seconds to be kept in db ALARM_ENTRY *alarms; RW_SPINLOCK spinlock; } ALARM_LOG; diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index d501214017..4200c15901 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -49,7 +49,6 @@ const char *database_config[] = { "config_hash_id blob, name text, chart text, family text, recipient text, units text, exec text, " "chart_context text, last_transition_id blob, UNIQUE (host_id, alarm_id)) ;", - //TODO indexes "CREATE INDEX IF NOT EXISTS health_log_ind_1 ON health_log (host_id);", "CREATE TABLE IF NOT EXISTS health_log_detail (health_log_id int, unique_id int, alarm_id int, alarm_event_id int, " @@ -62,7 +61,6 @@ const char *database_config[] = { "CREATE INDEX IF NOT EXISTS health_log_d_ind_2 ON health_log_detail (global_id);", "CREATE INDEX IF NOT EXISTS health_log_d_ind_3 ON health_log_detail (transition_id);", "CREATE INDEX IF NOT EXISTS health_log_d_ind_4 ON health_log_detail (health_log_id);", - //TODO more indexes NULL }; diff --git a/database/sqlite/sqlite_health.c b/database/sqlite/sqlite_health.c index c7a794f885..acaa9748e3 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -393,8 +393,8 @@ void sql_health_alarm_log_count(RRDHOST *host) { /* Health related SQL queries Cleans up the health_log_detail table on a non-claimed host */ -#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED(limit) "DELETE FROM health_log_detail where health_log_id in (select health_log_id from health_log where host_id = @host_id) ORDER BY unique_id ASC LIMIT %lu;", limit -void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every) { +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED "DELETE FROM health_log_detail WHERE health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?1) AND when_key + ?2 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?3);" +void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host) { sqlite3_stmt *res = NULL; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; @@ -408,9 +408,7 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every char uuid_str[UUID_STR_LEN]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED((unsigned long int) (host->health.health_log_entries_written - rotate_every))); - - rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + rc = sqlite3_prepare_v2(db_meta, SQL_CLEANUP_HEALTH_LOG_DETAIL_NOT_CLAIMED, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to cleanup health log detail table (un-claimed)"); return; @@ -423,15 +421,29 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every return; } + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64)host->health_log.health_log_history); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_blob(res, 3, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind host_id for SQL_CLEANUP_HEALTH_LOG_NOT_CLAIMED."); + sqlite3_finalize(res); + return; + } + rc = sqlite3_step_monitored(res); if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to cleanup health log table, rc = %d", rc); + error_report("Failed to cleanup health log detail table, rc = %d", rc); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to finalize the prepared statement to cleanup health log detail table (un-claimed)"); - host->health.health_log_entries_written = rotate_every; + sql_health_alarm_log_count(host); snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); if (unlikely(table_exists_in_database(command))) { @@ -442,8 +454,8 @@ void sql_health_alarm_log_cleanup_not_claimed(RRDHOST *host, size_t rotate_every /* Health related SQL queries Cleans up the health_log_detail table on a claimed host */ -#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid, limit) "DELETE from health_log_detail WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE hl.host_id = ?1 AND hl.health_log_id = hld.health_log_id) and health_log_id in (SELECT health_log_id FROM health_log WHERE host_id = ?2) ORDER BY unique_id asc LIMIT %lu;", guid, limit -void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { +#define SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(guid) "DELETE from health_log_detail WHERE unique_id NOT IN (SELECT filtered_alert_unique_id FROM aclk_alert_%s) AND unique_id IN (SELECT hld.unique_id FROM health_log hl, health_log_detail hld WHERE hl.host_id = ?1 AND hl.health_log_id = hld.health_log_id) AND health_log_id IN (SELECT health_log_id FROM health_log WHERE host_id = ?2) AND when_key + ?3 < unixepoch() AND updated_by_id <> 0 AND transition_id NOT IN (SELECT last_transition_id FROM health_log hl WHERE hl.host_id = ?4);", guid +void sql_health_alarm_log_cleanup_claimed(RRDHOST *host) { sqlite3_stmt *res = NULL; int rc; char command[MAX_HEALTH_SQL_SIZE + 1]; @@ -459,11 +471,11 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { snprintfz(command, MAX_HEALTH_SQL_SIZE, "aclk_alert_%s", uuid_str); if (!table_exists_in_database(command)) { - sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_not_claimed(host); return; } - snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str, (unsigned long int) (host->health.health_log_entries_written - rotate_every))); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_CLEANUP_HEALTH_LOG_DETAIL_CLAIMED(uuid_str)); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); if (unlikely(rc != SQLITE_OK)) { @@ -485,9 +497,23 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { return; } + rc = sqlite3_bind_int64(res, 3, (sqlite3_int64)host->health_log.health_log_history); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind health log history for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } + + rc = sqlite3_bind_blob(res, 4, &host->host_uuid, sizeof(host->host_uuid), SQLITE_STATIC); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind second host_id for SQL_CLEANUP_HEALTH_LOG_CLAIMED."); + sqlite3_finalize(res); + return; + } + rc = sqlite3_step_monitored(res); if (unlikely(rc != SQLITE_DONE)) - error_report("Failed to cleanup health log table, rc = %d", rc); + error_report("Failed to cleanup health log detail table, rc = %d", rc); rc = sqlite3_finalize(res); if (unlikely(rc != SQLITE_OK)) @@ -496,27 +522,17 @@ void sql_health_alarm_log_cleanup_claimed(RRDHOST *host, size_t rotate_every) { sql_health_alarm_log_count(host); sql_aclk_alert_clean_dead_entries(host); + } /* Health related SQL queries Cleans up the health_log table. */ void sql_health_alarm_log_cleanup(RRDHOST *host) { - static size_t rotate_every = 0; - - if(unlikely(rotate_every == 0)) { - rotate_every = (size_t)config_get_number(CONFIG_SECTION_HEALTH, "rotate log every lines", 2000); - if(rotate_every < 100) rotate_every = 100; - } - - if(likely(host->health.health_log_entries_written < rotate_every)) { - return; - } - if (!claimed()) { - sql_health_alarm_log_cleanup_not_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_not_claimed(host); } else - sql_health_alarm_log_cleanup_claimed(host, rotate_every); + sql_health_alarm_log_cleanup_claimed(host); } #define SQL_INJECT_REMOVED "insert into health_log_detail (health_log_id, unique_id, alarm_id, alarm_event_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, delay_up_to_timestamp, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, transition_id, global_id) select health_log_id, ?1, ?2, ?3, 0, ?4, unixepoch(), 0, 0, flags, exec_run_timestamp, unixepoch(), info, exec_code, -2, new_status, delay, NULL, new_value, 0, ?5, now_usec(0) from health_log_detail where unique_id = ?6 and transition_id = ?7;" diff --git a/health/health.c b/health/health.c index 45a9761d84..4ae7e99974 100644 --- a/health/health.c +++ b/health/health.c @@ -22,7 +22,6 @@ char *silencers_filename; SIMPLE_PATTERN *conf_enabled_alarms = NULL; DICTIONARY *health_rrdvars; - void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) { buffer_json_member_add_array(wb, key); @@ -803,15 +802,28 @@ static void initialize_health(RRDHOST *host) long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max); if(n < 10) { - netdata_log_error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", - rrdhost_hostname(host), - n, - host->health_log.max); + log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); } else host->health_log.max = (unsigned int)n; + uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY); + if (m < HEALTH_LOG_MINIMUM_HISTORY) { + log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY); + config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY); + m = HEALTH_LOG_MINIMUM_HISTORY; + } + + //default health log history is 5 days and not less than a day + if (host->health_log.health_log_history) { + if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY) + host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY; + } else + host->health_log.health_log_history = m; + + log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400); + conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT, true); diff --git a/health/health.h b/health/health.h index 74af753858..543bc56a14 100644 --- a/health/health.h +++ b/health/health.h @@ -31,6 +31,14 @@ void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_ #define HEALTH_LISTEN_BACKLOG 4096 #endif +#ifndef HEALTH_LOG_DEFAULT_HISTORY +#define HEALTH_LOG_DEFAULT_HISTORY 432000 +#endif + +#ifndef HEALTH_LOG_MINIMUM_HISTORY +#define HEALTH_LOG_MINIMUM_HISTORY 86400 +#endif + #define HEALTH_SILENCERS_MAX_FILE_LEN 10000 extern char *silencers_filename; diff --git a/streaming/README.md b/streaming/README.md index bf11f32e47..370186acc2 100644 --- a/streaming/README.md +++ b/streaming/README.md @@ -55,6 +55,7 @@ node**. This file is automatically generated by Netdata the first time it is sta | [`default memory mode`](#default-memory-mode) | `ram` | The [database](https://github.com/netdata/netdata/blob/master/database/README.md) to use for all nodes using this `API_KEY`. Valid settings are `dbengine`, `map`, `save`, `ram`, or `none`. [Read more →](#default-memory-mode) | | `health enabled by default` | `auto` | Whether alarms and notifications should be enabled for nodes using this `API_KEY`. `auto` enables alarms when the child is connected. `yes` enables alarms always, and `no` disables alarms. | | `default postpone alarms on connect seconds` | `60` | Postpone alarms and notifications for a period of time after the child connects. | +| `default health log history` | `432000` | History of health log events (in seconds) kept in the database. | | `default proxy enabled` | ` ` | Route metrics through a proxy. | | `default proxy destination` | ` ` | Space-separated list of `IP:PORT` for proxies. | | `default proxy api key` | ` ` | The `API_KEY` of the proxy. | diff --git a/streaming/receiver.c b/streaming/receiver.c index 2a81b824c9..c827ca7a1d 100644 --- a/streaming/receiver.c +++ b/streaming/receiver.c @@ -411,6 +411,8 @@ static bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) { } } + host->health_log.health_log_history = rpt->config.alarms_history; + // this is a test // if(rpt->hops <= host->sender->hops) // rrdpush_sender_thread_stop(host, "HOPS MISMATCH", false); @@ -552,6 +554,7 @@ static void rrdpush_receive(struct receiver_state *rpt) rpt->config.health_enabled = (int)default_health_enabled; rpt->config.alarms_delay = 60; + rpt->config.alarms_history = HEALTH_LOG_DEFAULT_HISTORY; rpt->config.rrdpush_enabled = (int)default_rrdpush_enabled; rpt->config.rrdpush_destination = default_rrdpush_destination; @@ -588,6 +591,9 @@ static void rrdpush_receive(struct receiver_state *rpt) rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", rpt->config.alarms_delay); rpt->config.alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", rpt->config.alarms_delay); + rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->key, "default health log history", rpt->config.alarms_history); + rpt->config.alarms_history = appconfig_get_number(&stream_config, rpt->machine_guid, "health log history", rpt->config.alarms_history); + rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rpt->config.rrdpush_enabled); rpt->config.rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rpt->config.rrdpush_enabled); diff --git a/streaming/rrdpush.h b/streaming/rrdpush.h index 6781f1cfe3..73bd438c9d 100644 --- a/streaming/rrdpush.h +++ b/streaming/rrdpush.h @@ -400,6 +400,7 @@ struct receiver_state { int update_every; int health_enabled; // CONFIG_BOOLEAN_YES, CONFIG_BOOLEAN_NO, CONFIG_BOOLEAN_AUTO time_t alarms_delay; + uint32_t alarms_history; int rrdpush_enabled; char *rrdpush_api_key; // DONT FREE - it is allocated in appconfig char *rrdpush_send_charts_matching; // DONT FREE - it is allocated in appconfig diff --git a/streaming/stream.conf b/streaming/stream.conf index aa38e6a626..94e94cab7e 100644 --- a/streaming/stream.conf +++ b/streaming/stream.conf @@ -155,6 +155,9 @@ # postpone alarms for a short period after the sender is connected default postpone alarms on connect seconds = 60 + # seconds of health log events to keep + #default health log history = 432000 + # need to route metrics differently? set these. # the defaults are the ones at the [stream] section (above) #default proxy enabled = yes | no @@ -223,6 +226,9 @@ # postpone alarms when the sender connects postpone alarms on connect seconds = 60 + # seconds of health log events to keep + #health log history = 432000 + # need to route metrics differently? # the defaults are the ones at the [API KEY] section #proxy enabled = yes | no diff --git a/web/api/health/README.md b/web/api/health/README.md index dd46854a1b..f820263b1e 100644 --- a/web/api/health/README.md +++ b/web/api/health/README.md @@ -28,12 +28,12 @@ This API call will return the alarms currently in WARNING or CRITICAL state. ### Event Log -The size of the alarm log is configured in `netdata.conf`. There are 2 settings: the rotation of the alarm log file and the in memory size of the alarm log. +The size of the alarm log is configured in `netdata.conf`. There are 2 settings: the event history kept in the DB (in seconds), and the in memory size of the alarm log. ``` [health] in memory max health log entries = 1000 - rotate log every lines = 2000 + health log history = 432000 ``` The API call retrieves all entries of the alarm log: |