summaryrefslogtreecommitdiffstats
path: root/streaming
diff options
context:
space:
mode:
author Andrew Moss <1043609+amoss@users.noreply.github.com> 2020-06-03 08:38:25 +0200
committer GitHub <noreply@github.com> 2020-06-03 08:38:25 +0200
commit 49719a961d6c079004b65458ea8c5e08ada1c44c (patch)
tree 258b25ac60c403696a72b1589d5fa8634dfc6764 /streaming
parent 1aa2cd7c43f6dd68b4bb43a87eb8b2995687ca9c (diff)
Fix bugs in streaming and enable support for gap filling (#9214)
This PR adds (inactive) support that we will use to fill the gaps on charts when a receiving agent goes offline and the sender reconnects. The streaming component has been reworked to make the connection bi-directional and fix several outstanding bugs in the area.
* Fixed an incorrect case of version negotiation. Removed fatal() on exhaustion of fds.
* Fixed cases that fell through to polling the socket after closing.
* Fixed locking of data related to sender and receiver in the host structure.
* Added fine-grained locks to reduce contention.
* Added circular buffer to sender to prevent starvation in high-latency conditions.
* Fixed case where agent is a proxy and negotiated different streaming versions with sender and receiver.
* Changed interface to new parser to put the buffering code in streaming.
* Fixed the bug that stopped senders from reconnecting after their socket times out - this was part of the scaling fixes that provide an early shortcut path for rejecting connections without lock contention.
* Uses fine-grained locking and a different approach to thread shutdown instead.
* Added liveness detection to connections to allow selection of the best connection.
Diffstat (limited to 'streaming')
-rw-r--r-- streaming/receiver.c 429
-rw-r--r-- streaming/rrdpush.c 1099
-rw-r--r-- streaming/rrdpush.h 87
-rw-r--r-- streaming/sender.c 709
4 files changed, 1298 insertions, 1026 deletions
diff --git a/streaming/receiver.c b/streaming/receiver.c
new file mode 100644
index 0000000000..39a606e291
--- /dev/null
+++ b/streaming/receiver.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "rrdpush.h"
+
+extern struct config stream_config;
+
+// Cleanup handler for a receiver thread.
+// ptr is the struct receiver_state that was handed to rrdpush_receiver_thread(). The handler detaches it
+// from its host (only if it is still the host's registered receiver), logs the end of the thread and then
+// frees every string owned by the state, its SSL connection (HTTPS builds) and the state itself.
+static void rrdpush_receiver_thread_cleanup(void *ptr) {
+ // per-thread guard: the body below runs at most once in each thread
+ static __thread int executed = 0;
+ if(!executed) {
+ executed = 1;
+ struct receiver_state *rpt = (struct receiver_state *) ptr;
+
+ // rrdpush_receive() can return before a host has been attached (the sender used our own machine
+ // GUID, or the host could not be found/created), so rpt->host may still be NULL at this point.
+ if (rpt->host) {
+ // Make sure that we detach this thread and don't kill a freshly arriving receiver
+ netdata_mutex_lock(&rpt->host->receiver_lock);
+ if (rpt->host->receiver == rpt)
+ rpt->host->receiver = NULL;
+ netdata_mutex_unlock(&rpt->host->receiver_lock);
+ }
+
+ info("STREAM %s [receive from [%s]:%s]: receive thread ended (task id %d)", rpt->hostname, rpt->client_ip, rpt->client_port, gettid());
+
+ freez(rpt->key);
+ freez(rpt->hostname);
+ freez(rpt->registry_hostname);
+ freez(rpt->machine_guid);
+ freez(rpt->os);
+ freez(rpt->timezone);
+ freez(rpt->tags);
+ freez(rpt->client_ip);
+ freez(rpt->client_port);
+ freez(rpt->program_name);
+ freez(rpt->program_version);
+#ifdef ENABLE_HTTPS
+ if(rpt->ssl.conn){
+ SSL_free(rpt->ssl.conn);
+ }
+#endif
+ freez(rpt);
+
+ }
+}
+
+#include "../collectors/plugins.d/pluginsd_parser.h"
+
+// Parser callback for the TIMESTAMP keyword.
+// words[1] is the remote time as text; user is cast to PARSER_USER_OBJECT to reach the host and the
+// plugind descriptor of the connection; plugins_action is unused.
+// Logs the remote/local clock difference and the local gap (now minus rrdhost_last_entry_t(host), or 0 when
+// the host has no previous entry) and answers with a "REPLICATE <start> <end>" line on the connection.
+// Returns PARSER_RC_OK when the keyword was handled (or ignored because the negotiated version is older
+// than VERSION_GAP_FILLING) and PARSER_RC_ERROR when the timestamp argument is missing or empty.
+PARSER_RC streaming_timestamp(char **words, void *user, PLUGINSD_ACTION *plugins_action)
+{
+ UNUSED(plugins_action);
+ char *remote_time_txt = words[1];
+ time_t remote_time = 0;
+ RRDHOST *host = ((PARSER_USER_OBJECT *)user)->host;
+ struct plugind *cd = ((PARSER_USER_OBJECT *)user)->cd;
+ if (cd->version < VERSION_GAP_FILLING ) {
+ error("STREAM %s from %s: Slave negotiated version %u but sent TIMESTAMP!", host->hostname, cd->cmd,
+ cd->version);
+ return PARSER_RC_OK; // Ignore error and continue stream
+ }
+ if (remote_time_txt && *remote_time_txt) {
+ remote_time = str2ull(remote_time_txt);
+ time_t now = now_realtime_sec(), prev = rrdhost_last_entry_t(host);
+ time_t gap = 0;
+ if (prev == 0)
+ info("STREAM %s from %s: Initial connection (no gap to check), remote=%ld local=%ld slew=%ld",
+ host->hostname, cd->cmd, remote_time, now, now-remote_time);
+ else {
+ gap = now - prev;
+ info("STREAM %s from %s: Checking for gaps... remote=%ld local=%ld..%ld slew=%ld %ld-sec gap",
+ host->hostname, cd->cmd, remote_time, prev, now, remote_time - now, gap);
+ }
+ char message[128];
+ // bounded formatting: the write can never run past message[]
+ snprintf(message, sizeof(message), "REPLICATE %ld %ld\n", remote_time - gap, remote_time);
+ // measure the reply once and reuse the length for the write and for the check below
+ size_t message_len = strlen(message);
+ int ret;
+#ifdef ENABLE_HTTPS
+ SSL *conn = host->stream_ssl.conn ;
+ if(conn && !host->stream_ssl.flags) {
+ ret = SSL_write(conn, message, (int)message_len);
+ } else {
+ ret = send(host->receiver->fd, message, message_len, MSG_DONTWAIT);
+ }
+#else
+ ret = send(host->receiver->fd, message, message_len, MSG_DONTWAIT);
+#endif
+ // a short or failed write is only reported; the stream carries on
+ if (ret != (int)message_len)
+ error("Failed to send initial timestamp - gaps may appear in charts");
+ return PARSER_RC_OK;
+ }
+ return PARSER_RC_ERROR;
+}
+
+/* Perform one read from the (blocking) receiver connection into r->read_buffer so that the caller can
+ * reassemble lines for parsing.
+ * With an active SSL session the data is appended after the r->read_len bytes already buffered and
+ * r->read_len grows by the number of bytes read. Otherwise a single fgets() on fp replaces the buffer
+ * contents and r->read_len becomes the length of what was read.
+ * Returns 0 when data was read and 1 when nothing could be read.
+ */
+static int receiver_read(struct receiver_state *r, FILE *fp) {
+#ifdef ENABLE_HTTPS
+ if (r->ssl.conn && !r->ssl.flags) {
+ ERR_clear_error();
+ // one byte of the buffer is always kept in reserve
+ int room = sizeof(r->read_buffer) - r->read_len - 1;
+ int got = SSL_read(r->ssl.conn, r->read_buffer + r->read_len, room);
+ if (got <= 0) {
+ // The socket is blocking, so SSL_ERROR_WANT_READ / SSL_ERROR_WANT_WRITE get no special handling:
+ // drain and log whatever is in the OpenSSL error queue, then report the failure.
+ char text[256];
+ u_long code;
+ for (code = ERR_get_error(); code != 0; code = ERR_get_error()) {
+ ERR_error_string_n(code, text, sizeof(text));
+ error("STREAM %s [receive from %s] ssl error: %s", r->hostname, r->client_ip, text);
+ }
+ return 1;
+ }
+ r->read_len += got;
+ return 0;
+ }
+#endif
+ if (fgets(r->read_buffer, sizeof(r->read_buffer), fp) == NULL)
+ return 1;
+ r->read_len = strlen(r->read_buffer);
+ return 0;
+}
+
+/* Produce the next complete line held in r->read_buffer, if one exists.
+ * *pos is the scan position inside the buffer; it is advanced past every line returned, so the caller keeps
+ * calling with the same variable. The '\n' that ends the line is overwritten with a NUL terminator.
+ * Returns NULL when no complete line is left. If the buffer was consumed entirely r->read_len is reset to
+ * 0; otherwise the trailing partial line is moved to the beginning of the buffer for the next fill and
+ * r->read_len is reduced to the length of that partial line.
+ */
+static char *receiver_next_line(struct receiver_state *r, int *pos) {
+ int start = *pos, scan = *pos;
+ if (scan >= r->read_len) {
+ r->read_len = 0;
+ return NULL;
+ }
+ while (scan < r->read_len && r->read_buffer[scan] != '\n')
+ scan++;
+ if (scan < r->read_len && r->read_buffer[scan] == '\n') {
+ *pos = scan+1;
+ r->read_buffer[scan] = 0;
+ return &r->read_buffer[start];
+ }
+ // Source and destination live in the same buffer and overlap whenever the partial line is longer than
+ // the part already consumed, so this has to be memmove(); memcpy() on overlapping regions is undefined.
+ memmove(r->read_buffer, &r->read_buffer[start], r->read_len - start);
+ r->read_len -= start;
+ return NULL;
+}
+
+
+// Run the streaming parser on one receiver connection.
+// rpt is the receiver state (its host is handed to the parser), cd the plugind descriptor of the
+// connection and fp the stream used by receiver_read() for non-SSL connections.
+// Data is read with receiver_read(), split into lines with receiver_next_line() and every line is passed to
+// parser_action(). The loop ends when reading fails, parser_action() returns non-zero, rpt->shutdown is set
+// or netdata is exiting. rpt->last_msg_t is refreshed after each buffer that was processed completely.
+// Returns user->count as it stands when the loop ends, or 0 when the parser could not be initialized (in
+// which case cd->serial_failures is incremented).
+size_t streaming_parser(struct receiver_state *rpt, struct plugind *cd, FILE *fp) {
+ size_t result;
+ PARSER_USER_OBJECT *user = callocz(1, sizeof(*user));
+ user->enabled = cd->enabled;
+ user->host = rpt->host;
+ user->opaque = rpt;
+ user->cd = cd;
+ user->trust_durations = 0;
+
+ PARSER *parser = parser_init(rpt->host, user, fp, PARSER_INPUT_SPLIT);
+
+ if (unlikely(!parser)) {
+ error("Failed to initialize parser");
+ cd->serial_failures++;
+ freez(user);
+ return 0;
+ }
+
+ // register the extra keyword only once the parser is known to exist
+ parser_add_keyword(parser, "TIMESTAMP", streaming_timestamp);
+
+ parser->plugins_action->begin_action = &pluginsd_begin_action;
+ parser->plugins_action->flush_action = &pluginsd_flush_action;
+ parser->plugins_action->end_action = &pluginsd_end_action;
+ parser->plugins_action->disable_action = &pluginsd_disable_action;
+ parser->plugins_action->variable_action = &pluginsd_variable_action;
+ parser->plugins_action->dimension_action = &pluginsd_dimension_action;
+ parser->plugins_action->label_action = &pluginsd_label_action;
+ parser->plugins_action->overwrite_action = &pluginsd_overwrite_action;
+ parser->plugins_action->chart_action = &pluginsd_chart_action;
+ parser->plugins_action->set_action = &pluginsd_set_action;
+
+ user->parser = parser;
+
+ do {
+ if (receiver_read(rpt, fp))
+ break;
+ // pos restarts at 0 for every fill; a partial line left over was moved to the buffer start
+ int pos = 0;
+ char *line;
+ while ((line = receiver_next_line(rpt, &pos))) {
+ if (unlikely(netdata_exit || rpt->shutdown || parser_action(parser, line)))
+ goto done;
+ }
+ rpt->last_msg_t = now_realtime_sec();
+ }
+ while(!netdata_exit);
+done:
+ // read the counter before the user object is released
+ result= user->count;
+ freez(user);
+ parser_destroy(parser);
+ return result;
+}
+
+
+// Serve one incoming streaming connection described by rpt, from configuration lookup to disconnect.
+// Steps, in order: read the per-connection settings from stream_config, refuse senders that use this
+// agent's own machine GUID, find or create the host and register rpt as its receiver (only when rpt->host
+// is not set yet), send the start-streaming prompt, switch the socket to blocking mode and wrap it in a
+// FILE, run streaming_parser() until the connection ends, then mark the host as orphaned.
+// Returns 1 when the connection is refused before any reply is sent, 0 when the initial reply or fdopen()
+// fails, otherwise the number of updates reported by streaming_parser() cast to int.
+static int rrdpush_receive(struct receiver_state *rpt)
+{
+ int history = default_rrd_history_entries;
+ RRD_MEMORY_MODE mode = default_rrd_memory_mode;
+ int health_enabled = default_health_enabled;
+ int rrdpush_enabled = default_rrdpush_enabled;
+ char *rrdpush_destination = default_rrdpush_destination;
+ char *rrdpush_api_key = default_rrdpush_api_key;
+ char *rrdpush_send_charts_matching = default_rrdpush_send_charts_matching;
+ time_t alarms_delay = 60;
+
+ // Most settings below are looked up twice: first under rpt->key ("default ..." names), then under
+ // rpt->machine_guid, with the result of the first lookup passed as the fallback value of the second.
+ rpt->update_every = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "update every", rpt->update_every);
+ if(rpt->update_every < 0) rpt->update_every = 1;
+
+ history = (int)appconfig_get_number(&stream_config, rpt->key, "default history", history);
+ history = (int)appconfig_get_number(&stream_config, rpt->machine_guid, "history", history);
+ if(history < 5) history = 5;
+
+ mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->key, "default memory mode", rrd_memory_mode_name(mode)));
+ mode = rrd_memory_mode_id(appconfig_get(&stream_config, rpt->machine_guid, "memory mode", rrd_memory_mode_name(mode)));
+
+ health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->key, "health enabled by default", health_enabled);
+ health_enabled = appconfig_get_boolean_ondemand(&stream_config, rpt->machine_guid, "health enabled", health_enabled);
+
+ alarms_delay = appconfig_get_number(&stream_config, rpt->key, "default postpone alarms on connect seconds", alarms_delay);
+ alarms_delay = appconfig_get_number(&stream_config, rpt->machine_guid, "postpone alarms on connect seconds", alarms_delay);
+
+ rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->key, "default proxy enabled", rrdpush_enabled);
+ rrdpush_enabled = appconfig_get_boolean(&stream_config, rpt->machine_guid, "proxy enabled", rrdpush_enabled);
+
+ rrdpush_destination = appconfig_get(&stream_config, rpt->key, "default proxy destination", rrdpush_destination);
+ rrdpush_destination = appconfig_get(&stream_config, rpt->machine_guid, "proxy destination", rrdpush_destination);
+
+ rrdpush_api_key = appconfig_get(&stream_config, rpt->key, "default proxy api key", rrdpush_api_key);
+ rrdpush_api_key = appconfig_get(&stream_config, rpt->machine_guid, "proxy api key", rrdpush_api_key);
+
+ rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->key, "default proxy send charts matching", rrdpush_send_charts_matching);
+ rrdpush_send_charts_matching = appconfig_get(&stream_config, rpt->machine_guid, "proxy send charts matching", rrdpush_send_charts_matching);
+
+ // NOTE(review): rpt->tags is replaced by the pointer returned from appconfig_set_default() and is later
+ // passed to freez() by the cleanup handler - confirm who owns that string.
+ rpt->tags = (char*)appconfig_set_default(&stream_config, rpt->machine_guid, "host tags", (rpt->tags)?rpt->tags:"");
+ if(rpt->tags && !*rpt->tags) rpt->tags = NULL;
+
+ // refuse a sender that presents the machine GUID of localhost
+ if (strcmp(rpt->machine_guid, localhost->machine_guid) == 0) {
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "DENIED - ATTEMPT TO RECEIVE METRICS FROM MACHINE_GUID IDENTICAL TO MASTER");
+ error("STREAM %s [receive from %s:%s]: denied to receive metrics, machine GUID [%s] is my own. Did you copy the master/proxy machine guid to a slave?", rpt->hostname, rpt->client_ip, rpt->client_port, rpt->machine_guid);
+ close(rpt->fd);
+ return 1;
+ }
+
+ // no host attached yet: find or create it, then register this receiver on it
+ if (rpt->host==NULL) {
+
+ rpt->host = rrdhost_find_or_create(
+ rpt->hostname
+ , rpt->registry_hostname
+ , rpt->machine_guid
+ , rpt->os
+ , rpt->timezone
+ , rpt->tags
+ , program_name
+ , program_version
+ , rpt->update_every
+ , history
+ , mode
+ , (unsigned int)(health_enabled != CONFIG_BOOLEAN_NO)
+ , (unsigned int)(rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key)
+ , rrdpush_destination
+ , rrdpush_api_key
+ , rrdpush_send_charts_matching
+ , rpt->system_info
+ );
+
+ if(!rpt->host) {
+ close(rpt->fd);
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "FAILED - CANNOT ACQUIRE HOST");
+ error("STREAM %s [receive from [%s]:%s]: failed to find/create host structure.", rpt->hostname, rpt->client_ip, rpt->client_port);
+ return 1;
+ }
+
+ // the receiver slot of the host is claimed under receiver_lock; if it is already taken this
+ // connection gives up
+ netdata_mutex_lock(&rpt->host->receiver_lock);
+ if (rpt->host->receiver == NULL)
+ rpt->host->receiver = rpt;
+ else {
+ error("Multiple receivers connected for %s concurrently, cancelling this one...", rpt->machine_guid);
+ netdata_mutex_unlock(&rpt->host->receiver_lock);
+ close(rpt->fd);
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->machine_guid, rpt->hostname, "FAILED - BEATEN TO HOST CREATION");
+ return 1;
+ }
+ netdata_mutex_unlock(&rpt->host->receiver_lock);
+ }
+
+ // ssl is only consumed by the NETDATA_INTERNAL_CHECKS log line below
+ int ssl = 0;
+#ifdef ENABLE_HTTPS
+ if (rpt->ssl.conn != NULL)
+ ssl = 1;
+#endif
+
+#ifdef NETDATA_INTERNAL_CHECKS
+ info("STREAM %s [receive from [%s]:%s]: client willing to stream metrics for host '%s' with machine_guid '%s': update every = %d, history = %ld, memory mode = %s, health %s,%s tags '%s'"
+ , rpt->hostname
+ , rpt->client_ip
+ , rpt->client_port
+ , rpt->host->hostname
+ , rpt->host->machine_guid
+ , rpt->host->rrd_update_every
+ , rpt->host->rrd_history_entries
+ , rrd_memory_mode_name(rpt->host->rrd_memory_mode)
+ , (health_enabled == CONFIG_BOOLEAN_NO)?"disabled":((health_enabled == CONFIG_BOOLEAN_YES)?"enabled":"auto")
+ , ssl ? " SSL," : ""
+ , rpt->host->tags?rpt->host->tags:""
+ );
+#endif // NETDATA_INTERNAL_CHECKS
+
+
+ // plugind descriptor handed to the parser for this connection; version is filled in further down
+ struct plugind cd = {
+ .enabled = 1,
+ .update_every = default_rrd_update_every,
+ .pid = 0,
+ .serial_failures = 0,
+ .successful_collections = 0,
+ .obsolete = 0,
+ .started_t = now_realtime_sec(),
+ .next = NULL,
+ .version = 0,
+ };
+
+ // put the client IP and port into the buffers used by plugins.d
+ snprintfz(cd.id, CONFIG_MAX_NAME, "%s:%s", rpt->client_ip, rpt->client_port);
+ snprintfz(cd.filename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
+ snprintfz(cd.fullfilename, FILENAME_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
+ snprintfz(cd.cmd, PLUGINSD_CMD_MAX, "%s:%s", rpt->client_ip, rpt->client_port);
+
+ info("STREAM %s [receive from [%s]:%s]: initializing communication...", rpt->host->hostname, rpt->client_ip, rpt->client_port);
+ // the prompt sent back depends on the stream version: >1 appends the version number to the VN prompt,
+ // 1 uses the V2 prompt, anything else the original prompt
+ char initial_response[HTTP_HEADER_SIZE];
+ if (rpt->stream_version > 1) {
+ info("STREAM %s [receive from [%s]:%s]: Netdata is using the stream version %u.", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->stream_version);
+ sprintf(initial_response, "%s%u", START_STREAMING_PROMPT_VN, rpt->stream_version);
+ } else if (rpt->stream_version == 1) {
+ info("STREAM %s [receive from [%s]:%s]: Netdata is using the stream version %u.", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->stream_version);
+ sprintf(initial_response, "%s", START_STREAMING_PROMPT_V2);
+ } else {
+ info("STREAM %s [receive from [%s]:%s]: Netdata is using first stream protocol.", rpt->host->hostname, rpt->client_ip, rpt->client_port);
+ sprintf(initial_response, "%s", START_STREAMING_PROMPT);
+ }
+ debug(D_STREAM, "Initial response to %s: %s", rpt->client_ip, initial_response);
+ // In HTTPS builds the SSL state is also copied to the host (streaming_timestamp() reads it from
+ // there). Both preprocessor branches open the same if-block, which is closed once after #endif.
+ #ifdef ENABLE_HTTPS
+ rpt->host->stream_ssl.conn = rpt->ssl.conn;
+ rpt->host->stream_ssl.flags = rpt->ssl.flags;
+ if(send_timeout(&rpt->ssl, rpt->fd, initial_response, strlen(initial_response), 0, 60) != (ssize_t)strlen(initial_response)) {
+#else
+ if(send_timeout(rpt->fd, initial_response, strlen(initial_response), 0, 60) != strlen(initial_response)) {
+#endif
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "FAILED - CANNOT REPLY");
+ error("STREAM %s [receive from [%s]:%s]: cannot send ready command.", rpt->host->hostname, rpt->client_ip, rpt->client_port);
+ close(rpt->fd);
+ return 0;
+ }
+
+ // remove the non-blocking flag from the socket
+ if(sock_delnonblock(rpt->fd) < 0)
+ error("STREAM %s [receive from [%s]:%s]: cannot remove the non-blocking flag from socket %d", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->fd);
+
+ // convert the socket to a FILE *
+ FILE *fp = fdopen(rpt->fd, "r");
+ if(!fp) {
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "FAILED - SOCKET ERROR");
+ error("STREAM %s [receive from [%s]:%s]: failed to get a FILE for FD %d.", rpt->host->hostname, rpt->client_ip, rpt->client_port, rpt->fd);
+ close(rpt->fd);
+ return 0;
+ }
+
+ // under the host write lock: clear the orphan flag, reset the disconnect time, set the labels flag
+ // from the stream version and optionally postpone health checks
+ rrdhost_wrlock(rpt->host);
+/* if(rpt->host->connected_senders > 0) {
+ rrdhost_unlock(rpt->host);
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "REJECTED - ALREADY CONNECTED");
+ info("STREAM %s [receive from [%s]:%s]: multiple streaming connections for the same host detected. Rejecting new connection.", rpt->host->hostname, rpt->client_ip, rpt->client_port);
+ fclose(fp);
+ return 0;
+ }
+*/
+
+ rrdhost_flag_clear(rpt->host, RRDHOST_FLAG_ORPHAN);
+// rpt->host->connected_senders++;
+ rpt->host->senders_disconnected_time = 0;
+ rpt->host->labels_flag = (rpt->stream_version > 0)?LABEL_FLAG_UPDATE_STREAM:LABEL_FLAG_STOP_STREAM;
+
+ if(health_enabled != CONFIG_BOOLEAN_NO) {
+ if(alarms_delay > 0) {
+ rpt->host->health_delay_up_to = now_realtime_sec() + alarms_delay;
+ info("Postponing health checks for %ld seconds, on host '%s', because it was just connected."
+ , alarms_delay
+ , rpt->host->hostname
+ );
+ }
+ }
+ rrdhost_unlock(rpt->host);
+
+ // call the plugins.d processor to receive the metrics
+ info("STREAM %s [receive from [%s]:%s]: receiving metrics...", rpt->host->hostname, rpt->client_ip, rpt->client_port);
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "CONNECTED");
+
+ cd.version = rpt->stream_version;
+
+
+ // returns only when the connection ends, the receiver is shut down or netdata is exiting
+ size_t count = streaming_parser(rpt, &cd, fp);
+ //size_t count = pluginsd_process(host, &cd, fp, 1);
+
+ log_stream_connection(rpt->client_ip, rpt->client_port, rpt->key, rpt->host->machine_guid, rpt->host->hostname, "DISCONNECTED");
+ error("STREAM %s [receive from [%s]:%s]: disconnected (completed %zu updates).", rpt->host->hostname, rpt->client_ip, rpt->client_port, count);
+
+ // Only while this is still the host's registered receiver: record the disconnect time, flag the
+ // host as orphan, switch health off when it was in "auto" mode and stop the sender thread.
+ netdata_mutex_lock(&rpt->host->receiver_lock);
+ if (rpt->host->receiver == rpt) {
+ rrdhost_wrlock(rpt->host);
+ rpt->host->senders_disconnected_time = now_realtime_sec();
+ rrdhost_flag_set(rpt->host, RRDHOST_FLAG_ORPHAN);
+ if(health_enabled == CONFIG_BOOLEAN_AUTO)
+ rpt->host->health_enabled = 0;
+ rrdhost_unlock(rpt->host);
+ rrdpush_sender_thread_stop(rpt->host);
+ }
+ netdata_mutex_unlock(&rpt->host->receiver_lock);
+
+ // cleanup
+ fclose(fp);
+
+ return (int)count;
+}
+
+// Entry point of a receiver thread.
+// ptr is the struct receiver_state of the connection. rrdpush_receiver_thread_cleanup() is registered as
+// cleanup handler for it and is also executed on normal return (pop is called with 1); that handler frees
+// the state. Always returns NULL.
+void *rrdpush_receiver_thread(void *ptr) {
+ struct receiver_state *state = (struct receiver_state *)ptr;
+
+ netdata_thread_cleanup_push(rrdpush_receiver_thread_cleanup, ptr);
+
+ info("STREAM %s [%s]:%s: receive thread created (task id %d)", state->hostname, state->client_ip, state->client_port, gettid());
+ rrdpush_receive(state);
+
+ // push and pop have to stay paired inside this function
+ netdata_thread_cleanup_pop(1);
+ return NULL;
+}
+
diff --git a/streaming/rrdpush.c b/streaming/rrdpush.c
index fa3cc2d478..5dc3f832dc 100644
--- a/streaming/rrdpush.c
+++ b/streaming/rrdpush.c
@@ -26,25 +26,7 @@
*
*/
-#define STREAMING_PROTOCOL_VERSION "1.1"
-#define START_STREAMING_PROMPT "Hit me baby, push them over..."
-#define START_STREAMING_PROMPT_V2 "Hit me baby, push them over and bring the host labels..."
-#define START_STREAMING_PROMPT_VN "Hit me baby, push them over with the version="
-
-typedef enum {
- RRDPUSH_MULTIPLE_CONNECTIONS_ALLOW,
- RRDPUSH_MULTIPLE_CONNECTIONS_DENY_NEW
-} RRDPUSH_MULTIPLE_CONNECTIONS_STRATEGY;
-
-typedef struct {
- char *os_name;
- char *os_id;
- char *os_version;
- char *kernel_name;
- char *kernel_version;
-} stream_encoded_t;
-
-static struct config stream_config = {
+struct config stream_config = {
.first_section = NULL,
.last_section = NULL,
.mutex = NETDATA_MUTEX_INITIALIZER,
@@ -125,8 +107,6 @@ int rrdpush_init() {
return default_rrdpush_enabled;
}
-#define CONNECTED_TO_SIZE 100
-
// data collection happens from multiple threads
// each of these threads calls rrdset_done()
// which in turn calls rrdset_done_push()
@@ -141,8 +121,6 @@ int rrdpush_init() {
// this is for the first iterations of each chart
unsigned int remote_clock_resync_iterations = 60;
-#define rrdpush_buffer_lock(host) netdata_mutex_lock(&((host)->rrdpush_sender_buffer_mutex))
-#define rrdpush_buffer_unlock(host) netdata_mutex_unlock(&((host)->rrdpush_sender_buffer_mutex))
static inline int should_send_chart_matching(RRDSET *st) {
if(unlikely(!rrdset_flag_check(st, RRDSET_FLAG_ENABLED))) {
@@ -205,7 +183,8 @@ static inline int need_to_send_chart_definition(RRDSET *st) {
return 0;
}
-// sends the current chart definition
+// Send the current chart definition.
+// Assumes that collector thread has already called sender_start for mutex / buffer state.
static inline void rrdpush_send_chart_definition_nolock(RRDSET *st) {
RRDHOST *host = st->rrdhost;
@@ -224,11 +203,9 @@ static inline void rrdpush_send_chart_definition_nolock(RRDSET *st) {
}
}
- // info("CHART '%s' '%s'", st->id, name);
-
// send the chart
buffer_sprintf(
- host->rrdpush_sender_buffer
+ host->sender->build
, "CHART \"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\" \"%s\" %ld %d \"%s %s %s %s\" \"%s\" \"%s\"\n"
, st->id
, name
@@ -251,7 +228,7 @@ static inline void rrdpush_send_chart_definition_nolock(RRDSET *st) {
RRDDIM *rd;
rrddim_foreach_read(rd, st) {
buffer_sprintf(
- host->rrdpush_sender_buffer
+ host->sender->build
, "DIMENSION \"%s\" \"%s\" \"%s\" " COLLECTED_NUMBER_FORMAT " " COLLECTED_NUMBER_FORMAT " \"%s %s %s\"\n"
, rd->id
, rd->name
@@ -272,7 +249,7 @@ static inline void rrdpush_send_chart_definition_nolock(RRDSET *st) {
calculated_number *value = (calculated_number *) rs->value;
buffer_sprintf(
- host->rrdpush_sender_buffer
+ host->sender->build
, "VARIABLE CHART %s = " CALCULATED_NUMBER_FORMAT "\n"
, rs->variable
, *value
@@ -284,25 +261,29 @@ static inline void rrdpush_send_chart_definition_nolock(RRDSET *st) {
}
// sends the current chart dimensions
-static inline void rrdpush_send_chart_metrics_nolock(RRDSET *st) {
+static inline void rrdpush_send_chart_metrics_nolock(RRDSET *st, struct sender_state *s) {
RRDHOST *host = st->rrdhost;
- buffer_sprintf(host->rrdpush_sender_buffer, "BEGIN \"%s\" %llu\n", st->id, (st->last_collected_time.tv_sec > st->upstream_resync_time)?st->usec_since_last_update:0);
+ buffer_sprintf(host->sender->build, "BEGIN \"%s\" %llu", st->id, (st->last_collected_time.tv_sec > st->upstream_resync_time)?st->usec_since_last_update:0);
+ if (s->version >= VERSION_GAP_FILLING)
+ buffer_sprintf(host->sender->build, " %ld\n", st->last_collected_time.tv_sec);
+ else
+ buffer_strcat(host->sender->build, "\n");
RRDDIM *rd;
rrddim_foreach_read(rd, st) {
if(rd->updated && rd->exposed)
- buffer_sprintf(host->rrdpush_sender_buffer
+ buffer_sprintf(host->sender->build
, "SET \"%s\" = " COLLECTED_NUMBER_FORMAT "\n"
, rd->id
, rd->collected_value
);
}
-
- buffer_strcat(host->rrdpush_sender_buffer, "END\n");
+ buffer_strcat(host->sender->build, "END\n");
}
static void rrdpush_sender_thread_spawn(RRDHOST *host);
+// Called from the internal collectors to mark a chart obsolete.
void rrdset_push_chart_definition_now(RRDSET *st) {
RRDHOST *host = st->rrdhost;
@@ -310,9 +291,9 @@ void rrdset_push_chart_definition_now(RRDSET *st) {
return;
rrdset_rdlock(st);
- rrdpush_buffer_lock(host);
+ sender_start(host->sender);
rrdpush_send_chart_definition_nolock(st);
- rrdpush_buffer_unlock(host);
+ sender_commit(host->sender);
rrdset_unlock(st);
}
@@ -322,18 +303,14 @@ void rrdset_done_push(RRDSET *st) {
RRDHOST *host = st->rrdhost;
- rrdpush_buffer_lock(host);
-
if(unlikely(host->rrdpush_send_enabled && !host->rrdpush_sender_spawn))
rrdpush_sender_thread_spawn(host);
- if(unlikely(!host->rrdpush_sender_buffer || !host->rrdpush_sender_connected)) {
+ // Handle non-connected case
+ if(unlikely(!host->rrdpush_sender_connected)) {
if(unlikely(!host->rrdpush_sender_error_shown))
error("STREAM %s [send]: not ready - discarding collected metrics.", host->hostname);
-
host->rrdpush_sender_error_shown = 1;
-
- rrdpush_buffer_unlock(host);
return;
}
else if(unlikely(host->rrdpush_sender_error_shown)) {
@@ -341,16 +318,18 @@ void rrdset_done_push(RRDSET *st) {
host->rrdpush_sender_error_shown = 0;
}
+ sender_start(host->sender);
+
if(need_to_send_chart_definition(st))
rrdpush_send_chart_definition_nolock(st);
- rrdpush_send_chart_metrics_nolock(st);
+ rrdpush_send_chart_metrics_nolock(st, host->sender);
// signal the sender there are more data
if(host->rrdpush_sender_pipe[PIPE_WRITE] != -1 && write(host->rrdpush_sender_pipe[PIPE_WRITE], " ", 1) == -1)
error("STREAM %s [send]: cannot write to internal pipe", host->hostname);
- rrdpush_buffer_unlock(host);
+ sender_commit(host->sender);
}
// labels
@@ -358,13 +337,13 @@ void rrdpush_send_labels(RRDHOST *host) {
if (!host->labels || !(host->labels_flag & LABEL_FLAG_UPDATE_STREAM) || (host->labels_flag & LABEL_FLAG_STOP_STREAM))
return;
- rrdpush_buffer_lock(host);
+ sender_start(host->sender);
rrdhost_rdlock(host);
netdata_rwlock_rdlock(&host->labels_rwlock);
struct label *labels = host->labels;
while(labels) {
- buffer_sprintf(host->rrdpush_sender_buffer
+ buffer_sprintf(host->sender->build
, "LABEL \"%s\" = %d %s\n"
, labels->key
, (int)labels->label_source
@@ -373,103 +352,26 @@ void rrdpush_send_labels(RRDHOST *host) {
labels = labels->next;
}
- buffer_sprintf(host->rrdpush_sender_buffer
+ buffer_sprintf(host->sender->build
, "OVERWRITE %s\n", "labels");
netdata_rwlock_unlock(&host->labels_rwlock);
rrdhost_unlock(host);
+ sender_commit(host->sender);
if(host->rrdpush_sender_pipe[PIPE_WRITE] != -1 && write(host->rrdpush_sender_pipe[PIPE_WRITE], " ", 1) == -1)
error("STREAM %s [send]: cannot write to internal pipe", host->hostname);
- rrdpush_buffer_unlock(host);
host->labels_flag &= ~LABEL_FLAG_UPDATE_STREAM;
}
// ----------------------------------------------------------------------------
// rrdpush sender thread
-static inline void rrdpush_sender_add_host_variable_to_buffer_nolock(RRDHOST *host, RRDVAR *rv) {
- calculated_number *value = (calculated_number *)rv->value;
-
- buffer_sprintf(
- host->rrdpush_sender_buffer
- , "VARIABLE HOST %s = " CALCULATED_NUMBER_FORMAT "\n"
- , rv->name
- , *value
- );
-
- debug(D_STREAM, "RRDVAR pushed HOST VARIABLE %s = " CALCULATED_NUMBER_FORMAT, rv->name, *value);
-}
-
-void rrdpush_sender_send_this_host_variable_now(RRDHOST *host, RRDVAR *rv) {
- if(host->rrdpush_send_enabled && host->rrdpush_sender_spawn && host->rrdpush_sender_connected) {
- rrdpush_buffer_lock(host);
- rrdpush_sender_add_host_variable_to_buffer_nolock(host, rv);
- rrdpush_buffer_unlock(host);
- }
-}
-
-static int rrdpush_sender_thread_custom_host_variables_callback(void *rrdvar_ptr, void *host_ptr) {
- RRDVAR *rv = (RRDVAR *)rrdvar_ptr;
- RRDHOST *host = (RRDHOST *)host_ptr;
-
- if(unlikely(rv->options & RRDVAR_OPTION_CUSTOM_HOST_VAR && rv->type == RRDVAR_TYPE_CALCULATED)) {
- rrdpush_sender_add_host_variable_to_buffer_nolock(host, rv);
-
- // return 1, so that the traversal will return the number of variables sent
- return 1;
- }
-
- // returning a negative number will break the traversal
- return 0;
-}
-
-static void rrdpush_sender_thread_send_custom_host_variables(RRDHOST *host) {
- int ret = rrdvar_callback_for_all_host_variables(host, rrdpush_sender_thread_custom_host_variables_callback, host);
- (void)ret;
-
- debug(D_STREAM, "RRDVAR sent %d VARIABLES", ret);
-}
-
-// resets all the chart, so that their definitions
-// will be resent to the central netdata
-static void rrdpush_sender_thread_reset_all_charts(RRDHOST *host) {
- rrdhost_rdlock(host);
-
- RRDSET *st;
- rrdset_foreach_read(st, host) {
- rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED);
-
- st->upstream_resync_time = 0;
-
- rrdset_rdlock(st);
-
- RRDDIM *rd;
- rrddim_foreach_read(rd, st)
- rd->exposed = 0;
-
- rrdset_unlock(st);
- }
-
- rrdhost_unlock(host);
-}
-
-static inline void rrdpush_sender_thread_data_flush(RRDHOST *host) {
- rrdpush_buffer_lock(host);
-
- if(buffer_strlen(host->rrdpush_sender_buffer))
- error("STREAM %s [send]: discarding %zu bytes of metrics already in the buffer.", host->hostname, buffer_strlen(host->rrdpush_sender_buffer));
-
- buffer_flush(host->rrdpush_sender_buffer);
-
- rrdpush_sender_thread_reset_all_charts(host);
- rrdpush_sender_thread_send_custom_host_variables(host);
-
- rrdpush_buffer_unlock(host);
-}
-
+// Either the receiver lost the connection or the host is being destroyed.
+// Don't lock the sender buffer - doesn't affect consistency in either case.
+// TODO-GAPS During the host destruction sequence we should make sure the disconnect happens early enough to lock
+// out collectors hitting the sender. Locking the mutex means there may be waiting threads when we free.
void rrdpush_sender_thread_stop(RRDHOST *host) {
- rrdpush_buffer_lock(host);
rrdhost_wrlock(host);
netdata_thread_t thr = 0;
@@ -489,7 +391,6 @@ void rrdpush_sender_thread_stop(RRDHOST *host) {
}
rrdhost_unlock(host);
- rrdpush_buffer_unlock(host);
if(thr != 0) {
info("STREAM %s [send]: waiting for the sending thread to stop...", host->hostname);
@@ -499,903 +400,14 @@ void rrdpush_sender_thread_stop(RRDHOST *host) {
}
}
-static inline void rrdpush_sender_thread_close_socket(RRDHOST *host) {
- host->rrdpush_sender_connected = 0;
-
- if(host->rrdpush_sender_socket != -1) {
- close(host->rrdpush_sender_socket);
- host->rrdpush_sender_socket = -1;
- }
-}
-
-static inline void rrdpush_set_flags_to_newest_stream(RRDHOST *host) {
- host->labels_flag |= LABEL_FLAG_UPDATE_STREAM;
- host->labels_flag &= ~LABEL_FLAG_STOP_STREAM;
-}
-
-void rrdpush_encode_variable(stream_encoded_t *se, RRDHOST *host)
-{
- se->os_name = (host->system_info->host_os_name)?url_encode(host->system_info->host_os_name):"";
- se->os_id = (host->system_info->host_os_id)?url_encode(host->system_info->host_os_id):"";
- se->os_version = (host->system_info->host_os_version)?url_encode(host->system_info->host_os_version):"";
- se->kernel_name = (host->system_info->kernel_name)?url_encode(host->system_info->kernel_name):"";
- se->kernel_version = (host->system_info->kernel_version)?url_encode(host->system_info->kernel_version):"";
-}
-
-void rrdpush_clean_encoded(stream_encoded_t *se)
-{
- if (se->os_name)
- freez(se->os_name);
-
- if (se->os_id)
- freez(se->os_id);
-
- if (se->os_version)
- freez(se->os_version);
-
- if (se->kernel_name)
- freez(se->kernel_name);
-
- if (se->kernel_version)
- freez(se->kernel_version);
-}
-
-//called from client side
-static int rrdpush_sender_thread_connect_to_master(RRDHOST *host, int default_port, int timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size) {
- struct timeval tv = {
- .tv_sec = timeout,
- .tv_usec = 0
- };
-
- // make sure the socket is closed
- rrdpush_sender_thread_close_socket(host);
-
- debug(D_STREAM, "STREAM: Attempting to connect...");
- info("STREAM %s [send to %s]: connecting...", host->hostname, host->rrdpush_send_destination);
-
- host->rrdpush_sender_socket = connect_to_one_of(
- host->rrdpush_send_destination
- , default_port
- , &tv
- , reconnects_counter
- , connected_to
- , connected_to_size
- );
-
- if(unlikely(host->rrdpush_sender_socket == -1)) {
- error("STREAM %s [send to %s]: failed to connect", host->hostname, host->rrdpush_send_destination);
- return 0;
- }
-
- info("STREAM %s [send to %s]: initializing communication...", host->hostname, connected_to);
-
-#ifdef ENABLE_HTTPS
- if( netdata_client_ctx ){
- host->ssl.flags = NETDATA_SSL_START;
- if (!host->ssl.conn){
- host->ssl.conn = SSL_new(netdata_client_ctx);
- if(!host->ssl.conn){
- error("Failed to allocate SSL structure.");
- host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE;
- }</