summaryrefslogtreecommitdiffstats
path: root/aclk
diff options
context:
space:
mode:
Diffstat (limited to 'aclk')
-rw-r--r--aclk/aclk_lws_wss_client.c26
-rw-r--r--aclk/aclk_stats.c195
-rw-r--r--aclk/aclk_stats.h51
-rw-r--r--aclk/agent_cloud_link.c47
-rw-r--r--aclk/mqtt.c14
5 files changed, 331 insertions, 2 deletions
diff --git a/aclk/aclk_lws_wss_client.c b/aclk/aclk_lws_wss_client.c
index 9317d835b5..fb2a7e3db4 100644
--- a/aclk/aclk_lws_wss_client.c
+++ b/aclk/aclk_lws_wss_client.c
@@ -5,6 +5,7 @@
#include "libnetdata/libnetdata.h"
#include "../daemon/common.h"
#include "aclk_common.h"
+#include "aclk_stats.h"
extern int aclk_shutting_down;
@@ -436,8 +437,14 @@ static int aclk_lws_wss_callback(struct lws *wsi, enum lws_callback_reasons reas
if ( bytes_left > FRAGMENT_SIZE)
bytes_left = FRAGMENT_SIZE;
int n = lws_write(wsi, data->data + LWS_PRE + data->written, bytes_left, LWS_WRITE_BINARY);
- if (n>=0)
+ if (n>=0) {
data->written += n;
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.write_q_consumed += n;
+ ACLK_STATS_UNLOCK;
+ }
+ }
//error("lws_write(req=%u,written=%u) %zu of %zu",bytes_left, rc, data->written,data->data_size,rc);
if (data->written == data->data_size)
{
@@ -455,6 +462,11 @@ static int aclk_lws_wss_callback(struct lws *wsi, enum lws_callback_reasons reas
if (!received_data_to_ringbuff(engine_instance->read_ringbuffer, in, len))
retval = 1;
aclk_lws_mutex_unlock(&engine_instance->read_buf_mutex);
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.read_q_added += len;
+ ACLK_STATS_UNLOCK;
+ }
// to future myself -> do not call this while read lock is active as it will eventually
// want to acquire same lock later in aclk_lws_wss_client_read() function
@@ -524,6 +536,12 @@ int aclk_lws_wss_client_write(void *buf, size_t count)
lws_wss_packet_buffer_append(&engine_instance->write_buffer_head, lws_wss_packet_buffer_new(buf, count));
aclk_lws_mutex_unlock(&engine_instance->write_buf_mutex);
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.write_q_added += count;
+ ACLK_STATS_UNLOCK;
+ }
+
lws_callback_on_writable(engine_instance->lws_wsi);
return count;
}
@@ -549,6 +567,12 @@ int aclk_lws_wss_client_read(void *buf, size_t count)
if (data_to_be_read == readable_byte_count)
engine_instance->data_to_read = 0;
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.read_q_consumed += data_to_be_read;
+ ACLK_STATS_UNLOCK;
+ }
+
abort:
aclk_lws_mutex_unlock(&engine_instance->read_buf_mutex);
return data_to_be_read;
diff --git a/aclk/aclk_stats.c b/aclk/aclk_stats.c
new file mode 100644
index 0000000000..2d9f5e43b5
--- /dev/null
+++ b/aclk/aclk_stats.c
@@ -0,0 +1,195 @@
+#include "aclk_stats.h"
+
+// Mutex behind ACLK_STATS_LOCK / ACLK_STATS_UNLOCK; guards the two metric
+// structures defined below.
+netdata_mutex_t aclk_stats_mutex = NETDATA_MUTEX_INITIALIZER;
+
+// Non-zero while ACLK statistics collection is switched on.
+int aclk_stats_enabled = 0;
+
+// State that is carried over from one sample to the next.
+struct aclk_metrics aclk_metrics = { .online = 0 };
+
+// Counters accumulated during the current sample; the statistics thread
+// snapshots them and zeroes them again on every iteration.
+struct aclk_metrics_per_sample aclk_metrics_per_sample;
+
+/**
+ * Publish the ACLK connection state on the chart created with type "netdata"
+ * and id "aclk_status".
+ *
+ * The chart and its "online" dimension are created on the first call and kept
+ * in function-local statics; every later call only advances the chart.
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ * @param permanent  snapshot of the state that survives across samples
+ */
+static void aclk_stats_collect(struct aclk_metrics_per_sample *per_sample, struct aclk_metrics *permanent)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_online = NULL;
+
+    if (unlikely(!chart)) {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_status", NULL, "aclk_stats", NULL, "ACLK/Cloud connection status",
+            "connected", "netdata", "stats", 200000, localhost->rrd_update_every, RRDSET_TYPE_LINE);
+
+        dim_online = rrddim_add(chart, "online", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(chart);
+    }
+
+    // A disconnect seen at any point during the sample wins over the current
+    // state, so a quick drop-and-reconnect is still reported as offline.
+    if (per_sample->offline_during_sample)
+        rrddim_set_by_pointer(chart, dim_online, 0);
+    else
+        rrddim_set_by_pointer(chart, dim_online, permanent->online);
+
+    rrdset_done(chart);
+}
+
+/**
+ * Publish how many queries were queued and dispatched during the last sample
+ * on the chart created with type "netdata" and id "aclk_query_per_second".
+ *
+ * Both dimensions use the update interval as divisor, turning the per-sample
+ * counts into per-second rates. "dispatched" is stored as a negative value.
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ */
+static void aclk_stats_query_thread(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *st_query_thread = NULL;
+    static RRDDIM *rd_queued = NULL;
+    static RRDDIM *rd_dispatched = NULL;
+
+    if (unlikely(!st_query_thread)) {
+        st_query_thread = rrdset_create_localhost(
+            "netdata", "aclk_query_per_second", NULL, "aclk_stats", NULL, "ACLK Queries per second", "queries/s",
+            "netdata", "stats", 200001, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        rd_queued = rrddim_add(st_query_thread, "added", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        rd_dispatched = rrddim_add(st_query_thread, "dispatched", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(st_query_thread);
+    }
+
+    rrddim_set_by_pointer(st_query_thread, rd_queued, per_sample->queries_queued);
+
+    // Convert to a signed type BEFORE negating. Negating the raw field is only
+    // correct while its type is narrow enough to be promoted to signed int; a
+    // 32-bit unsigned counter would wrap to a huge positive value instead.
+    rrddim_set_by_pointer(st_query_thread, rd_dispatched, -(long long)per_sample->queries_dispatched);
+
+    rrdset_done(st_query_thread);
+}
+
+#ifdef NETDATA_INTERNAL_CHECKS
+/**
+ * Publish the average and maximum message publish latency of the last sample
+ * on the chart created with type "netdata" and id "aclk_latency_mqtt".
+ *
+ * Only compiled when NETDATA_INTERNAL_CHECKS is defined.
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ */
+static void aclk_stats_latency(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_avg = NULL;
+    static RRDDIM *dim_max = NULL;
+
+    if (unlikely(!chart)) {
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_latency_mqtt", NULL, "aclk_stats", NULL, "ACLK Message Publish Latency", "ms",
+            "netdata", "stats", 200002, localhost->rrd_update_every, RRDSET_TYPE_LINE);
+
+        dim_avg = rrddim_add(chart, "avg", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        dim_max = rrddim_add(chart, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(chart);
+    }
+
+    // No measurements in this sample: report 0 rather than dividing by zero.
+    float average = 0;
+    if (per_sample->latency_count)
+        average = roundf((float)per_sample->latency_total / per_sample->latency_count);
+
+    rrddim_set_by_pointer(chart, dim_avg, average);
+    rrddim_set_by_pointer(chart, dim_max, per_sample->latency_max);
+
+    rrdset_done(chart);
+}
+#endif
+
+/**
+ * Publish the bytes added to and consumed from the write queue during the
+ * last sample on the chart created with type "netdata" and id "aclk_write_q".
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ */
+static void aclk_stats_write_q(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_added = NULL;
+    static RRDDIM *dim_consumed = NULL;
+
+    if (unlikely(!chart)) {
+        // Byte counts per sample are divided by 1024 * update interval to get
+        // the "kB/s" unit; "consumed" uses the negated divisor so its values
+        // come out negative.
+        const int divisor = 1024 * localhost->rrd_update_every;
+
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_write_q", NULL, "aclk_stats", NULL, "Write Queue Mosq->Libwebsockets", "kB/s",
+            "netdata", "stats", 200003, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        dim_added = rrddim_add(chart, "added", NULL, 1, divisor, RRD_ALGORITHM_ABSOLUTE);
+        dim_consumed = rrddim_add(chart, "consumed", NULL, 1, -divisor, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(chart);
+    }
+
+    rrddim_set_by_pointer(chart, dim_added, per_sample->write_q_added);
+    rrddim_set_by_pointer(chart, dim_consumed, per_sample->write_q_consumed);
+
+    rrdset_done(chart);
+}
+
+/**
+ * Publish the bytes added to and consumed from the read queue during the
+ * last sample on the chart created with type "netdata" and id "aclk_read_q".
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ */
+static void aclk_stats_read_q(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *chart = NULL;
+    static RRDDIM *dim_added = NULL;
+    static RRDDIM *dim_consumed = NULL;
+
+    if (unlikely(!chart)) {
+        // Byte counts per sample are divided by 1024 * update interval to get
+        // the "kB/s" unit; "consumed" uses the negated divisor so its values
+        // come out negative.
+        const int divisor = 1024 * localhost->rrd_update_every;
+
+        chart = rrdset_create_localhost(
+            "netdata", "aclk_read_q", NULL, "aclk_stats", NULL, "Read Queue Libwebsockets->Mosq", "kB/s",
+            "netdata", "stats", 200004, localhost->rrd_update_every, RRDSET_TYPE_AREA);
+
+        dim_added = rrddim_add(chart, "added", NULL, 1, divisor, RRD_ALGORITHM_ABSOLUTE);
+        dim_consumed = rrddim_add(chart, "consumed", NULL, 1, -divisor, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(chart);
+    }
+
+    rrddim_set_by_pointer(chart, dim_added, per_sample->read_q_added);
+    rrddim_set_by_pointer(chart, dim_consumed, per_sample->read_q_consumed);
+
+    rrdset_done(chart);
+}
+
+/**
+ * Publish the number of cloud requests seen during the last sample on the
+ * chart created with type "netdata" and id "aclk_cloud_req".
+ *
+ * The chart is stacked: "received" carries the requests that were not counted
+ * as errors and "malformed" carries the error count, so together they add up
+ * to the total number of requests.
+ *
+ * @param per_sample snapshot of the counters gathered during the last sample
+ */
+static void aclk_stats_cloud_req(struct aclk_metrics_per_sample *per_sample)
+{
+    static RRDSET *st = NULL;
+    static RRDDIM *rd_rq_rcvd = NULL;
+    static RRDDIM *rd_rq_err = NULL;
+
+    if (unlikely(!st)) {
+        st = rrdset_create_localhost(
+            "netdata", "aclk_cloud_req", NULL, "aclk_stats", NULL, "Requests received from cloud", "req/s",
+            "netdata", "stats", 200005, localhost->rrd_update_every, RRDSET_TYPE_STACKED);
+
+        rd_rq_rcvd = rrddim_add(st, "received", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+        rd_rq_err = rrddim_add(st, "malformed", NULL, 1, localhost->rrd_update_every, RRD_ALGORITHM_ABSOLUTE);
+    } else {
+        rrdset_next(st);
+    }
+
+    const uint32_t total = per_sample->cloud_req_recvd;
+    const uint32_t malformed = per_sample->cloud_req_err;
+
+    // The received and error counters are incremented in separate critical
+    // sections, so the per-sample reset can fall between the two increments
+    // of one request. That sample then has more errors than requests, and a
+    // plain unsigned subtraction would wrap to a value close to 2^32.
+    const uint32_t well_formed = (total > malformed) ? (total - malformed) : 0;
+
+    rrddim_set_by_pointer(st, rd_rq_rcvd, well_formed);
+    rrddim_set_by_pointer(st, rd_rq_err, malformed);
+
+    rrdset_done(st);
+}
+
+/**
+ * Entry point of the ACLK statistics thread.
+ *
+ * Once per update interval it takes a snapshot of the shared counters under
+ * ACLK_STATS_LOCK, resets the per-sample counters, and then feeds the snapshot
+ * to the individual chart helpers. The loop ends when netdata_exit is set.
+ *
+ * @param ptr unused
+ * @return always NULL
+ */
+void *aclk_stats_main_thread(void *ptr)
+{
+    UNUSED(ptr);
+    heartbeat_t hb;
+    heartbeat_init(&hb);
+    usec_t step_ut = localhost->rrd_update_every * USEC_PER_SEC;
+
+    // aclk_stats_enabled is already set when this thread is created, so other
+    // threads may be updating the counters under the lock right now. Take the
+    // lock for the initial reset as well.
+    ACLK_STATS_LOCK;
+    memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample));
+    ACLK_STATS_UNLOCK;
+
+    struct aclk_metrics_per_sample per_sample;
+    struct aclk_metrics permanent;
+
+    while (!netdata_exit) {
+        netdata_thread_testcancel();
+        // ------------------------------------------------------------------------
+        // Wait for the next iteration point.
+
+        heartbeat_next(&hb, step_ut);
+
+        ACLK_STATS_LOCK;
+        // to not hold lock longer than necessary, especially not to hold it
+        // during database rrd* operations
+        memcpy(&per_sample, &aclk_metrics_per_sample, sizeof(struct aclk_metrics_per_sample));
+        memcpy(&permanent, &aclk_metrics, sizeof(struct aclk_metrics));
+        memset(&aclk_metrics_per_sample, 0, sizeof(struct aclk_metrics_per_sample));
+        ACLK_STATS_UNLOCK;
+
+        aclk_stats_collect(&per_sample, &permanent);
+        aclk_stats_query_thread(&per_sample);
+#ifdef NETDATA_INTERNAL_CHECKS
+        aclk_stats_latency(&per_sample);
+#endif
+        aclk_stats_write_q(&per_sample);
+        aclk_stats_read_q(&per_sample);
+
+        aclk_stats_cloud_req(&per_sample);
+    }
+
+    return NULL;
+}
+
+/**
+ * Record a change of the ACLK connection state for the statistics thread.
+ *
+ * Does nothing while statistics are disabled. Going offline additionally
+ * raises the per-sample offline flag, so the current sample is reported as
+ * offline even if the connection comes back before the sample ends.
+ *
+ * @param online non-zero when connected, 0 when disconnected
+ */
+void aclk_stats_upd_online(int online)
+{
+    if (aclk_stats_enabled) {
+        ACLK_STATS_LOCK;
+
+        if (!online)
+            aclk_metrics_per_sample.offline_during_sample = 1;
+
+        aclk_metrics.online = online;
+
+        ACLK_STATS_UNLOCK;
+    }
+}
diff --git a/aclk/aclk_stats.h b/aclk/aclk_stats.h
new file mode 100644
index 0000000000..f2a5b1a518
--- /dev/null
+++ b/aclk/aclk_stats.h
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_ACLK_STATS_H
+#define NETDATA_ACLK_STATS_H
+
+#include "../daemon/common.h"
+#include "libnetdata/libnetdata.h"
+
+#define ACLK_STATS_THREAD_NAME "ACLK_Stats" // name passed when the statistics thread is created
+
+extern netdata_mutex_t aclk_stats_mutex; // guards aclk_metrics and aclk_metrics_per_sample
+
+#define ACLK_STATS_LOCK netdata_mutex_lock(&aclk_stats_mutex) // acquire the statistics mutex
+#define ACLK_STATS_UNLOCK netdata_mutex_unlock(&aclk_stats_mutex) // release the statistics mutex
+
+extern int aclk_stats_enabled; // non-zero when ACLK statistics collection is enabled
+
+// state preserved between samples (copied by the stats thread, never reset by it)
+struct aclk_metrics {
+ volatile uint8_t online; // last value passed to aclk_stats_upd_online()
+};
+
+// Counters for the current sample; reset to 0 on every sample.
+// All updates are expected to happen under ACLK_STATS_LOCK.
+extern struct aclk_metrics_per_sample {
+    /* in the unlikely event of ACLK disconnecting
+       and reconnecting under 1 sampling rate
+       we want to make sure we record the disconnection
+       despite it being then seemingly longer in graph */
+    volatile uint8_t offline_during_sample;
+
+    /* Number of queries queued / dispatched during the sample.
+       An 8-bit counter wraps after 255 events in one sample, so these are
+       16 bits wide. They are deliberately not wider than that: a 16-bit
+       value is still promoted to signed int, which keeps code that negates
+       the field directly (to chart it as a negative value) correct. */
+    volatile uint16_t queries_queued;
+    volatile uint16_t queries_dispatched;
+#ifdef NETDATA_INTERNAL_CHECKS
+    /* message publish latency, charted in ms */
+    volatile uint32_t latency_max;
+    volatile uint32_t latency_total;
+    volatile uint32_t latency_count;
+#endif
+    /* bytes appended to / written out of the write queue */
+    volatile uint32_t write_q_added;
+    volatile uint32_t write_q_consumed;
+
+    /* bytes stored into / read out of the read queue */
+    volatile uint32_t read_q_added;
+    volatile uint32_t read_q_consumed;
+
+    /* cloud requests handled, and how many of them were counted as errors */
+    volatile uint32_t cloud_req_recvd;
+    volatile uint32_t cloud_req_err;
+} aclk_metrics_per_sample;
+
+void *aclk_stats_main_thread(void *ptr); // statistics thread entry point; ptr is unused
+void aclk_stats_upd_online(int online); // record connection state (0 = offline); no-op while stats are disabled
+
+#endif /* NETDATA_ACLK_STATS_H */
diff --git a/aclk/agent_cloud_link.c b/aclk/agent_cloud_link.c
index 05c0225ad8..cc366a8332 100644
--- a/aclk/agent_cloud_link.c
+++ b/aclk/agent_cloud_link.c
@@ -4,6 +4,7 @@
#include "agent_cloud_link.h"
#include "aclk_lws_https_client.h"
#include "aclk_common.h"
+#include "aclk_stats.h"
int aclk_shutting_down = 0;
// State-machine for the on-connect metadata transmission.
@@ -324,6 +325,12 @@ int aclk_queue_query(char *topic, char *data, char *msg_id, char *query, int run
aclk_queue.count--;
}
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.queries_queued++;
+ ACLK_STATS_UNLOCK;
+ }
+
new_query = callocz(1, sizeof(struct aclk_query));
new_query->cmd = aclk_cmd;
if (internal) {
@@ -894,6 +901,12 @@ int aclk_process_query()
aclk_query_free(this_query);
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.queries_dispatched++;
+ ACLK_STATS_UNLOCK;
+ }
+
return 1;
}
@@ -1358,6 +1371,7 @@ void *aclk_main(void *ptr)
{
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
struct netdata_static_thread *query_thread;
+ struct netdata_static_thread *stats_thread = NULL;
// This thread is unusual in that it cannot be cancelled by cancel_main_threads()
// as it must notify the far end that it shutdown gracefully and avoid the LWT.
@@ -1383,6 +1397,15 @@ void *aclk_main(void *ptr)
}
}
+ aclk_stats_enabled = appconfig_get_boolean(&cloud_config, CONFIG_SECTION_GLOBAL, "statistics", CONFIG_BOOLEAN_YES);
+ if (aclk_stats_enabled) {
+ stats_thread = callocz(1, sizeof(struct netdata_static_thread));
+ stats_thread->thread = mallocz(sizeof(netdata_thread_t));
+ netdata_thread_create(
+ stats_thread->thread, ACLK_STATS_THREAD_NAME, NETDATA_THREAD_OPTION_JOINABLE, aclk_stats_main_thread,
+ stats_thread);
+ }
+
last_init_sequence = now_realtime_sec();
query_thread = NULL;
@@ -1502,6 +1525,13 @@ exited:
RSA_free(aclk_private_key);
aclk_main_cleanup(ptr);
+
+ if(aclk_stats_enabled) {
+ netdata_thread_join(*stats_thread->thread, NULL);
+ freez(stats_thread->thread);
+ freez(stats_thread);
+ }
+
return NULL;
}
@@ -1587,6 +1617,9 @@ int aclk_subscribe(char *sub_topic, int qos)
void aclk_connect()
{
info("Connection detected (%"PRIu64" queued queries)", aclk_queue.count);
+
+ aclk_stats_upd_online(1);
+
aclk_connected = 1;
waiting_init = 0;
aclk_reconnect_delay(0);
@@ -1599,6 +1632,9 @@ void aclk_disconnect()
{
if (likely(aclk_connected))
info("Disconnect detected (%"PRIu64" queued queries)", aclk_queue.count);
+
+ aclk_stats_upd_online(0);
+
aclk_subscribed = 0;
aclk_metadata_submitted = ACLK_METADATA_REQUIRED;
waiting_init = 1;
@@ -1901,6 +1937,11 @@ int aclk_handle_cloud_request(char *payload)
.type_id = NULL, .msg_id = NULL, .callback_topic = NULL, .payload = NULL, .version = 0
};
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.cloud_req_recvd++;
+ ACLK_STATS_UNLOCK;
+ }
if (unlikely(agent_state == AGENT_INITIALIZING)) {
debug(D_ACLK, "Ignoring cloud request; agent not in stable state");
@@ -1938,6 +1979,12 @@ int aclk_handle_cloud_request(char *payload)
if (cloud_to_agent.callback_topic)
freez(cloud_to_agent.callback_topic);
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ aclk_metrics_per_sample.cloud_req_err++;
+ ACLK_STATS_UNLOCK;
+ }
+
return 1;
}
diff --git a/aclk/mqtt.c b/aclk/mqtt.c
index f1ee629742..26164bbd92 100644
--- a/aclk/mqtt.c
+++ b/aclk/mqtt.c
@@ -4,6 +4,7 @@
#include "../daemon/common.h"
#include "mqtt.h"
#include "aclk_lws_wss_client.h"
+#include "aclk_stats.h"
extern usec_t aclk_session_us;
extern time_t aclk_session_sec;
@@ -38,8 +39,19 @@ void publish_callback(struct mosquitto *mosq, void *obj, int rc)
now_realtime_timeval(&now);
orig = &sendTimes[ rc & 0x3ff ];
int64_t diff = (now.tv_sec - orig->tv_sec) * USEC_PER_SEC + (now.tv_usec - orig->tv_usec);
+ diff /= 1000;
- info("Publish_callback: mid=%d latency=%" PRId64 "ms", rc, diff / 1000);
+ info("Publish_callback: mid=%d latency=%" PRId64 "ms", rc, diff);
+
+ if (aclk_stats_enabled) {
+ ACLK_STATS_LOCK;
+ if (aclk_metrics_per_sample.latency_max < diff)
+ aclk_metrics_per_sample.latency_max = diff;
+
+ aclk_metrics_per_sample.latency_total += diff;
+ aclk_metrics_per_sample.latency_count++;
+ ACLK_STATS_UNLOCK;
+ }
#endif
return;
}