summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Costa Tsaousis <costa@netdata.cloud> 2022-07-24 22:33:09 +0300
committer GitHub <noreply@github.com> 2022-07-24 22:33:09 +0300
commit 291b978282e1066a43a80658bd1b49be0fbb2eaf (patch)
tree 63bffa5ceb270ad05ffe3187d4fb7e18cb56ff35
parent 94040523c73c0f515ea3d682ab0a20c11400a43c (diff)
Rrdcontext (#13335)
* type checking on dictionary return values * first STRING implementation, used by DICTIONARY and RRDLABEL * enable AVL compilation of STRING * Initial functions to store context info * Call simple test functions * Add host_id when getting charts * Allow host to be null and in this case it will process the localhost * Simplify init Do not use strdupz - link directly to sqlite result set * Init the database during startup * make it compile - no functionality yet * intermediate commit * intermediate * first interface to sql * loading instances * check if we need to update cloud * comparison of rrdcontext on conflict * merge context titles * rrdcontext public interface; statistics on STRING; scratchpad on DICTIONARY * dictionaries maintain version numbers; rrdcontext api * cascading changes * first operational cleanup * string unittest * proper cleanup of referenced dictionaries * added rrdmetrics * rrdmetric starting retention * Add fields to context Adjust context creation and delete * Memory cleanup * Fix get context list Fix memory double free in tests Store context with two hosts * calculated retention * rrdcontext retention with collection * Persist database and shutdown * loading all from sql * Get chart list and dimension list changes * fully working attempt 1 * fully working attempt 2 * missing archived flag from log * fixed archived / collected * operational * proper cleanup * cleanup - implemented all interface functions - dictionary react callback triggers after the dictionary is unlocked * track all reasons for changes * proper tracking of reasons of changes * fully working thread * better versioning of contexts * fix string indexing with AVL * running version per context vs hub version; ifdef dbengine * added option to disable rrdmetrics * release old context when a chart changes context * cleanup properly * renamed config * cleanup contexts; general cleanup; * deletion inline with dequeue; lots of cleanup; child connected/disconnected * ml should start 
after rrdcontext * added missing NULL to ri->rrdset; rrdcontext flags are now only changed under a mutex lock * fix buggy STRING under AVL * Rework database initialization Add migration logic to the context database * fix data race conditions during context deletion * added version hash algorithm * fix string over AVL * update aclk-schemas * compile new ctx related protos * add ctx stream message utils * add context messages * add dummy rx message handlers * add the new topics * add ctx capability * add helper functions to send the new messages * update cmake build to not fail * update topic names * handle rrdcontext_enabled * add more functions * fatal on OOM cases instead of return NULL * silence unknown query type error * fully working attempt 1 * fully working attempt 2 * allow compiling without ACLK * added family to the context * removed excess character in UUID * smarter merging of titles and families * Database migration code to add family Add family to SQL_CHART_DATA and VERSIONED_CONTEXT_DATA * add family to context message * enable ctx in communication * hardcoded enabled contexts * Add hard code for CTX * add update node collectors to json * add context message log * fix log about last_time_t * fix collected flags for queued items * prevent crash on charts cleanup * fix bug in AVL indexing of dictionaries; make sure react callback of dictionaries has a reference counter, which is acquired while the dictionary is locked * fixed dictionary unittest * strict policy to cleanup and garbage collector * fix db rotation and garbage collection timings * remove deadlock * proper garbage collection - a lot faster retention recalculation * Added not NULL in database columns Remove migration code for context -- we will ship with version 1 of the table schema Added define for query in tests to detect localhost * Use UUID_STR_LEN instead of GUID_LEN + 1 Use realistic timestamps when adding test data in the database * Add NULL checks for passed parameters * Log deleted 
context when compiled with NETDATA_INTERNAL_CHECKS * Error checking for null host id * add missing ContextsCheckpoint log convertor * Fix spelling in VACCUM * Hold additional information for host -- prepare to load archived hosts on startup * Make sure claim id is valid * is_get_claimed is actually get the current claim id * Simplify ctx get chart list query * remove env negotiation * fix string unittest when there are some strings already in the index * propagate live-retention flag upstream; cleanup all update reasons; updated instances logging; automated attaching started/stopped collecting flags; * first implementation of /api/v1/contexts * full contexts API; updated swagger * disabled debugging; rrdcontext enabled by default * final cleanup and renaming of global variables * return current time on currently collected contexts, charts and dimensions * added option "deepscan" to the API to have the server refresh the retention and recalculate the contexts on the fly * fixed indentation of yaml * Add constraints to the host table * host->node_id may not be available * new capabilities * lock the context while rendering json * update aclk-schemas * added permanent labels to all charts about plugin, module and family; added labels to all proc plugin modules * always add the labels * allow merging of families down to [x] * dont show uuids by default, added option to enable them; response is now accepting after,before to show only data for a specific timeframe; deleted items are only shown when "deleted" is requested; hub version is now shown when "queue" is requested * Use the localhost claim id * Fix to handle host constraints better * cgroups: add "k8s." 
prefix to chart context in k8s * Improve sqlite metadata version migration check * empty values set to "[none]"; fix labels unit test to reflect that * Check if we reached the version we want first (address CODACY report re: Array index 'i' is used before limits check) * Rewrite condition to address CODACY report (Redundant condition: t->filter_callback. '!A || (A && B)' is equivalent to '!A || B') * Properly unlock context * fixed memory leak on rrdcontexts - it was not freeing all dictionaries in rrdhost; added wait of up to 100ms on dictionary_destroy() to give time to dictionaries to release their items before destroying them * fixed memory leak on rrdlabels not freed on rrdinstances * fixed leak when dimensions and charts are redefined * Mark entries for charts and dimensions as submitted to the cloud 3600 seconds after their creation Mark entries for charts and dimensions as updated (confirmed by the cloud) 1800 seconds after their submission * renamed struct string * update cgroups alarms * fixed codacy suggestions * update dashboard info * fix k8s_cgroup_10s_received_packets_storm alarm * added filtering options to /api/v1/contexts and /api/v1/context * fix eslint * fix eslint * Fix pointer binding for host / chart uuids * Fix cgroups unit tests * fixed non-retention updates not propagated upstream * removed non-fatal fatals * Remove context from 2 way string merge. * Move string_2way_merge to dictionary.c * Add 2-way string merge tests. 
* split long lines * fix indentation in netdata-swagger.yaml * update netdata-swagger.json * yamllint please * remove the deleted flag when a context is collected * fix yaml warning in swagger * removed non-fatal fatals * charts should now be able to switch contexts * allow deletion of unused metrics, instances and contexts * keep the queued flag * cleanup old rrdinstance labels * dont hide objects when there is no filter; mark objects as deleted when there are no sub-objects * delete old instances once they changed context * delete all instances and contexts that do not have sub-objects * more precise transitions * Load archived hosts on startup (part 1) * update the queued time every time * disable by default; dedup deleted dimensions after snapshot * Load archived hosts on startup (part 2) * delayed processing of events until charts are being collected * remove dont-trigger flag when object is collected * polish all triggers given the new dont_process flag * Remove always true condition Enums for readability / create_host_callback only if ACLK is enabled (for now) * Skip retention message if context streaming is enabled Add messages in the access log if context streaming is enabled * Check for node id being a UUID that can be parsed Improve error check / reporting when loading archived hosts and creating ACLK sync threads * collected, archived, deleted are now mutually exclusive * Enable the "orphan" handling for now Remove dead code Fix memory leak on free host * Queue charts and dimensions will be no-op if host is set to stream contexts * removed unused parameter and made sure flags are set on rrdcontext insert * make the rrdcontext thread abort mid-work when exiting * Skip chart hash computation and storage if contexts streaming is enabled Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Timo <timotej@netdata.cloud> Co-authored-by: ilyam8 <ilya@netdata.cloud> Co-authored-by: Vladimir Kobal <vlad@prokk.net> 
Co-authored-by: Vasilis Kalintiris <vasilis@netdata.cloud>
-rw-r--r--CMakeLists.txt12
-rw-r--r--Makefile.am24
m---------aclk/aclk-schemas0
-rw-r--r--aclk/aclk.c32
-rw-r--r--aclk/aclk_api.c1
-rw-r--r--aclk/aclk_api.h1
-rw-r--r--aclk/aclk_contexts_api.c23
-rw-r--r--aclk/aclk_contexts_api.h12
-rw-r--r--aclk/aclk_otp.c9
-rw-r--r--aclk/aclk_query.c1
-rw-r--r--aclk/aclk_query_queue.c1
-rw-r--r--aclk/aclk_query_queue.h1
-rw-r--r--aclk/aclk_rx_msgs.c47
-rw-r--r--aclk/aclk_tx_msgs.c1
-rw-r--r--aclk/aclk_util.c4
-rw-r--r--aclk/aclk_util.h4
-rw-r--r--aclk/schema-wrappers/context.cc125
-rw-r--r--aclk/schema-wrappers/context.h53
-rw-r--r--aclk/schema-wrappers/context_stream.cc42
-rw-r--r--aclk/schema-wrappers/context_stream.h36
-rw-r--r--aclk/schema-wrappers/node_connection.cc9
-rw-r--r--aclk/schema-wrappers/node_connection.h3
-rw-r--r--aclk/schema-wrappers/proto_2_json.cc12
-rw-r--r--aclk/schema-wrappers/schema_wrappers.h2
-rw-r--r--claim/claim.c2
-rw-r--r--claim/claim.h2
-rw-r--r--collectors/cgroups.plugin/sys_fs_cgroup.c87
-rw-r--r--collectors/cgroups.plugin/tests/test_doubles.c3
-rw-r--r--collectors/cups.plugin/cups_plugin.c12
-rw-r--r--collectors/diskspace.plugin/plugin_diskspace.c4
-rw-r--r--collectors/proc.plugin/plugin_proc.h7
-rw-r--r--collectors/proc.plugin/proc_diskstats.c125
-rw-r--r--collectors/proc.plugin/proc_interrupts.c4
-rw-r--r--collectors/proc.plugin/proc_mdstat.c38
-rw-r--r--collectors/proc.plugin/proc_net_dev.c55
-rw-r--r--collectors/proc.plugin/proc_net_wireless.c274
-rw-r--r--collectors/proc.plugin/proc_pagetypeinfo.c12
-rw-r--r--collectors/proc.plugin/proc_softirqs.c4
-rw-r--r--collectors/proc.plugin/proc_stat.c4
-rw-r--r--collectors/proc.plugin/sys_block_zram.c4
-rw-r--r--collectors/proc.plugin/sys_class_power_supply.c8
-rw-r--r--collectors/proc.plugin/sys_devices_system_node.c2
-rw-r--r--collectors/proc.plugin/sys_fs_btrfs.c13
-rw-r--r--daemon/analytics.c2
-rw-r--r--daemon/global_statistics.c1
-rw-r--r--daemon/main.c11
-rw-r--r--daemon/static_threads.c10
-rw-r--r--daemon/unit_test.c3
-rw-r--r--database/engine/rrdengine.c1
-rwxr-xr-xdatabase/engine/rrdengineapi.c30
-rw-r--r--database/engine/rrdengineapi.h1
-rw-r--r--database/rrd.h59
-rw-r--r--database/rrdcontext.c2830
-rw-r--r--database/rrdcontext.h91
-rw-r--r--database/rrddim.c12
-rw-r--r--database/rrdhost.c176
-rw-r--r--database/rrdlabels.c94
-rw-r--r--database/rrdset.c26
-rw-r--r--database/sqlite/sqlite_aclk.c95
-rw-r--r--database/sqlite/sqlite_aclk.h2
-rw-r--r--database/sqlite/sqlite_aclk_alert.c6
-rw-r--r--database/sqlite/sqlite_aclk_chart.c30
-rw-r--r--database/sqlite/sqlite_aclk_node.c5
-rw-r--r--database/sqlite/sqlite_context.c598
-rw-r--r--database/sqlite/sqlite_context.h68
-rw-r--r--database/sqlite/sqlite_db_migration.c97
-rw-r--r--database/sqlite/sqlite_db_migration.h1
-rw-r--r--database/sqlite/sqlite_functions.c159
-rw-r--r--database/sqlite/sqlite_functions.h8
-rw-r--r--health/health.d/cgroups.conf70
-rw-r--r--libnetdata/dictionary/README.md6
-rw-r--r--libnetdata/dictionary/dictionary.c800
-rw-r--r--libnetdata/dictionary/dictionary.h87
-rw-r--r--streaming/receiver.c5
-rw-r--r--streaming/rrdpush.c3
-rw-r--r--streaming/rrdpush.h2
-rw-r--r--streaming/sender.c20
-rw-r--r--web/api/netdata-swagger.json302
-rw-r--r--web/api/netdata-swagger.yaml282
-rw-r--r--web/api/queries/query.c4
-rw-r--r--web/api/web_api_v1.c157
-rw-r--r--web/gui/dashboard_info.js486
82 files changed, 6977 insertions, 778 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d0d9559b3..0babcc11b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -710,6 +710,8 @@ set(RRD_PLUGIN_FILES
database/rrdcalc.h
database/rrdcalctemplate.c
database/rrdcalctemplate.h
+ database/rrdcontext.c
+ database/rrdcontext.h
database/rrddim.c
database/rrddimvar.c
database/rrddimvar.h
@@ -729,6 +731,8 @@ set(RRD_PLUGIN_FILES
database/ram/rrddim_mem.h
database/sqlite/sqlite_functions.c
database/sqlite/sqlite_functions.h
+ database/sqlite/sqlite_context.c
+ database/sqlite/sqlite_context.h
database/sqlite/sqlite_db_migration.c
database/sqlite/sqlite_db_migration.h
database/sqlite/sqlite_aclk.c
@@ -881,6 +885,8 @@ set(ACLK_FILES
aclk/aclk_charts_api.h
aclk/aclk_alarm_api.c
aclk/aclk_alarm_api.h
+ aclk/aclk_contexts_api.c
+ aclk/aclk_contexts_api.h
mqtt_websockets/src/mqtt_wss_client.c
mqtt_websockets/src/include/mqtt_wss_client.h
mqtt_websockets/src/mqtt_wss_log.c
@@ -917,6 +923,10 @@ set(ACLK_FILES
aclk/schema-wrappers/capability.h
aclk/schema-wrappers/proto_2_json.cc
aclk/schema-wrappers/proto_2_json.h
+ aclk/schema-wrappers/context_stream.cc
+ aclk/schema-wrappers/context_stream.h
+ aclk/schema-wrappers/context.cc
+ aclk/schema-wrappers/context.h
aclk/schema-wrappers/schema_wrappers.h
aclk/schema-wrappers/schema_wrapper_utils.cc
aclk/schema-wrappers/schema_wrapper_utils.h
@@ -1231,6 +1241,8 @@ set(ACLK_PROTO_DEFS
aclk/aclk-schemas/proto/nodeinstance/connection/v1/connection.proto
aclk/aclk-schemas/proto/nodeinstance/create/v1/creation.proto
aclk/aclk-schemas/proto/nodeinstance/info/v1/info.proto
+ aclk/aclk-schemas/proto/context/v1/context.proto
+ aclk/aclk-schemas/proto/context/v1/stream.proto
)
PROTOBUF_ACLK_GENERATE_CPP(ACLK_PROTO_BUILT_SRCS ACLK_PROTO_BUILT_HDRS ${ACLK_PROTO_DEFS})
diff --git a/Makefile.am b/Makefile.am
index f4ed61eb91..5eefdc13cf 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -434,6 +434,8 @@ RRD_PLUGIN_FILES = \
database/rrdcalc.h \
database/rrdcalctemplate.c \
database/rrdcalctemplate.h \
+ database/rrdcontext.c \
+ database/rrdcontext.h \
database/rrddim.c \
database/rrddimvar.c \
database/rrddimvar.h \
@@ -453,6 +455,8 @@ RRD_PLUGIN_FILES = \
database/ram/rrddim_mem.h \
database/sqlite/sqlite_functions.c \
database/sqlite/sqlite_functions.h \
+ database/sqlite/sqlite_context.c \
+ database/sqlite/sqlite_context.h \
database/sqlite/sqlite_db_migration.c \
database/sqlite/sqlite_db_migration.h \
database/sqlite/sqlite_aclk.c \
@@ -706,6 +710,8 @@ ACLK_FILES = \
aclk/aclk_charts_api.h \
aclk/aclk_alarm_api.c \
aclk/aclk_alarm_api.h \
+ aclk/aclk_contexts_api.c \
+ aclk/aclk_contexts_api.h \
aclk/schema-wrappers/connection.cc \
aclk/schema-wrappers/connection.h \
aclk/schema-wrappers/node_connection.cc \
@@ -729,6 +735,10 @@ ACLK_FILES = \
aclk/schema-wrappers/schema_wrappers.h \
aclk/schema-wrappers/schema_wrapper_utils.cc \
aclk/schema-wrappers/schema_wrapper_utils.h \
+ aclk/schema-wrappers/context_stream.cc \
+ aclk/schema-wrappers/context_stream.h \
+ aclk/schema-wrappers/context.cc \
+ aclk/schema-wrappers/context.h \
$(NULL)
mqtt_websockets/src/mqtt_wss_client.$(OBJEXT) : CFLAGS += -Wno-unused-result
@@ -746,6 +756,8 @@ ACLK_PROTO_DEFINITIONS = \
aclk/aclk-schemas/proto/nodeinstance/connection/v1/connection.proto \
aclk/aclk-schemas/proto/nodeinstance/create/v1/creation.proto \
aclk/aclk-schemas/proto/nodeinstance/info/v1/info.proto \
+ aclk/aclk-schemas/proto/context/v1/context.proto \
+ aclk/aclk-schemas/proto/context/v1/stream.proto \
$(NULL)
dist_noinst_DATA += $(ACLK_PROTO_DEFINITIONS)
@@ -774,6 +786,10 @@ ACLK_PROTO_BUILT_FILES = aclk/aclk-schemas/proto/agent/v1/connection.pb.cc \
aclk/aclk-schemas/proto/alarm/v1/stream.pb.h \
aclk/aclk-schemas/proto/nodeinstance/info/v1/info.pb.cc \
aclk/aclk-schemas/proto/nodeinstance/info/v1/info.pb.h \
+ aclk/aclk-schemas/proto/context/v1/context.pb.cc \
+ aclk/aclk-schemas/proto/context/v1/context.pb.h \
+ aclk/aclk-schemas/proto/context/v1/stream.pb.cc \
+ aclk/aclk-schemas/proto/context/v1/stream.pb.h \
$(NULL)
BUILT_SOURCES += $(ACLK_PROTO_BUILT_FILES)
@@ -828,6 +844,14 @@ aclk/aclk-schemas/proto/nodeinstance/info/v1/info.pb.cc \
aclk/aclk-schemas/proto/nodeinstance/info/v1/info.pb.h: aclk/aclk-schemas/proto/nodeinstance/info/v1/info.proto
$(PROTOC) -I=aclk/aclk-schemas --cpp_out=$(builddir)/aclk/aclk-schemas $^
+aclk/aclk-schemas/proto/context/v1/context.pb.cc \
+aclk/aclk-schemas/proto/context/v1/context.pb.h: aclk/aclk-schemas/proto/context/v1/context.proto
+ $(PROTOC) -I=aclk/aclk-schemas --cpp_out=$(builddir)/aclk/aclk-schemas $^
+
+aclk/aclk-schemas/proto/context/v1/stream.pb.cc \
+aclk/aclk-schemas/proto/context/v1/stream.pb.h: aclk/aclk-schemas/proto/context/v1/stream.proto
+ $(PROTOC) -I=aclk/aclk-schemas --cpp_out=$(builddir)/aclk/aclk-schemas $^
+
endif #ENABLE_ACLK
ACLK_ALWAYS_BUILD_FILES = \
diff --git a/aclk/aclk-schemas b/aclk/aclk-schemas
-Subproject fa46ccca237a9bdb613b3b1f2809a25b7b45c7c
+Subproject 3252118bd547640251356629f0df05eaf952ac3
diff --git a/aclk/aclk.c b/aclk/aclk.c
index 4477f1bb5f..7b3641b1e2 100644
--- a/aclk/aclk.c
+++ b/aclk/aclk.c
@@ -141,12 +141,12 @@ static int wait_till_cloud_enabled()
static int wait_till_agent_claimed(void)
{
//TODO prevent malloc and freez
- char *agent_id = is_agent_claimed();
+ char *agent_id = get_agent_claimid();
while (likely(!agent_id)) {
sleep_usec(USEC_PER_SEC * 1);
if (netdata_exit)
return 1;
- agent_id = is_agent_claimed();
+ agent_id = get_agent_claimid();
}
freez(agent_id);
return 0;
@@ -769,6 +769,16 @@ void aclk_host_state_update(RRDHOST *host, int cmd)
};
node_state_update.node_id = mallocz(UUID_STR_LEN);
uuid_unparse_lower(node_id, (char*)node_state_update.node_id);
+
+ struct capability caps[] = {
+ { .name = "proto", .version = 1, .enabled = 1 },
+ { .name = "ml", .version = ml_capable(localhost), .enabled = ml_enabled(host) },
+ { .name = "mc", .version = enable_metric_correlations ? metric_correlations_version : 0, .enabled = enable_metric_correlations },
+ { .name = "ctx", .version = 1, .enabled = rrdcontext_enabled },
+ { .name = NULL, .version = 0, .enabled = 0 }
+ };
+ node_state_update.capabilities = caps;
+
rrdhost_aclk_state_lock(localhost);
node_state_update.claim_id = localhost->aclk_state.claimed_id;
query->data.bin_payload.payload = generate_node_instance_connection(&query->data.bin_payload.size, &node_state_update);
@@ -801,6 +811,20 @@ void aclk_send_node_instances()
};
node_state_update.node_id = mallocz(UUID_STR_LEN);
uuid_unparse_lower(list->node_id, (char*)node_state_update.node_id);
+
+ char host_id[UUID_STR_LEN];
+ uuid_unparse_lower(list->host_id, host_id);
+
+ RRDHOST *host = rrdhost_find_by_guid(host_id, 0);
+ struct capability caps[] = {
+ { .name = "proto", .version = 1, .enabled = 1 },
+ { .name = "ml", .version = ml_capable(localhost), .enabled = host ? ml_enabled(host) : 0 },
+ { .name = "mc", .version = enable_metric_correlations ? metric_correlations_version : 0, .enabled = enable_metric_correlations },
+ { .name = "ctx", .version = 1, .enabled = rrdcontext_enabled },
+ { .name = NULL, .version = 0, .enabled = 0 }
+ };
+ node_state_update.capabilities = caps;
+
rrdhost_aclk_state_lock(localhost);
node_state_update.claim_id = localhost->aclk_state.claimed_id;
query->data.bin_payload.payload = generate_node_instance_connection(&query->data.bin_payload.size, &node_state_update);
@@ -913,7 +937,7 @@ char *ng_aclk_state(void)
);
buffer_sprintf(wb, "Protocol Used: Protobuf\nMQTT Version: %d\nClaimed: ", use_mqtt_5 ? 5 : 3);
- char *agent_id = is_agent_claimed();
+ char *agent_id = get_agent_claimid();
if (agent_id == NULL)
buffer_strcat(wb, "No\n");
else {
@@ -1079,7 +1103,7 @@ char *ng_aclk_state_json(