From 003df5f2b76973f898b44742b7e071ff2654343a Mon Sep 17 00:00:00 2001 From: vkalintiris Date: Thu, 13 Apr 2023 20:29:52 +0300 Subject: Save and load ML models (#14810) * Revert "Revert "Use static thread-pool for training. (#14702)" (#14782)" This reverts commit 5321ca8d1ef8d974a6a2b2128ca8804de6acb693. * Model I/O. * Minor changes Meant to make debugging crash issues easier on cloud VMs: - Less verbose logging - Higher logging history - Modify installer to use debug info by default * Fix ML initialization order. * read lock hosts when running detection. * Revert debugging changes. * Update ml/Config.cc Co-authored-by: Andrew Maguire --------- Co-authored-by: Andrew Maguire --- daemon/global_statistics.c | 28 +--------------------------- daemon/main.c | 18 +++++++++--------- daemon/main.h | 20 +++++++++----------- 3 files changed, 19 insertions(+), 47 deletions(-) (limited to 'daemon') diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c index 0dc3ee6452..ee68bebd15 100644 --- a/daemon/global_statistics.c +++ b/daemon/global_statistics.c @@ -827,33 +827,7 @@ static void global_statistics_charts(void) { rrdset_done(st_points_stored); } - { - static RRDSET *st = NULL; - static RRDDIM *rd = NULL; - - if (unlikely(!st)) { - st = rrdset_create_localhost( - "netdata" // type - , "ml_models_consulted" // id - , NULL // name - , NETDATA_ML_CHART_FAMILY // family - , NULL // context - , "KMeans models used for prediction" // title - , "models" // units - , NETDATA_ML_PLUGIN // plugin - , NETDATA_ML_MODULE_DETECTION // module - , NETDATA_ML_CHART_PRIO_MACHINE_LEARNING_STATUS // priority - , localhost->rrd_update_every // update_every - , RRDSET_TYPE_AREA // chart_type - ); - - rd = rrddim_add(st, "num_models_consulted", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); - } - - rrddim_set_by_pointer(st, rd, (collected_number) gs.ml_models_consulted); - - rrdset_done(st); - } + ml_update_global_statistics_charts(gs.ml_models_consulted); } // 
---------------------------------------------------------------------------- diff --git a/daemon/main.c b/daemon/main.c index 682106b78e..478b5d002c 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -148,10 +148,6 @@ static void service_to_buffer(BUFFER *wb, SERVICE_TYPE service) { buffer_strcat(wb, "MAINTENANCE "); if(service & SERVICE_COLLECTORS) buffer_strcat(wb, "COLLECTORS "); - if(service & SERVICE_ML_TRAINING) - buffer_strcat(wb, "ML_TRAINING "); - if(service & SERVICE_ML_PREDICTION) - buffer_strcat(wb, "ML_PREDICTION "); if(service & SERVICE_REPLICATION) buffer_strcat(wb, "REPLICATION "); if(service & ABILITY_DATA_QUERIES) @@ -340,6 +336,11 @@ void netdata_cleanup_and_exit(int ret) { } #endif + delta_shutdown_time("disable ML detection and training threads"); + + ml_stop_threads(); + ml_fini(); + delta_shutdown_time("disable maintenance, new queries, new web requests, new streaming connections and aclk"); service_signal_exit( @@ -351,12 +352,11 @@ void netdata_cleanup_and_exit(int ret) { | SERVICE_ACLKSYNC ); - delta_shutdown_time("stop replication, exporters, ML training, health and web servers threads"); + delta_shutdown_time("stop replication, exporters, health and web servers threads"); timeout = !service_wait_exit( SERVICE_REPLICATION | SERVICE_EXPORTERS - | SERVICE_ML_TRAINING | SERVICE_HEALTH | SERVICE_WEB_SERVER , 3 * USEC_PER_SEC); @@ -368,11 +368,10 @@ void netdata_cleanup_and_exit(int ret) { | SERVICE_STREAMING , 3 * USEC_PER_SEC); - delta_shutdown_time("stop ML prediction and context threads"); + delta_shutdown_time("stop context thread"); timeout = !service_wait_exit( - SERVICE_ML_PREDICTION - | SERVICE_CONTEXT + SERVICE_CONTEXT , 3 * USEC_PER_SEC); delta_shutdown_time("stop maintenance thread"); @@ -2085,6 +2084,7 @@ int main(int argc, char **argv) { } else debug(D_SYSTEM, "Not starting thread %s.", st->name); } + ml_start_threads(); // ------------------------------------------------------------------------ // Initialize netdata agent command 
serving from cli and signals diff --git a/daemon/main.h b/daemon/main.h index 7e659e939a..3e32c5ad6d 100644 --- a/daemon/main.h +++ b/daemon/main.h @@ -33,17 +33,15 @@ typedef enum { ABILITY_STREAMING_CONNECTIONS = (1 << 2), SERVICE_MAINTENANCE = (1 << 3), SERVICE_COLLECTORS = (1 << 4), - SERVICE_ML_TRAINING = (1 << 5), - SERVICE_ML_PREDICTION = (1 << 6), - SERVICE_REPLICATION = (1 << 7), - SERVICE_WEB_SERVER = (1 << 8), - SERVICE_ACLK = (1 << 9), - SERVICE_HEALTH = (1 << 10), - SERVICE_STREAMING = (1 << 11), - SERVICE_CONTEXT = (1 << 12), - SERVICE_ANALYTICS = (1 << 13), - SERVICE_EXPORTERS = (1 << 14), - SERVICE_ACLKSYNC = (1 << 15) + SERVICE_REPLICATION = (1 << 5), + SERVICE_WEB_SERVER = (1 << 6), + SERVICE_ACLK = (1 << 7), + SERVICE_HEALTH = (1 << 8), + SERVICE_STREAMING = (1 << 9), + SERVICE_CONTEXT = (1 << 10), + SERVICE_ANALYTICS = (1 << 11), + SERVICE_EXPORTERS = (1 << 12), + SERVICE_ACLKSYNC = (1 << 13) } SERVICE_TYPE; typedef enum { -- cgit v1.2.3