Save and load ML models (#14810)

* Revert "Revert "Use static thread-pool for training. (#14702)" (#14782)"

  This reverts commit 5321ca8d1ef8d974a6a2b2128ca8804de6acb693.

* Model I/O.

* Minor changes

  Meant to make debugging crash issues easier on cloud VMs:

  - Less verbose logging
  - Higher logging history
  - Modify installer to use debug info by default

* Fix ML initialization order.

* Read-lock hosts when running detection.

* Revert debugging changes.

* Update ml/Config.cc

  Co-authored-by: Andrew Maguire <andrewm4894@gmail.com>

---------

Co-authored-by: Andrew Maguire <andrewm4894@gmail.com>
author: vkalintiris <vasilis@netdata.cloud> 2023-04-13 20:29:52 +0300
committer: GitHub <noreply@github.com> 2023-04-13 20:29:52 +0300
commit: 003df5f2b76973f898b44742b7e071ff2654343a (patch)
tree: 46183f6a35edb887ab8f3de8a1d6e398438a6a0f /ml/ml-private.h
parent: 40f69dc20f7e40b7155d29a3f735ec4af29f4865 (diff)
1 files changed, 29 insertions, 13 deletions
diff --git a/ml/ml-private.h b/ml/ml-private.h
index 173b82e265..d014c71d26 100644
--- a/ml/ml-private.h
+++ b/ml/ml-private.h
@@ -33,14 +33,15 @@ typedef struct {
 /*
  * KMeans
  */
-typedef struct {
-    size_t num_clusters;
-    size_t max_iterations;
 
+typedef struct {
     std::vector<DSample> cluster_centers;
 
     calculated_number_t min_dist;
     calculated_number_t max_dist;
+
+    uint32_t after;
+    uint32_t before;
 } ml_kmeans_t;
 
 typedef struct machine_learning_stats_t {
@@ -123,6 +124,7 @@ enum ml_training_result {
 
 typedef struct {
     // Chart/dimension we want to train
+    STRING *host_id;
     STRING *chart_id;
     STRING *dimension_id;
 
@@ -168,6 +170,7 @@ typedef struct {
 /*
  * Queue
 */
+
 typedef struct {
     std::queue<ml_training_request_t> internal;
     netdata_mutex_t mutex;
@@ -175,7 +178,6 @@ typedef struct {
     std::atomic<bool> exit;
 } ml_queue_t;
 
-
 typedef struct {
     RRDDIM *rd;
 
@@ -207,19 +209,12 @@ typedef struct {
     RRDHOST *rh;
 
     ml_machine_learning_stats_t mls;
-    ml_training_stats_t ts;
 
     calculated_number_t host_anomaly_rate;
 
-    std::atomic<bool> threads_running;
-    std::atomic<bool> threads_cancelled;
-    std::atomic<bool> threads_joined;
-
-    ml_queue_t *training_queue;
-
     netdata_mutex_t mutex;
 
-    netdata_thread_t training_thread;
+    ml_queue_t *training_queue;
 
     /*
      * bookkeeping for anomaly detection charts
@@ -249,6 +244,19 @@ typedef struct {
     RRDSET *detector_events_rs;
     RRDDIM *detector_events_above_threshold_rd;
     RRDDIM *detector_events_new_anomaly_event_rd;
+} ml_host_t;
+
+typedef struct {
+    size_t id;
+    netdata_thread_t nd_thread;
+    netdata_mutex_t nd_mutex;
+
+    ml_queue_t *training_queue;
+    ml_training_stats_t training_stats;
+
+    calculated_number_t *training_cns;
+    calculated_number_t *scratch_training_cns;
+    std::vector<DSample> training_samples;
 
     RRDSET *queue_stats_rs;
     RRDDIM *queue_stats_queue_size_rd;
@@ -265,7 +273,7 @@ typedef struct {
     RRDDIM *training_results_not_enough_collected_values_rd;
     RRDDIM *training_results_null_acquired_dimension_rd;
     RRDDIM *training_results_chart_under_replication_rd;
-} ml_host_t;
+} ml_training_thread_t;
 
 typedef struct {
     bool enable_anomaly_detection;
@@ -302,6 +310,14 @@ typedef struct {
     std::vector<uint32_t> random_nums;
 
     netdata_thread_t detection_thread;
+    std::atomic<bool> detection_stop;
+
+    size_t num_training_threads;
+
+    std::vector<ml_training_thread_t> training_threads;
+    std::atomic<bool> training_stop;
+
+    bool enable_statistics_charts;
 } ml_config_t;
 
 void ml_config_load(ml_config_t *cfg);
author	vkalintiris <vasilis@netdata.cloud>	2023-04-13 20:29:52 +0300
committer	GitHub <noreply@github.com>	2023-04-13 20:29:52 +0300
commit	003df5f2b76973f898b44742b7e071ff2654343a (patch)
tree	46183f6a35edb887ab8f3de8a1d6e398438a6a0f /ml/ml-private.h
parent	40f69dc20f7e40b7155d29a3f735ec4af29f4865 (diff)