Refactor ML code and add support for multiple KMeans models. (#14065)

* Add profile.plugin. Creates the specified number of charts/dimensions, and supports backfilling with pseudo-historical data.
* Bump
* Remove wrongly merged line.
* Use the number of models specified from the config section.
* Add option to consult all ML models.
* Remove profiling option consuming all models.
* Add underscore after chart name prefix.
* prediction -> dimensions chart
* reorder funcs
* Split charts across types with correct priority
* Ignore training request when chart is under replication.
* Track global number of models consulted.
* Cleanup config.
* initial readme updates
* fix readme
* readme
* Fix function definition when ML is disabled.
* Add dummy ml_chart_update_{begin,end}
* Remove profile_plugin
* Define chart priorities under collectors/all.h
* s/curr_t/current_time/

Co-authored-by: Andrew Maguire <andrewm4894@gmail.com>
author: vkalintiris <vasilis@netdata.cloud> 2022-12-21 15:03:05 +0200
committer: GitHub <noreply@github.com> 2022-12-21 15:03:05 +0200
commit: 689dc6b7fbbf495ce3e020dcff0d014a8d338c52 (patch)
tree: 1f458a3218798b53809d0868a82fcad1a20b1e32 /ml
parent: fe386aad57f24574783f4c68bab433a5cdfe6f64 (diff)
19 files changed, 1314 insertions, 719 deletions
diff --git a/ml/ADCharts.cc b/ml/ADCharts.cc
index 00c593c0c4..49816f8f4b 100644
--- a/ml/ADCharts.cc
+++ b/ml/ADCharts.cc
@@ -3,55 +3,182 @@
 #include "ADCharts.h"
 #include "Config.h"
 
-void ml::updateDimensionsChart(RRDHOST *RH,
-                               collected_number NumTrainedDimensions,
-                               collected_number NumNormalDimensions,
-                               collected_number NumAnomalousDimensions) {
-    static thread_local RRDSET *RS = nullptr;
-    static thread_local RRDDIM *NumTotalDimensionsRD = nullptr;
-    static thread_local RRDDIM *NumTrainedDimensionsRD = nullptr;
-    static thread_local RRDDIM *NumNormalDimensionsRD = nullptr;
-    static thread_local RRDDIM *NumAnomalousDimensionsRD = nullptr;
-
-    if (!RS) {
-        std::stringstream IdSS, NameSS;
+void ml::updateDimensionsChart(RRDHOST *RH, const MachineLearningStats &MLS) {
+    /*
+     * Machine learning status: dimensions with ML enabled, disabled-ue (unique update_every) or disabled-sp (excluded chart)
+    */
+    {
+        static thread_local RRDSET *MachineLearningStatusRS = nullptr;
+
+        static thread_local RRDDIM *Enabled = nullptr;
+        static thread_local RRDDIM *DisabledUE = nullptr;
+        static thread_local RRDDIM *DisabledSP = nullptr;
+
+        if (!MachineLearningStatusRS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "machine_learning_status_for_" << localhost->machine_guid;
+            NameSS << "machine_learning_status_for_" << localhost->hostname;
+
+            MachineLearningStatusRS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.machine_learning_status", // ctx
+                "Machine learning status", // title
+                "dimensions", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_MACHINE_LEARNING_STATUS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE // chart_type
+            );
+            rrdset_flag_set(MachineLearningStatusRS , RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Enabled = rrddim_add(MachineLearningStatusRS, "enabled", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            DisabledUE = rrddim_add(MachineLearningStatusRS, "disabled-ue", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            DisabledSP = rrddim_add(MachineLearningStatusRS, "disabled-sp", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(MachineLearningStatusRS, Enabled, MLS.NumMachineLearningStatusEnabled);
+        rrddim_set_by_pointer(MachineLearningStatusRS, DisabledUE, MLS.NumMachineLearningStatusDisabledUE);
+        rrddim_set_by_pointer(MachineLearningStatusRS, DisabledSP, MLS.NumMachineLearningStatusDisabledSP);
+
+        rrdset_done(MachineLearningStatusRS);
+    }
 
-        IdSS << "dimensions_on_" << localhost->machine_guid;
-        NameSS << "dimensions_on_" << localhost->hostname;
+    /*
+     * Metric type: number of dimensions counted as constant vs. variable
+    */
+    {
+        static thread_local RRDSET *MetricTypesRS = nullptr;
+
+        static thread_local RRDDIM *Constant = nullptr;
+        static thread_local RRDDIM *Variable = nullptr;
+
+        if (!MetricTypesRS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "metric_types_for_" << localhost->machine_guid;
+            NameSS << "metric_types_for_" << localhost->hostname;
+
+            MetricTypesRS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.metric_types", // ctx
+                "Dimensions by metric type", // title
+                "dimensions", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_METRIC_TYPES, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE // chart_type
+            );
+            rrdset_flag_set(MetricTypesRS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Constant = rrddim_add(MetricTypesRS, "constant", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            Variable = rrddim_add(MetricTypesRS, "variable", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(MetricTypesRS, Constant, MLS.NumMetricTypeConstant);
+        rrddim_set_by_pointer(MetricTypesRS, Variable, MLS.NumMetricTypeVariable);
+
+        rrdset_done(MetricTypesRS);
+    }
 
-        RS = rrdset_create(
-            RH,
-            "anomaly_detection", // type
-            IdSS.str().c_str(), // id
-            NameSS.str().c_str(), // name
-            "dimensions", // family
-            "anomaly_detection.dimensions", // ctx
-            "Anomaly detection dimensions", // title
-            "dimensions", // units
-            "netdata", // plugin
-            "ml", // module
-            39183, // priority
-            RH->rrd_update_every, // update_every
-            RRDSET_TYPE_LINE // chart_type
-        );
-        rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
-
-        NumTotalDimensionsRD = rrddim_add(RS, "total", NULL,
-                1, 1, RRD_ALGORITHM_ABSOLUTE);
-        NumTrainedDimensionsRD = rrddim_add(RS, "trained", NULL,
-                1, 1, RRD_ALGORITHM_ABSOLUTE);
-        NumNormalDimensionsRD = rrddim_add(RS, "normal", NULL,
-                1, 1, RRD_ALGORITHM_ABSOLUTE);
-        NumAnomalousDimensionsRD = rrddim_add(RS, "anomalous", NULL,
-                1, 1, RRD_ALGORITHM_ABSOLUTE);
+    /*
+     * Training status: dimensions that are untrained, pending-without-model, trained or pending-with-model
+    */
+    {
+        static thread_local RRDSET *TrainingStatusRS = nullptr;
+
+        static thread_local RRDDIM *Untrained = nullptr;
+        static thread_local RRDDIM *PendingWithoutModel = nullptr;
+        static thread_local RRDDIM *Trained = nullptr;
+        static thread_local RRDDIM *PendingWithModel = nullptr;
+
+        if (!TrainingStatusRS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "training_status_for_" << localhost->machine_guid;
+            NameSS << "training_status_for_" << localhost->hostname;
+
+            TrainingStatusRS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.training_status", // ctx
+                "Training status of dimensions", // title
+                "dimensions", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_TRAINING_STATUS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE // chart_type
+            );
+
+            rrdset_flag_set(TrainingStatusRS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Untrained = rrddim_add(TrainingStatusRS, "untrained", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            PendingWithoutModel = rrddim_add(TrainingStatusRS, "pending-without-model", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            Trained = rrddim_add(TrainingStatusRS, "trained", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            PendingWithModel = rrddim_add(TrainingStatusRS, "pending-with-model", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(TrainingStatusRS, Untrained, MLS.NumTrainingStatusUntrained);
+        rrddim_set_by_pointer(TrainingStatusRS, PendingWithoutModel, MLS.NumTrainingStatusPendingWithoutModel);
+        rrddim_set_by_pointer(TrainingStatusRS, Trained, MLS.NumTrainingStatusTrained);
+        rrddim_set_by_pointer(TrainingStatusRS, PendingWithModel, MLS.NumTrainingStatusPendingWithModel);
+
+        rrdset_done(TrainingStatusRS);
     }
 
-    rrddim_set_by_pointer(RS, NumTotalDimensionsRD, NumNormalDimensions + NumAnomalousDimensions);
-    rrddim_set_by_pointer(RS, NumTrainedDimensionsRD, NumTrainedDimensions);
-    rrddim_set_by_pointer(RS, NumNormalDimensionsRD, NumNormalDimensions);
-    rrddim_set_by_pointer(RS, NumAnomalousDimensionsRD, NumAnomalousDimensions);
+    /*
+     * Prediction status: number of dimensions counted as anomalous vs. normal
+    */
+    {
+        static thread_local RRDSET *PredictionRS = nullptr;
+
+        static thread_local RRDDIM *Anomalous = nullptr;
+        static thread_local RRDDIM *Normal = nullptr;
+
+        if (!PredictionRS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "dimensions_on_" << localhost->machine_guid;
+            NameSS << "dimensions_on_" << localhost->hostname;
+
+            PredictionRS = rrdset_create(
+                RH,
+                "anomaly_detection", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "dimensions", // family
+                "anomaly_detection.dimensions", // ctx
+                "Anomaly detection dimensions", // title
+                "dimensions", // units
+                "netdata", // plugin
+                "ml", // module
+                ML_CHART_PRIO_DIMENSIONS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE // chart_type
+            );
+            rrdset_flag_set(PredictionRS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Anomalous = rrddim_add(PredictionRS, "anomalous", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            Normal = rrddim_add(PredictionRS, "normal", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(PredictionRS, Anomalous, MLS.NumAnomalousDimensions);
+        rrddim_set_by_pointer(PredictionRS, Normal, MLS.NumNormalDimensions);
+
+        rrdset_done(PredictionRS);
+    }
 
-    rrdset_done(RS);
 }
 
 void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyRate) {
@@ -75,7 +202,7 @@ void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyR
             "percentage", // units
             "netdata", // plugin
             "ml", // module
-            39184, // priority
+            ML_CHART_PRIO_ANOMALY_RATE, // priority
             RH->rrd_update_every, // update_every
             RRDSET_TYPE_LINE // chart_type
         );
@@ -109,7 +236,7 @@ void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyR
             "percentage", // units
             "netdata", // plugin
             "ml", // module
-            39185, // priority
+            ML_CHART_PRIO_DETECTOR_EVENTS, // priority
             RH->rrd_update_every, // update_every
             RRDSET_TYPE_LINE // chart_type
         );
@@ -143,6 +270,7 @@ void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyR
             0, /* tier */
             QUERY_SOURCE_ML
     );
+
     if(R) {
         assert(R->d == 1 && R->n == 1 && R->rows == 1);
 
@@ -157,77 +285,227 @@ void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyR
 
         rrdr_free(OWA, R);
     }
+
     onewayalloc_destroy(OWA);
 }
 
-void ml::updateDetectionChart(RRDHOST *RH) {
-    static thread_local RRDSET *RS = nullptr;
-    static thread_local RRDDIM *UserRD, *SystemRD = nullptr;
-
-    if (!RS) {
-        std::stringstream IdSS, NameSS;
-
-        IdSS << "prediction_stats_" << RH->machine_guid;
-        NameSS << "prediction_stats_for_" << RH->hostname;
-
-        RS = rrdset_create_localhost(
-            "netdata", // type
-            IdSS.str().c_str(), // id
-            NameSS.str().c_str(), // name
-            "ml", // family
-            "netdata.prediction_stats", // ctx
-            "Prediction thread CPU usage", // title
-            "milliseconds/s", // units
-            "netdata", // plugin
-            "ml", // module
-            136000, // priority
-            RH->rrd_update_every, // update_every
-            RRDSET_TYPE_STACKED // chart_type
-        );
-
-        UserRD = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
-        SystemRD = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+void ml::updateResourceUsageCharts(RRDHOST *RH, const struct rusage &PredictionRU, const struct rusage &TrainingRU) {
+    /*
+     * Prediction resource usage: user/system CPU time read from PredictionRU (ru_utime / ru_stime)
+    */
+    {
+        static thread_local RRDSET *RS = nullptr;
+
+        static thread_local RRDDIM *User = nullptr;
+        static thread_local RRDDIM *System = nullptr;
+
+        if (!RS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "prediction_usage_for_" << localhost->machine_guid;
+            NameSS << "prediction_usage_for_" << localhost->hostname;
+
+            RS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.prediction_usage", // ctx
+                "Prediction resource usage", // title
+                "milliseconds/s", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_PREDICTION_USAGE, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_STACKED // chart_type
+            );
+            rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            User = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+            System = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+        }
+
+        rrddim_set_by_pointer(RS, User, PredictionRU.ru_utime.tv_sec * 1000000ULL + PredictionRU.ru_utime.tv_usec);
+        rrddim_set_by_pointer(RS, System, PredictionRU.ru_stime.tv_sec * 1000000ULL + PredictionRU.ru_stime.tv_usec);
+
+        rrdset_done(RS);
     }
 
-    struct rusage TRU;
-    getrusage(RUSAGE_THREAD, &TRU);
-
-    rrddim_set_by_pointer(RS, UserRD, TRU.ru_utime.tv_sec * 1000000ULL + TRU.ru_utime.tv_usec);
-    rrddim_set_by_pointer(RS, SystemRD, TRU.ru_stime.tv_sec * 1000000ULL + TRU.ru_stime.tv_usec);
-    rrdset_done(RS);
+    /*
+     * Training resource usage: user/system CPU time read from TrainingRU (ru_utime / ru_stime)
+    */
+    {
+        static thread_local RRDSET *RS = nullptr;
+
+        static thread_local RRDDIM *User = nullptr;
+        static thread_local RRDDIM *System = nullptr;
+
+        if (!RS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "training_usage_for_" << localhost->machine_guid;
+            NameSS << "training_usage_for_" << localhost->hostname;
+
+            RS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.training_usage", // ctx
+                "Training resource usage", // title
+                "milliseconds/s", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_TRAINING_USAGE, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_STACKED // chart_type
+            );
+            rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            User = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+            System = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+        }
+
+        rrddim_set_by_pointer(RS, User, TrainingRU.ru_utime.tv_sec * 1000000ULL + TrainingRU.ru_utime.tv_usec);
+        rrddim_set_by_pointer(RS, System, TrainingRU.ru_stime.tv_sec * 1000000ULL + TrainingRU.ru_stime.tv_usec);
+
+        rrdset_done(RS);
+    }
 }
 
-void ml::updateTrainingChart(RRDHOST *RH, struct rusage *TRU) {
-    static thread_local RRDSET *RS = nullptr;
-    static thread_local RRDDIM *UserRD = nullptr;
-    static thread_local RRDDIM *SystemRD = nullptr;
-
-    if (!RS) {
-        std::stringstream IdSS, NameSS;
-
-        IdSS << "training_stats_" << RH->machine_guid;
-        NameSS << "training_stats_for_" << RH->hostname;
-
-        RS = rrdset_create_localhost(
-            "netdata", // type
-            IdSS.str().c_str(), // id
-            NameSS.str().c_str(), // name
-            "ml", // family
-            "netdata.training_stats", // ctx
-            "Training thread CPU usage", // title
-            "milliseconds/s", // units
-            "netdata", // plugin
-            "ml", // module
-            136001, // priority
-            RH->rrd_update_every, // update_every
-            RRDSET_TYPE_STACKED // chart_type
-        );
+void ml::updateTrainingStatisticsChart(RRDHOST *RH, const TrainingStats &TS) {
+    /*
+     * Training queue stats: queue size and number of popped items
+    */
+    {
+        static thread_local RRDSET *RS = nullptr;
+
+        static thread_local RRDDIM *QueueSize = nullptr;
+        static thread_local RRDDIM *PoppedItems = nullptr;
+
+        if (!RS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "queue_stats_for_" << localhost->machine_guid;
+            NameSS << "queue_stats_for_" << localhost->hostname;
+
+            RS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.queue_stats", // ctx
+                "Training queue stats", // title
+                "items", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_QUEUE_STATS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE// chart_type
+            );
+            rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            QueueSize = rrddim_add(RS, "queue_size", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            PoppedItems = rrddim_add(RS, "popped_items", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(RS, QueueSize, TS.QueueSize);
+        rrddim_set_by_pointer(RS, PoppedItems, TS.NumPoppedItems);
+
+        rrdset_done(RS);
+    }
 
-        UserRD = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
-        SystemRD = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+    /*
+     * Training time stats: allotted, consumed and remaining time (divisor 1000, charted in milliseconds)
+    */
+    {
+        static thread_local RRDSET *RS = nullptr;
+
+        static thread_local RRDDIM *Allotted = nullptr;
+        static thread_local RRDDIM *Consumed = nullptr;
+        static thread_local RRDDIM *Remaining = nullptr;
+
+        if (!RS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "training_time_stats_for_" << localhost->machine_guid;
+            NameSS << "training_time_stats_for_" << localhost->hostname;
+
+            RS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.training_time_stats", // ctx
+                "Training time stats", // title
+                "milliseconds", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_TRAINING_TIME_STATS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE// chart_type
+            );
+            rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Allotted = rrddim_add(RS, "allotted", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+            Consumed = rrddim_add(RS, "consumed", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+            Remaining = rrddim_add(RS, "remaining", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(RS, Allotted, TS.AllottedUT);
+        rrddim_set_by_pointer(RS, Consumed, TS.ConsumedUT);
+        rrddim_set_by_pointer(RS, Remaining, TS.RemainingUT);
+
+        rrdset_done(RS);
     }
 
-    rrddim_set_by_pointer(RS, UserRD, TRU->ru_utime.tv_sec * 1000000ULL + TRU->ru_utime.tv_usec);
-    rrddim_set_by_pointer(RS, SystemRD, TRU->ru_stime.tv_sec * 1000000ULL + TRU->ru_stime.tv_usec);
-    rrdset_done(RS);
+    /*
+     * Training result stats: one counter per training result (ok, invalid-queries, not-enough-values, ...)
+    */
+    {
+        static thread_local RRDSET *RS = nullptr;
+
+        static thread_local RRDDIM *Ok  = nullptr;
+        static thread_local RRDDIM *InvalidQueryTimeRange = nullptr;
+        static thread_local RRDDIM *NotEnoughCollectedValues = nullptr;
+        static thread_local RRDDIM *NullAcquiredDimension = nullptr;
+        static thread_local RRDDIM *ChartUnderReplication = nullptr;
+
+        if (!RS) {
+            std::stringstream IdSS, NameSS;
+
+            IdSS << "training_results_for_" << localhost->machine_guid;
+            NameSS << "training_results_for_" << localhost->hostname;
+
+            RS = rrdset_create_localhost(
+                "netdata", // type
+                IdSS.str().c_str(), // id
+                NameSS.str().c_str(), // name
+                "ml", // family
+                "netdata.training_results", // ctx
+                "Training results", // title
+                "events", // units
+                "netdata", // plugin
+                "ml", // module
+                NETDATA_ML_CHART_PRIO_TRAINING_RESULTS, // priority
+                RH->rrd_update_every, // update_every
+                RRDSET_TYPE_LINE// chart_type
+            );
+            rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+            Ok = rrddim_add(RS, "ok", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            InvalidQueryTimeRange = rrddim_add(RS, "invalid-queries", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            NotEnoughCollectedValues = rrddim_add(RS, "not-enough-values", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            NullAcquiredDimension = rrddim_add(RS, "null-acquired-dimensions", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+            ChartUnderReplication = rrddim_add(RS, "chart-under-replication", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        }
+
+        rrddim_set_by_pointer(RS, Ok, TS.TrainingResultOk);
+        rrddim_set_by_pointer(RS, InvalidQueryTimeRange, TS.TrainingResultInvalidQueryTimeRange);
+        rrddim_set_by_pointer(RS, NotEnoughCollectedValues, TS.TrainingResultNotEnoughCollectedValues);
+        rrddim_set_by_pointer(RS, NullAcquiredDimension, TS.TrainingResultNullAcquiredDimension);
+        rrddim_set_by_pointer(RS, ChartUnderReplication, TS.TrainingResultChartUnderReplication);
+
+        rrdset_done(RS);
+    }
 }
diff --git a/ml/ADCharts.h b/ml/ADCharts.h
index 0be324f7d7..ee09669e22 100644
--- a/ml/ADCharts.h
+++ b/ml/ADCharts.h
@@ -3,20 +3,18 @@
 #ifndef ML_ADCHARTS_H
 #define ML_ADCHARTS_H
 
+#include "Stats.h"
 #include "ml-private.h"
 
 namespace ml {
 
-void updateDimensionsChart(RRDHOST *RH,
-                           collected_number NumTrainedDimensions,
-                           collected_number NumNormalDimensions,
-                           collected_number NumAnomalousDimensions);
+void updateDimensionsChart(RRDHOST *RH, const MachineLearningStats &MLS);
 
 void updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyRate);
 
-void updateDetectionChart(RRDHOST *RH);
+void updateResourceUsageCharts(RRDHOST *RH, const struct rusage &PredictionRU, const struct rusage &TrainingRU);
 
-void updateTrainingChart(RRDHOST *RH, struct rusage *TRU);
+void updateTrainingStatisticsChart(RRDHOST *RH, const TrainingStats &TS);
 
 } // namespace ml
 
diff --git a/ml/Chart.cc b/ml/Chart.cc
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/ml/Chart.cc
diff --git a/ml/Chart.h b/ml/Chart.h
new file mode 100644
index 0000000000..c62f4bae38
--- /dev/null
+++ b/ml/Chart.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_CHART_H
+#define ML_CHART_H
+
+#include "Config.h"
+#include "Dimension.h"
+
+#include "ml-private.h"
+#include "json/single_include/nlohmann/json.hpp"
+
+namespace ml
+{
+
+class Chart { // Wraps one RRDSET: its registered ML dimensions plus a MachineLearningStats tally.
+public:
+    Chart(RRDSET *RS) : // Keeps the given RRDSET pointer; MLS is value-initialized.
+        RS(RS),
+        MLS()
+     { }
+    // Returns the RRDSET pointer this object was constructed with.
+    RRDSET *getRS() const {
+        return RS;
+    }
+    // Forwards to rrdset_is_available_for_exporting_and_alarms() for this chart's RRDSET.
+    bool isAvailableForML() {
+        return rrdset_is_available_for_exporting_and_alarms(RS);
+    }
+    // Registers D under its RRDDIM key while holding Mutex; an existing entry for that key is overwritten.
+    void addDimension(Dimension *D) {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        Dimensions[D->getRD()] = D;
+    }
+    // Drops the entry keyed by D's RRDDIM while holding Mutex (no-op if the key is absent).
+    void removeDimension(Dimension *D) {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        Dimensions.erase(D->getRD());
+    }
+    // Under Mutex: for every registered dimension, stores a JSON array of its KMeans models in Json.
+    void getModelsAsJson(nlohmann::json &Json) {
+        std::lock_guard<std::mutex> Lock(Mutex);
+
+        for (auto &DP : Dimensions) {
+            Dimension *D = DP.second;
+            nlohmann::json JsonArray = nlohmann::json::array();
+            for (const KMeans &KM : D->getModels()) {
+                nlohmann::json J;
+                KM.toJson(J);
+                JsonArray.push_back(J);
+            }
+
+            Json[getMLDimensionID(D->getRD())] = JsonArray; // key: getMLDimensionID() of the dimension's RRDDIM
+        }
+    }
+    // Locks Mutex and resets the tally. The lock is NOT released here; it stays held until updateEnd().
+    void updateBegin() {
+        Mutex.lock();
+        MLS = {};
+    }
+    // Adds D to the tally. Takes no lock itself. NOTE(review): presumably called between updateBegin()/updateEnd() - confirm.
+    void updateDimension(Dimension *D, bool IsAnomalous) {
+        switch (D->getMLS()) {
+            case MachineLearningStatus::DisabledDueToUniqueUpdateEvery:
+                MLS.NumMachineLearningStatusDisabledUE++;
+                return; // disabled dimensions only bump their status counter
+            case MachineLearningStatus::DisabledDueToExcludedChart:
+                MLS.NumMachineLearningStatusDisabledSP++;
+                return;
+            case MachineLearningStatus::Enabled: {
+                MLS.NumMachineLearningStatusEnabled++;
+
+                switch (D->getMT()) {
+                    case MetricType::Constant:
+                        MLS.NumMetricTypeConstant++;
+                        MLS.NumTrainingStatusTrained++;
+                        MLS.NumNormalDimensions++;
+                        return; // constant metrics count as trained + normal; IsAnomalous is not consulted
+                    case MetricType::Variable:
+                        MLS.NumMetricTypeVariable++;
+                        break; // continue with the training-status tally below
+                }
+
+                switch (D->getTS()) {
+                    case TrainingStatus::Untrained:
+                        MLS.NumTrainingStatusUntrained++;
+                        return; // counted as neither normal nor anomalous
+                    case TrainingStatus::PendingWithoutModel:
+                        MLS.NumTrainingStatusPendingWithoutModel++;
+                        return; // counted as neither normal nor anomalous
+                    case TrainingStatus::Trained:
+                        MLS.NumTrainingStatusTrained++;
+
+                        MLS.NumAnomalousDimensions += IsAnomalous;
+                        MLS.NumNormalDimensions += !IsAnomalous;
+                        return;
+                    case TrainingStatus::PendingWithModel:
+                        MLS.NumTrainingStatusPendingWithModel++;
+
+                        MLS.NumAnomalousDimensions += IsAnomalous;
+                        MLS.NumNormalDimensions += !IsAnomalous;
+                        return;
+                }
+
+                return;
+            }
+        }
+    }
+    // Releases the lock taken by updateBegin().
+    void updateEnd() {
+        Mutex.unlock();
+    }
+    // Returns a copy of the tally, taken while holding Mutex.
+    MachineLearningStats getMLS() {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        return MLS;
+    }
+
+private:
+    RRDSET *RS; // set once in the constructor
+    MachineLearningStats MLS; // reset in updateBegin(), filled by updateDimension()
+
+    std::mutex Mutex; // held by add/removeDimension, getModelsAsJson, getMLS and across updateBegin()..updateEnd()
+    std::unordered_map<RRDDIM *, Dimension *> Dimensions; // registered dimensions keyed by their RRDDIM pointer
+};
+
+} // namespace ml
+
+#endif /* ML_CHART_H */
diff --git a/ml/Config.cc b/ml/Config.cc
index eedd8c29fd..ba3a614452 100644
--- a/ml/Config.cc
+++ b/ml/Config.cc
@@ -31,7 +31,7 @@ void Config::readMLConfig(void) {
     unsigned MaxTrainSamples = config_get_number(ConfigSectionML, "maximum num samples to train", 4 * 3600);
     unsigned MinTrainSamples = config_get_number(ConfigSectionML, "minimum num samples to train", 1 * 900);
     unsigned TrainEvery = config_get_number(ConfigSectionML, "train every", 1 * 3600);
-    unsigned NumModelsToUse = config_get_number(ConfigSectionML, "number of models per dimension", 1 * 24);
+    unsigned NumModelsToUse = config_get_number(ConfigSectionML, "number of models per dimension", 1);
 
     unsigned DiffN = config_get_number(ConfigSectionML, "num samples to diff", 1);
     unsigned SmoothN = config_get_number(ConfigSectionML, "num samples to smooth", 3);
@@ -53,7 +53,7 @@ void Config::readMLConfig(void) {
     MaxTrainSamples = clamp<unsigned>(MaxTrainSamples, 1 * 3600, 24 * 3600);
     MinTrainSamples = clamp<unsigned>(MinTrainSamples, 1 * 900, 6 * 3600);
     TrainEvery = clamp<unsigned>(TrainEvery, 1 * 3600, 6 * 3600);
-    NumModelsToUse = clamp<unsigned>(TrainEvery, 1, 7 * 24);
+    NumModelsToUse = clamp<unsigned>(NumModelsToUse, 1, 7 * 24);
 
     DiffN = clamp(DiffN, 0u, 1u);
     SmoothN = clamp(SmoothN, 0u, 5u);
@@ -108,7 +108,7 @@ void Config::readMLConfig(void) {
     // Always exclude anomaly_detection charts from training.
     Cfg.ChartsToSkip = "anomaly_detection.* ";
     Cfg.ChartsToSkip += config_get(ConfigSectionML, "charts to skip from training", "netdata.*");
-    Cfg.SP_ChartsToSkip = simple_pattern_create(ChartsToSkip.c_str(), NULL, SIMPLE_PATTERN_EXACT);
+    Cfg.SP_ChartsToSkip = simple_pattern_create(Cfg.ChartsToSkip.c_str(), NULL, SIMPLE_PATTERN_EXACT);
 
     Cfg.StreamADCharts = config_get_boolean(ConfigSectionML, "stream anomaly detection charts", true);
 }
diff --git a/ml/Config.h b/ml/Config.h
index d876d4aa41..f10e114926 100644
--- a/ml/Config.h
+++ b/ml/Config.h
@@ -14,6 +14,7 @@ public:
     unsigned MaxTrainSamples;
     unsigned MinTrainSamples;
     unsigned TrainEvery;
+
     unsigned NumModelsToUse;
 
     unsigned DBEngineAnomalyRateEvery;
diff --git a/ml/Dimension.cc b/ml/Dimension.cc
index bf34abb72f..c2195f175d 100644
--- a/ml/Dimension.cc
+++ b/ml/Dimension.cc
@@ -3,84 +3,174 @@
 #include "Config.h"
 #include "Dimension.h"
 #include "Query.h"
+#include "Host.h"
 
 using namespace ml;
 
-bool Dimension::isActive() const {
-    bool SetObsolete = rrdset_flag_check(RD->rrdset, RRDSET_FLAG_OBSOLETE);
-    bool DimObsolete = rrddim_flag_check(RD, RRDDIM_FLAG_OBSOLETE);
-    return !SetObsolete && !DimObsolete;
+static const char *mls2str(MachineLearningStatus MLS) {
+    switch (MLS) {
+        case ml::MachineLearningStatus::Enabled:
+            return "enabled";
+        case ml::MachineLearningStatus::DisabledDueToUniqueUpdateEvery:
+            return "disabled-ue";
author	vkalintiris <vasilis@netdata.cloud>	2022-12-21 15:03:05 +0200
committer	GitHub <noreply@github.com>	2022-12-21 15:03:05 +0200
commit	689dc6b7fbbf495ce3e020dcff0d014a8d338c52 (patch)
tree	1f458a3218798b53809d0868a82fcad1a20b1e32 /ml
parent	fe386aad57f24574783f4c68bab433a5cdfe6f64 (diff)