diff options
author | vkalintiris <vasilis@netdata.cloud> | 2023-05-10 11:40:19 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-10 11:40:19 +0300 |
commit | 216f91b91f24467b6379435e99438df104bebfda (patch) | |
tree | 4d42f611c711d5d4874d8fea1b3ef3ede8bb771c | |
parent | a2d9b45ab0ab92d09f8bec8666e8d75d128c92e3 (diff) |
Silence dimensions with noisy ML models (#15011)
* Add suppression options.
* Silence noisy dimensions
-rw-r--r-- | ml/Config.cc | 11 | ||||
-rw-r--r-- | ml/ad_charts.cc | 4 | ||||
-rw-r--r-- | ml/ml-private.h | 11 | ||||
-rw-r--r-- | ml/ml.cc | 27 |
4 files changed, 52 insertions, 1 deletion
diff --git a/ml/Config.cc b/ml/Config.cc index d451c602c2..e82fa714d5 100644 --- a/ml/Config.cc +++ b/ml/Config.cc @@ -46,6 +46,9 @@ void ml_config_load(ml_config_t *cfg) { size_t num_training_threads = config_get_number(config_section_ml, "num training threads", 4); size_t flush_models_batch_size = config_get_number(config_section_ml, "flush models batch size", 128); + size_t suppression_window = config_get_number(config_section_ml, "dimension anomaly rate suppression window", 1800); + size_t suppression_threshold = config_get_number(config_section_ml, "dimension anomaly rate suppression threshold", suppression_window / 2); + bool enable_statistics_charts = config_get_boolean(config_section_ml, "enable statistics charts", true); /* @@ -72,7 +75,10 @@ void ml_config_load(ml_config_t *cfg) { num_training_threads = clamp<size_t>(num_training_threads, 1, 128); flush_models_batch_size = clamp<size_t>(flush_models_batch_size, 8, 512); - /* + suppression_window = clamp<size_t>(suppression_window, 1, max_train_samples); + suppression_threshold = clamp<size_t>(suppression_threshold, 1, suppression_window); + + /* * Validate */ @@ -121,5 +127,8 @@ void ml_config_load(ml_config_t *cfg) { cfg->num_training_threads = num_training_threads; cfg->flush_models_batch_size = flush_models_batch_size; + cfg->suppression_window = suppression_window; + cfg->suppression_threshold = suppression_threshold; + cfg->enable_statistics_charts = enable_statistics_charts; } diff --git a/ml/ad_charts.cc b/ml/ad_charts.cc index 086cd5aa02..bd065cfcc4 100644 --- a/ml/ad_charts.cc +++ b/ml/ad_charts.cc @@ -124,6 +124,8 @@ void ml_update_dimensions_chart(ml_host_t *host, const ml_machine_learning_stats rrddim_add(host->training_status_rs, "trained", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); host->training_status_pending_with_model_rd = rrddim_add(host->training_status_rs, "pending-with-model", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + host->training_status_silenced_rd = + rrddim_add(host->training_status_rs, 
"silenced", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); } rrddim_set_by_pointer(host->training_status_rs, @@ -134,6 +136,8 @@ void ml_update_dimensions_chart(ml_host_t *host, const ml_machine_learning_stats host->training_status_trained_rd, mls.num_training_status_trained); rrddim_set_by_pointer(host->training_status_rs, host->training_status_pending_with_model_rd, mls.num_training_status_pending_with_model); + rrddim_set_by_pointer(host->training_status_rs, + host->training_status_silenced_rd, mls.num_training_status_silenced); rrdset_done(host->training_status_rs); } diff --git a/ml/ml-private.h b/ml/ml-private.h index 327cc59a2a..2ed70d1ca9 100644 --- a/ml/ml-private.h +++ b/ml/ml-private.h @@ -55,6 +55,7 @@ typedef struct machine_learning_stats_t { size_t num_training_status_pending_without_model; size_t num_training_status_trained; size_t num_training_status_pending_with_model; + size_t num_training_status_silenced; size_t num_anomalous_dimensions; size_t num_normal_dimensions; @@ -103,6 +104,9 @@ enum ml_training_status { // Have a valid, up-to-date model TRAINING_STATUS_TRAINED, + + // Have a valid, up-to-date model that is silenced because its too noisy + TRAINING_STATUS_SILENCED, }; enum ml_training_result { @@ -194,6 +198,9 @@ typedef struct { netdata_mutex_t mutex; ml_kmeans_t kmeans; std::vector<DSample> feature; + + uint32_t suppression_window_counter; + uint32_t suppression_anomaly_counter; } ml_dimension_t; typedef struct { @@ -233,6 +240,7 @@ typedef struct { RRDDIM *training_status_pending_without_model_rd; RRDDIM *training_status_trained_rd; RRDDIM *training_status_pending_with_model_rd; + RRDDIM *training_status_silenced_rd; RRDSET *dimensions_rs; RRDDIM *dimensions_anomalous_rd; @@ -325,6 +333,9 @@ typedef struct { std::vector<ml_training_thread_t> training_threads; std::atomic<bool> training_stop; + size_t suppression_window; + size_t suppression_threshold; + bool enable_statistics_charts; } ml_config_t; @@ -63,6 +63,8 @@ 
ml_training_status_to_string(enum ml_training_status ts) return "trained"; case TRAINING_STATUS_UNTRAINED: return "untrained"; + case TRAINING_STATUS_SILENCED: + return "silenced"; default: return "unknown"; } @@ -679,6 +681,8 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t * break; } + dim->suppression_anomaly_counter = 0; + dim->suppression_window_counter = 0; dim->tr = training_response; dim->last_training_time = training_response.last_entry_on_response; @@ -735,6 +739,10 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t * dim->mt = METRIC_TYPE_CONSTANT; dim->ts = TRAINING_STATUS_TRAINED; + + dim->suppression_anomaly_counter = 0; + dim->suppression_window_counter = 0; + dim->tr = training_response; dim->last_training_time = rrddim_last_entry_s(dim->rd); @@ -771,6 +779,7 @@ ml_dimension_schedule_for_training(ml_dimension_t *dim, time_t curr_time) schedule_for_training = true; dim->ts = TRAINING_STATUS_PENDING_WITHOUT_MODEL; break; + case TRAINING_STATUS_SILENCED: case TRAINING_STATUS_TRAINED: if ((dim->last_training_time + (Cfg.train_every * dim->rd->update_every)) < curr_time) { schedule_for_training = true; @@ -856,6 +865,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t switch (dim->ts) { case TRAINING_STATUS_UNTRAINED: case TRAINING_STATUS_PENDING_WITHOUT_MODEL: { + case TRAINING_STATUS_SILENCED: netdata_mutex_unlock(&dim->mutex); return false; } @@ -863,6 +873,8 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t break; } + dim->suppression_window_counter++; + /* * Use the KMeans models to check if the value is anomalous */ @@ -886,6 +898,13 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t sum += 1; } + dim->suppression_anomaly_counter += sum ? 
1 : 0; + + if ((dim->suppression_anomaly_counter >= Cfg.suppression_threshold) && + (dim->suppression_window_counter >= Cfg.suppression_window)) { + dim->ts = TRAINING_STATUS_SILENCED; + } + netdata_mutex_unlock(&dim->mutex); global_statistics_ml_models_consulted(models_consulted); @@ -942,6 +961,13 @@ ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomal chart->mls.num_anomalous_dimensions += is_anomalous; chart->mls.num_normal_dimensions += !is_anomalous; return; + case TRAINING_STATUS_SILENCED: + chart->mls.num_training_status_silenced++; + chart->mls.num_training_status_trained++; + + chart->mls.num_anomalous_dimensions += is_anomalous; + chart->mls.num_normal_dimensions += !is_anomalous; + return; } return; @@ -995,6 +1021,7 @@ ml_host_detect_once(ml_host_t *host) host->mls.num_training_status_pending_without_model += chart_mls.num_training_status_pending_without_model; host->mls.num_training_status_trained += chart_mls.num_training_status_trained; host->mls.num_training_status_pending_with_model += chart_mls.num_training_status_pending_with_model; + host->mls.num_training_status_silenced += chart_mls.num_training_status_silenced; host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions; host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions; |