summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: vkalintiris <vasilis@netdata.cloud> 2023-05-10 11:40:19 +0300
committer: GitHub <noreply@github.com> 2023-05-10 11:40:19 +0300
commit: 216f91b91f24467b6379435e99438df104bebfda (patch)
tree: 4d42f611c711d5d4874d8fea1b3ef3ede8bb771c
parent: a2d9b45ab0ab92d09f8bec8666e8d75d128c92e3 (diff)
Silence dimensions with noisy ML models (#15011)
* Add suppression options. * Silence noisy dimensions
-rw-r--r-- ml/Config.cc 11
-rw-r--r-- ml/ad_charts.cc 4
-rw-r--r-- ml/ml-private.h 11
-rw-r--r-- ml/ml.cc 27
4 files changed, 52 insertions, 1 deletions
diff --git a/ml/Config.cc b/ml/Config.cc
index d451c602c2..e82fa714d5 100644
--- a/ml/Config.cc
+++ b/ml/Config.cc
@@ -46,6 +46,9 @@ void ml_config_load(ml_config_t *cfg) {
size_t num_training_threads = config_get_number(config_section_ml, "num training threads", 4);
size_t flush_models_batch_size = config_get_number(config_section_ml, "flush models batch size", 128);
+ size_t suppression_window = config_get_number(config_section_ml, "dimension anomaly rate suppression window", 1800);
+ size_t suppression_threshold = config_get_number(config_section_ml, "dimension anomaly rate suppression threshold", suppression_window / 2);
+
bool enable_statistics_charts = config_get_boolean(config_section_ml, "enable statistics charts", true);
/*
@@ -72,7 +75,10 @@ void ml_config_load(ml_config_t *cfg) {
num_training_threads = clamp<size_t>(num_training_threads, 1, 128);
flush_models_batch_size = clamp<size_t>(flush_models_batch_size, 8, 512);
- /*
+ suppression_window = clamp<size_t>(suppression_window, 1, max_train_samples);
+ suppression_threshold = clamp<size_t>(suppression_threshold, 1, suppression_window);
+
+ /*
* Validate
*/
@@ -121,5 +127,8 @@ void ml_config_load(ml_config_t *cfg) {
cfg->num_training_threads = num_training_threads;
cfg->flush_models_batch_size = flush_models_batch_size;
+ cfg->suppression_window = suppression_window;
+ cfg->suppression_threshold = suppression_threshold;
+
cfg->enable_statistics_charts = enable_statistics_charts;
}
diff --git a/ml/ad_charts.cc b/ml/ad_charts.cc
index 086cd5aa02..bd065cfcc4 100644
--- a/ml/ad_charts.cc
+++ b/ml/ad_charts.cc
@@ -124,6 +124,8 @@ void ml_update_dimensions_chart(ml_host_t *host, const ml_machine_learning_stats
rrddim_add(host->training_status_rs, "trained", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
host->training_status_pending_with_model_rd =
rrddim_add(host->training_status_rs, "pending-with-model", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ host->training_status_silenced_rd =
+ rrddim_add(host->training_status_rs, "silenced", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(host->training_status_rs,
@@ -134,6 +136,8 @@ void ml_update_dimensions_chart(ml_host_t *host, const ml_machine_learning_stats
host->training_status_trained_rd, mls.num_training_status_trained);
rrddim_set_by_pointer(host->training_status_rs,
host->training_status_pending_with_model_rd, mls.num_training_status_pending_with_model);
+ rrddim_set_by_pointer(host->training_status_rs,
+ host->training_status_silenced_rd, mls.num_training_status_silenced);
rrdset_done(host->training_status_rs);
}
diff --git a/ml/ml-private.h b/ml/ml-private.h
index 327cc59a2a..2ed70d1ca9 100644
--- a/ml/ml-private.h
+++ b/ml/ml-private.h
@@ -55,6 +55,7 @@ typedef struct machine_learning_stats_t {
size_t num_training_status_pending_without_model;
size_t num_training_status_trained;
size_t num_training_status_pending_with_model;
+ size_t num_training_status_silenced;
size_t num_anomalous_dimensions;
size_t num_normal_dimensions;
@@ -103,6 +104,9 @@ enum ml_training_status {
// Have a valid, up-to-date model
TRAINING_STATUS_TRAINED,
+
+ // Have a valid, up-to-date model that is silenced because it's too noisy
+ TRAINING_STATUS_SILENCED,
};
enum ml_training_result {
@@ -194,6 +198,9 @@ typedef struct {
netdata_mutex_t mutex;
ml_kmeans_t kmeans;
std::vector<DSample> feature;
+
+ uint32_t suppression_window_counter;
+ uint32_t suppression_anomaly_counter;
} ml_dimension_t;
typedef struct {
@@ -233,6 +240,7 @@ typedef struct {
RRDDIM *training_status_pending_without_model_rd;
RRDDIM *training_status_trained_rd;
RRDDIM *training_status_pending_with_model_rd;
+ RRDDIM *training_status_silenced_rd;
RRDSET *dimensions_rs;
RRDDIM *dimensions_anomalous_rd;
@@ -325,6 +333,9 @@ typedef struct {
std::vector<ml_training_thread_t> training_threads;
std::atomic<bool> training_stop;
+ size_t suppression_window;
+ size_t suppression_threshold;
+
bool enable_statistics_charts;
} ml_config_t;
diff --git a/ml/ml.cc b/ml/ml.cc
index 999f7f7e66..34f2b93bdc 100644
--- a/ml/ml.cc
+++ b/ml/ml.cc
@@ -63,6 +63,8 @@ ml_training_status_to_string(enum ml_training_status ts)
return "trained";
case TRAINING_STATUS_UNTRAINED:
return "untrained";
+ case TRAINING_STATUS_SILENCED:
+ return "silenced";
default:
return "unknown";
}
@@ -679,6 +681,8 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t *
break;
}
+ dim->suppression_anomaly_counter = 0;
+ dim->suppression_window_counter = 0;
dim->tr = training_response;
dim->last_training_time = training_response.last_entry_on_response;
@@ -735,6 +739,10 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t *
dim->mt = METRIC_TYPE_CONSTANT;
dim->ts = TRAINING_STATUS_TRAINED;
+
+ dim->suppression_anomaly_counter = 0;
+ dim->suppression_window_counter = 0;
+
dim->tr = training_response;
dim->last_training_time = rrddim_last_entry_s(dim->rd);
@@ -771,6 +779,7 @@ ml_dimension_schedule_for_training(ml_dimension_t *dim, time_t curr_time)
schedule_for_training = true;
dim->ts = TRAINING_STATUS_PENDING_WITHOUT_MODEL;
break;
+ case TRAINING_STATUS_SILENCED:
case TRAINING_STATUS_TRAINED:
if ((dim->last_training_time + (Cfg.train_every * dim->rd->update_every)) < curr_time) {
schedule_for_training = true;
@@ -856,6 +865,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
switch (dim->ts) {
case TRAINING_STATUS_UNTRAINED:
case TRAINING_STATUS_PENDING_WITHOUT_MODEL: {
+ case TRAINING_STATUS_SILENCED:
netdata_mutex_unlock(&dim->mutex);
return false;
}
@@ -863,6 +873,8 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
break;
}
+ dim->suppression_window_counter++;
+
/*
* Use the KMeans models to check if the value is anomalous
*/
@@ -886,6 +898,13 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
sum += 1;
}
+ dim->suppression_anomaly_counter += sum ? 1 : 0;
+
+ if ((dim->suppression_anomaly_counter >= Cfg.suppression_threshold) &&
+ (dim->suppression_window_counter >= Cfg.suppression_window)) {
+ dim->ts = TRAINING_STATUS_SILENCED;
+ }
+
netdata_mutex_unlock(&dim->mutex);
global_statistics_ml_models_consulted(models_consulted);
@@ -942,6 +961,13 @@ ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomal
chart->mls.num_anomalous_dimensions += is_anomalous;
chart->mls.num_normal_dimensions += !is_anomalous;
return;
+ case TRAINING_STATUS_SILENCED:
+ chart->mls.num_training_status_silenced++;
+ chart->mls.num_training_status_trained++;
+
+ chart->mls.num_anomalous_dimensions += is_anomalous;
+ chart->mls.num_normal_dimensions += !is_anomalous;
+ return;
}
return;
@@ -995,6 +1021,7 @@ ml_host_detect_once(ml_host_t *host)
host->mls.num_training_status_pending_without_model += chart_mls.num_training_status_pending_without_model;
host->mls.num_training_status_trained += chart_mls.num_training_status_trained;
host->mls.num_training_status_pending_with_model += chart_mls.num_training_status_pending_with_model;
+ host->mls.num_training_status_silenced += chart_mls.num_training_status_silenced;
host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions;
host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions;