summaryrefslogtreecommitdiffstats
path: root/ml/Host.cc
diff options
context:
space:
mode:
authorvkalintiris <vasilis@netdata.cloud>2022-02-24 10:57:30 +0200
committerGitHub <noreply@github.com>2022-02-24 10:57:30 +0200
commit69ea17d6ec534e1ed796a92fd042bd76a3ca9215 (patch)
tree4345e3405b2ac1e37a9be1615d6dad799e15327d /ml/Host.cc
parent8756eb80c77caf411f7fae605d3f1bb03dd60b76 (diff)
Track anomaly rates with DBEngine. (#12083)
* Track anomaly rates with DBEngine. This commit adds support for tracking anomaly rates with DBEngine. We do so by creating a single chart with id "anomaly_detection.anomaly_rates" for each trainable/predictable host, which is responsible for tracking the anomaly rate of each dimension that we train/predict for that host. The rrdset->state->is_ar_chart boolean flag is set to true only for anomaly rates charts. We use this flag to: - Disable exposing the anomaly rates charts through the functionality in backends/, exporting/ and streaming/. - Skip generation of configuration options for the name, algorithm, multiplier, divisor of each dimension in an anomaly rates chart. - Skip the creation of health variables for anomaly rates dimensions. - Skip the chart/dim queue of ACLK. - Post-process the RRDR result of an anomaly rates chart, so that we can return a sorted, trimmed number of anomalous dimensions. In a child/parent configuration where both the child and the parent run ML for the child, we want to be able to stream the rest of the ML-related charts to the parent. To be able to do this without any chart name collisions, the charts are now created on localhost and their IDs and titles have the node's machine_guid and hostname as a suffix, respectively. * Fix exporting_engine tests. * Restore default ML configuration. The reverted changes were meant for local testing only. This commit restores the default values that we want to have when someone runs anomaly detection on their node. * Set context for anomaly_detection.* charts. * Check for anomaly rates chart only with a valid pointer. * Remove duplicate code. * Use a more descriptive name for id/title pair variable.
Diffstat (limited to 'ml/Host.cc')
-rw-r--r--ml/Host.cc128
1 files changed, 87 insertions, 41 deletions
diff --git a/ml/Host.cc b/ml/Host.cc
index b632710a44..a1a89f3042 100644
--- a/ml/Host.cc
+++ b/ml/Host.cc
@@ -9,6 +9,17 @@
using namespace ml;
+static std::pair<std::string, std::string>
+getHostSpecificIdAndTitle(RRDHOST *RH, const std::string &IdPrefix,
+ const std::string &TitlePrefix) {
+ std::stringstream IdSS, TitleSS;
+
+ IdSS << IdPrefix << "_" << RH->machine_guid;
+ TitleSS << TitlePrefix << " " << RH->hostname;
+
+ return {IdSS.str(), TitleSS.str()};
+}
+
static void updateDimensionsChart(RRDHOST *RH,
collected_number NumTrainedDimensions,
collected_number NumNormalDimensions,
@@ -20,14 +31,17 @@ static void updateDimensionsChart(RRDHOST *RH,
static thread_local RRDDIM *NumAnomalousDimensionsRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "dimensions";
+ std::string TitlePrefix = "Anomaly detection dimensions for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "dimensions", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"dimensions", // family
- NULL, // ctx
- "Anomaly detection dimensions", // title
+ "anomaly_detection.dimensions", // ctx
+ IdTitlePair.second.c_str(), // title
"dimensions", // units
"netdata", // plugin
"ml", // module
@@ -60,14 +74,17 @@ static void updateRateChart(RRDHOST *RH, collected_number AnomalyRate) {
static thread_local RRDDIM *AnomalyRateRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "anomaly_rate";
+ std::string TitlePrefix = "Percentage of anomalous dimensions for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "anomaly_rate", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"anomaly_rate", // family
- NULL, // ctx
- "Percentage of anomalous dimensions", // title
+ "anomaly_detection.anomaly_rate", // ctx
+ IdTitlePair.second.c_str(), // title
"percentage", // units
"netdata", // plugin
"ml", // module
@@ -91,14 +108,17 @@ static void updateWindowLengthChart(RRDHOST *RH, collected_number WindowLength)
static thread_local RRDDIM *WindowLengthRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "detector_window";
+ std::string TitlePrefix = "Anomaly detector window length for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "detector_window", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"detector_window", // family
- NULL, // ctx
- "Anomaly detector window length", // title
+ "anomaly_detection.detector_window", // ctx
+ IdTitlePair.second.c_str(), // title
"seconds", // units
"netdata", // plugin
"ml", // module
@@ -126,14 +146,17 @@ static void updateEventsChart(RRDHOST *RH,
static thread_local RRDDIM *NewAnomalyEventRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "detector_events";
+ std::string TitlePrefix = "Anomaly events triggered for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "detector_events", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"detector_events", // family
- NULL, // ctx
- "Anomaly events triggered", // title
+ "anomaly_detection.detector_events", // ctx
+ IdTitlePair.second.c_str(), // title
"boolean", // units
"netdata", // plugin
"ml", // module
@@ -166,14 +189,17 @@ static void updateDetectionChart(RRDHOST *RH, collected_number PredictionDuratio
static thread_local RRDDIM *PredictiobDurationRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "prediction_stats";
+ std::string TitlePrefix = "Time it took to run prediction for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "prediction_stats", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"prediction_stats", // family
- NULL, // ctx
- "Time it took to run prediction", // title
+ "anomaly_detection.prediction_stats", // ctx
+ IdTitlePair.second.c_str(), // title
"milliseconds", // units
"netdata", // plugin
"ml", // module
@@ -201,14 +227,17 @@ static void updateTrainingChart(RRDHOST *RH,
static thread_local RRDDIM *MaxTrainingDurationRD = nullptr;
if (!RS) {
- RS = rrdset_create(
- RH, // host
+ std::string IdPrefix = "training_stats";
+ std::string TitlePrefix = "Training step statistics for host";
+ auto IdTitlePair = getHostSpecificIdAndTitle(RH, IdPrefix, TitlePrefix);
+
+ RS = rrdset_create_localhost(
"anomaly_detection", // type
- "training_stats", // id
+ IdTitlePair.first.c_str(), // id
NULL, // name
"training_stats", // family
- NULL, // ctx
- "Training step statistics", // title
+ "anomaly_detection.training_stats", // ctx
+ IdTitlePair.second.c_str(), // title
"milliseconds", // units
"netdata", // plugin
"ml", // module
@@ -231,12 +260,18 @@ static void updateTrainingChart(RRDHOST *RH,
}
void RrdHost::addDimension(Dimension *D) {
- std::lock_guard<std::mutex> Lock(Mutex);
+ RRDDIM *AnomalyRateRD = rrddim_add(AnomalyRateRS, D->getID().c_str(), NULL,
+ 1, 1000, RRD_ALGORITHM_ABSOLUTE);
+ D->setAnomalyRateRD(AnomalyRateRD);
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
- DimensionsMap[D->getRD()] = D;
+ DimensionsMap[D->getRD()] = D;
- // Default construct mutex for dimension
- LocksMap[D];
+ // Default construct mutex for dimension
+ LocksMap[D];
+ }
}
void RrdHost::removeDimension(Dimension *D) {
@@ -344,7 +379,7 @@ void TrainableHost::train() {
}
void DetectableHost::detectOnce() {
- auto P = BRW.insert(AnomalyRate >= Cfg.HostAnomalyRateThreshold);
+ auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold);
BitRateWindow::Edge Edge = P.first;
size_t WindowLength = P.second;
@@ -361,6 +396,10 @@ void DetectableHost::detectOnce() {
double TotalTrainingDuration = 0.0;
double MaxTrainingDuration = 0.0;
+ bool CollectAnomalyRates = (++AnomalyRateTimer == Cfg.DBEngineAnomalyRateEvery);
+ if (CollectAnomalyRates)
+ rrdset_next(AnomalyRateRS);
+
{
std::lock_guard<std::mutex> Lock(Mutex);
@@ -371,7 +410,7 @@ void DetectableHost::detectOnce() {
auto P = D->detect(WindowLength, ResetBitCounter);
bool IsAnomalous = P.first;
- double AnomalyRate = P.second;
+ double AnomalyScore = P.second;
NumTrainedDimensions += D->isTrained();
@@ -382,24 +421,31 @@ void DetectableHost::detectOnce() {
if (IsAnomalous)
NumAnomalousDimensions += 1;
- if (NewAnomalyEvent && (AnomalyRate >= Cfg.ADDimensionRateThreshold))
- DimsOverThreshold.push_back({ AnomalyRate, D->getID() });
+ if (NewAnomalyEvent && (AnomalyScore >= Cfg.ADDimensionRateThreshold))
+ DimsOverThreshold.push_back({ AnomalyScore, D->getID() });
+
+ D->updateAnomalyBitCounter(AnomalyRateRS, AnomalyRateTimer, IsAnomalous);
}
if (NumAnomalousDimensions)
- AnomalyRate = static_cast<double>(NumAnomalousDimensions) / DimensionsMap.size();
+ WindowAnomalyRate = static_cast<double>(NumAnomalousDimensions) / DimensionsMap.size();
else
- AnomalyRate = 0.0;
+ WindowAnomalyRate = 0.0;
NumNormalDimensions = DimensionsMap.size() - NumAnomalousDimensions;
}
+ if (CollectAnomalyRates) {
+ AnomalyRateTimer = 0;
+ rrdset_done(AnomalyRateRS);
+ }
+
this->NumAnomalousDimensions = NumAnomalousDimensions;
this->NumNormalDimensions = NumNormalDimensions;
this->NumTrainedDimensions = NumTrainedDimensions;
updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions);
- updateRateChart(getRH(), AnomalyRate * 10000.0);
+ updateRateChart(getRH(), WindowAnomalyRate * 10000.0);
updateWindowLengthChart(getRH(), WindowLength);
updateEventsChart(getRH(), P, ResetBitCounter, NewAnomalyEvent);
updateTrainingChart(getRH(), TotalTrainingDuration * 1000.0, MaxTrainingDuration * 1000.0);