summary | refs | log | tree | commit | diff | stats
path: root/ml
diff options
context:
space:
mode:
author: vkalintiris <vasilis@netdata.cloud> 2022-04-13 20:44:10 +0300
committer: GitHub <noreply@github.com> 2022-04-13 20:44:10 +0300
commit: c234d2366cc9c42389e66a7559e31a7da553d506 (patch)
tree: 2cf6079fb032d5f4e0f91a4e0a9b2943f68f0bdb /ml
parent: 65662fe1524cee3840ead92c0535f00fbadb8b98 (diff)
Cancel anomaly detection threads before joining. (#12681)
Originally, the main training/detection thread loops were meant to be run only for the localhost host. They would stop when `netdata_exit` was set to true during the shutdown process. By enabling training/detection for children, we have to explicitly cancel ML threads because the service thread can free a child host at any point in time without setting `netdata_exit` to true. To support this:
- We send a cancellation request to the training and the detection threads when we call rrdhost_free.
- We disable/enable cancellation for the actual training/detection step on every iteration (in order to protect locks and shared data structures).
Diffstat (limited to 'ml')
-rw-r--r-- ml/Host.cc 12
1 files changed, 11 insertions, 1 deletions
diff --git a/ml/Host.cc b/ml/Host.cc
index b287d1f228..3166720cc8 100644
--- a/ml/Host.cc
+++ b/ml/Host.cc
@@ -359,6 +359,9 @@ void TrainableHost::train() {
Duration<double> MaxSleepFor = Seconds{10 * updateEvery()};
while (!netdata_exit) {
+ netdata_thread_testcancel();
+ netdata_thread_disable_cancelability();
+
updateResourceUsage();
TimePoint NowTP = SteadyClock::now();
@@ -366,6 +369,8 @@ void TrainableHost::train() {
auto P = findDimensionToTrain(NowTP);
trainDimension(P.first, NowTP);
+ netdata_thread_enable_cancelability();
+
Duration<double> AllottedDuration = P.second;
Duration<double> RealDuration = SteadyClock::now() - NowTP;
@@ -477,11 +482,13 @@ void DetectableHost::detect() {
heartbeat_init(&HB);
while (!netdata_exit) {
+ netdata_thread_testcancel();
heartbeat_next(&HB, updateEvery() * USEC_PER_SEC);
+ netdata_thread_disable_cancelability();
detectOnce();
-
updateDetectionChart(getRH());
+ netdata_thread_enable_cancelability();
}
}
@@ -499,6 +506,9 @@ void DetectableHost::startAnomalyDetectionThreads() {
}
void DetectableHost::stopAnomalyDetectionThreads() {
+ netdata_thread_cancel(TrainingThread.native_handle());
+ netdata_thread_cancel(DetectionThread.native_handle());
+
TrainingThread.join();
DetectionThread.join();
}