summaryrefslogtreecommitdiffstats
path: root/ml
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2022-05-09 16:34:31 +0300
committerGitHub <noreply@github.com>2022-05-09 16:34:31 +0300
commiteb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch)
tree353938a0f71da7b04d4f9b67769d2a38ba6db2cb /ml
parent0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff)
Workers utilization charts (#12807)
* initial version of worker utilization * working example * without mutexes * monitoring DBENGINE, ACLKSYNC, WEB workers * added charts to monitor worker usage * fixed charts units * updated contexts * updated priorities * added documentation * converted threads to stacked chart * One query per query thread * Revert "One query per query thread" This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3. * fixed priority for web charts * read worker cpu utilization from proc * read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency * disabled web server cpu utilization monitoring - it is now monitored by worker utilization * tight integration of worker utilization to web server * monitoring statsd worker threads * code cleanup and renaming of variables * constrained worker and statistics conflict to just one variable * support for rendering jobs per type * better priorities and removed the total jobs chart * added busy time in ms per job type * added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads * isolated worker thread families * added cgroups.plugin workers * remove unneeded dimensions when the expected worker is just one * plugins.d and streaming monitoring * rebased; support worker_is_busy() to be called one after another * added diskspace plugin monitoring * added tc.plugin monitoring * added ML threads monitoring * don't create dimensions and charts that are not needed * fix crash when job types are added on the fly * added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to POSIX * the right name is heartbeat for this chart * monitor streaming senders * added streaming senders to global stats * prevent division by zero * added clock_init() to external C plugins * added freebsd and macos plugins * added freebsd and macos to global 
statistics * don't use new as a variable; address compiler warnings on FreeBSD and MacOS * refactored contexts to be unique; added health threads monitoring Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'ml')
-rw-r--r--ml/Host.cc32
1 files changed, 32 insertions, 0 deletions
diff --git a/ml/Host.cc b/ml/Host.cc
index 3166720cc8..4f64bf694e 100644
--- a/ml/Host.cc
+++ b/ml/Host.cc
@@ -358,6 +358,10 @@ void TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) {
void TrainableHost::train() {
Duration<double> MaxSleepFor = Seconds{10 * updateEvery()};
+ worker_register("MLTRAIN");
+ worker_register_job_name(0, "dimensions");
+
+ worker_is_busy(0);
while (!netdata_exit) {
netdata_thread_testcancel();
netdata_thread_disable_cancelability();
@@ -378,11 +382,23 @@ void TrainableHost::train() {
if (RealDuration >= AllottedDuration)
continue;
+ worker_is_idle();
SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor);
std::this_thread::sleep_for(SleepFor);
+ worker_is_busy(0);
}
}
+#define WORKER_JOB_DETECT_DIMENSION 0
+#define WORKER_JOB_UPDATE_DETECTION_CHART 1
+#define WORKER_JOB_UPDATE_ANOMALY_RATES 2
+#define WORKER_JOB_UPDATE_CHARTS 3
+#define WORKER_JOB_SAVE_ANOMALY_EVENT 4
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5
+#endif
+
void DetectableHost::detectOnce() {
auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold);
BitRateWindow::Edge Edge = P.first;
@@ -408,6 +424,8 @@ void DetectableHost::detectOnce() {
DimsOverThreshold.reserve(DimensionsMap.size());
for (auto &DP : DimensionsMap) {
+ worker_is_busy(WORKER_JOB_DETECT_DIMENSION);
+
Dimension *D = DP.second;
auto P = D->detect(WindowLength, ResetBitCounter);
@@ -434,6 +452,7 @@ void DetectableHost::detectOnce() {
}
if (CollectAnomalyRates) {
+ worker_is_busy(WORKER_JOB_UPDATE_ANOMALY_RATES);
AnomalyRateTimer = 0;
rrdset_done(AnomalyRateRS);
}
@@ -442,6 +461,7 @@ void DetectableHost::detectOnce() {
this->NumNormalDimensions = NumNormalDimensions;
this->NumTrainedDimensions = NumTrainedDimensions;
+ worker_is_busy(WORKER_JOB_UPDATE_CHARTS);
updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions);
updateRateChart(getRH(), WindowAnomalyRate * 10000.0);
updateWindowLengthChart(getRH(), WindowLength);
@@ -454,6 +474,8 @@ void DetectableHost::detectOnce() {
if (!NewAnomalyEvent || (DimsOverThreshold.size() == 0))
return;
+ worker_is_busy(WORKER_JOB_SAVE_ANOMALY_EVENT);
+
std::sort(DimsOverThreshold.begin(), DimsOverThreshold.end());
std::reverse(DimsOverThreshold.begin(), DimsOverThreshold.end());
@@ -476,6 +498,13 @@ void DetectableHost::detectOnce() {
}
void DetectableHost::detect() {
+ worker_register("MLDETECT");
+ worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions");
+ worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart");
+ worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates");
+ worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts");
+ worker_register_job_name(WORKER_JOB_SAVE_ANOMALY_EVENT, "anomaly event");
+
std::this_thread::sleep_for(Seconds{10});
heartbeat_t HB;
@@ -483,10 +512,13 @@ void DetectableHost::detect() {
while (!netdata_exit) {
netdata_thread_testcancel();
+ worker_is_idle();
heartbeat_next(&HB, updateEvery() * USEC_PER_SEC);
netdata_thread_disable_cancelability();
detectOnce();
+
+ worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART);
updateDetectionChart(getRH());
netdata_thread_enable_cancelability();
}