diff options
author | Costa Tsaousis <costa@netdata.cloud> | 2022-05-09 16:34:31 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-09 16:34:31 +0300 |
commit | eb216a1f4bbb26e1f18537b30d22e8ad8711f42c (patch) | |
tree | 353938a0f71da7b04d4f9b67769d2a38ba6db2cb /ml | |
parent | 0b3ee50c76dcc3b8dcdd13cec0e432394d3c6964 (diff) |
Workers utilization charts (#12807)
* initial version of worker utilization
* working example
* without mutexes
* monitoring DBENGINE, ACLKSYNC, WEB workers
* added charts to monitor worker usage
* fixed charts units
* updated contexts
* updated priorities
* added documentation
* converted threads to stacked chart
* One query per query thread
* Revert "One query per query thread"
This reverts commit 6aeb391f5987c3c6ba2864b559fd7f0cd64b14d3.
* fixed priority for web charts
* read worker cpu utilization from proc
* read workers cpu utilization via /proc/self/task/PID/stat, so that we have cpu utilization even when the jobs are too long to finish within our update_every frequency
* disabled web server cpu utilization monitoring - it is now monitored by worker utilization
* tight integration of worker utilization to web server
* monitoring statsd worker threads
* code cleanup and renaming of variables
* constrained worker and statistics conflict to just one variable
* support for rendering jobs per type
* better priorities and removed the total jobs chart
* added busy time in ms per job type
* added proc.plugin monitoring, switch clock to MONOTONIC_RAW if available, global statistics now cleans up old worker threads
* isolated worker thread families
* added cgroups.plugin workers
* remove unneeded dimensions when the expected worker is just one
* plugins.d and streaming monitoring
* rebased; support worker_is_busy() to be called one after another
* added diskspace plugin monitoring
* added tc.plugin monitoring
* added ML threads monitoring
* don't create dimensions and charts that are not needed
* fix crash when job types are added on the fly
* added timex and idlejitter plugins; collected heartbeat statistics; reworked heartbeat according to POSIX
* the right name is heartbeat for this chart
* monitor streaming senders
* added streaming senders to global stats
* prevent division by zero
* added clock_init() to external C plugins
* added freebsd and macos plugins
* added freebsd and macos to global statistics
* don't use new as a variable; address compiler warnings on FreeBSD and MacOS
* refactored contexts to be unique; added health threads monitoring
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Diffstat (limited to 'ml')
-rw-r--r-- | ml/Host.cc | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/ml/Host.cc b/ml/Host.cc index 3166720cc8..4f64bf694e 100644 --- a/ml/Host.cc +++ b/ml/Host.cc @@ -358,6 +358,10 @@ void TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) { void TrainableHost::train() { Duration<double> MaxSleepFor = Seconds{10 * updateEvery()}; + worker_register("MLTRAIN"); + worker_register_job_name(0, "dimensions"); + + worker_is_busy(0); while (!netdata_exit) { netdata_thread_testcancel(); netdata_thread_disable_cancelability(); @@ -378,11 +382,23 @@ void TrainableHost::train() { if (RealDuration >= AllottedDuration) continue; + worker_is_idle(); SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor); std::this_thread::sleep_for(SleepFor); + worker_is_busy(0); } } +#define WORKER_JOB_DETECT_DIMENSION 0 +#define WORKER_JOB_UPDATE_DETECTION_CHART 1 +#define WORKER_JOB_UPDATE_ANOMALY_RATES 2 +#define WORKER_JOB_UPDATE_CHARTS 3 +#define WORKER_JOB_SAVE_ANOMALY_EVENT 4 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5 +#endif + void DetectableHost::detectOnce() { auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold); BitRateWindow::Edge Edge = P.first; @@ -408,6 +424,8 @@ void DetectableHost::detectOnce() { DimsOverThreshold.reserve(DimensionsMap.size()); for (auto &DP : DimensionsMap) { + worker_is_busy(WORKER_JOB_DETECT_DIMENSION); + Dimension *D = DP.second; auto P = D->detect(WindowLength, ResetBitCounter); @@ -434,6 +452,7 @@ void DetectableHost::detectOnce() { } if (CollectAnomalyRates) { + worker_is_busy(WORKER_JOB_UPDATE_ANOMALY_RATES); AnomalyRateTimer = 0; rrdset_done(AnomalyRateRS); } @@ -442,6 +461,7 @@ void DetectableHost::detectOnce() { this->NumNormalDimensions = NumNormalDimensions; this->NumTrainedDimensions = NumTrainedDimensions; + worker_is_busy(WORKER_JOB_UPDATE_CHARTS); updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions); updateRateChart(getRH(), WindowAnomalyRate * 
10000.0); updateWindowLengthChart(getRH(), WindowLength); @@ -454,6 +474,8 @@ void DetectableHost::detectOnce() { if (!NewAnomalyEvent || (DimsOverThreshold.size() == 0)) return; + worker_is_busy(WORKER_JOB_SAVE_ANOMALY_EVENT); + std::sort(DimsOverThreshold.begin(), DimsOverThreshold.end()); std::reverse(DimsOverThreshold.begin(), DimsOverThreshold.end()); @@ -476,6 +498,13 @@ void DetectableHost::detectOnce() { } void DetectableHost::detect() { + worker_register("MLDETECT"); + worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions"); + worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart"); + worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates"); + worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts"); + worker_register_job_name(WORKER_JOB_SAVE_ANOMALY_EVENT, "anomaly event"); + std::this_thread::sleep_for(Seconds{10}); heartbeat_t HB; @@ -483,10 +512,13 @@ void DetectableHost::detect() { while (!netdata_exit) { netdata_thread_testcancel(); + worker_is_idle(); heartbeat_next(&HB, updateEvery() * USEC_PER_SEC); netdata_thread_disable_cancelability(); detectOnce(); + + worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART); updateDetectionChart(getRH()); netdata_thread_enable_cancelability(); } |