From fbdb5ff55ed06a1717b484cbdb2fedbf7b7f5c79 Mon Sep 17 00:00:00 2001 From: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Date: Mon, 8 Apr 2024 22:33:37 +0300 Subject: Increase number of pages per extent in dbengine (#17343) Increase max extent pages to 109. The default remains 64. --- src/database/engine/rrdengine.c | 2 +- src/database/engine/rrdengine.h | 3 ++- src/database/rrdhost.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/database/engine/rrdengine.c b/src/database/engine/rrdengine.c index c47a3f2b10..87c1e3ca0b 100644 --- a/src/database/engine/rrdengine.c +++ b/src/database/engine/rrdengine.c @@ -11,7 +11,7 @@ rrdeng_stats_t rrdeng_reserved_file_descriptors = 0; rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0; rrdeng_stats_t global_flushing_pressure_page_deletions = 0; -unsigned rrdeng_pages_per_extent = MAX_PAGES_PER_EXTENT; +unsigned rrdeng_pages_per_extent = DEFAULT_PAGES_PER_EXTENT; #if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_OPCODE_MAX + 2) #error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) diff --git a/src/database/engine/rrdengine.h b/src/database/engine/rrdengine.h index 86cf56c2db..ab30ce8770 100644 --- a/src/database/engine/rrdengine.h +++ b/src/database/engine/rrdengine.h @@ -30,7 +30,8 @@ extern unsigned rrdeng_pages_per_extent; struct rrdengine_instance; struct rrdeng_cmd; -#define MAX_PAGES_PER_EXTENT (64) /* TODO: can go higher only when journal supports bigger than 4KiB transactions */ +#define MAX_PAGES_PER_EXTENT (109) /* TODO: can go higher only when journal supports bigger than 4KiB transactions */ +#define DEFAULT_PAGES_PER_EXTENT (64) #define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u" #define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u" diff --git a/src/database/rrdhost.c b/src/database/rrdhost.c index 8ea1611bb9..a005d60d27 100644 --- a/src/database/rrdhost.c +++ b/src/database/rrdhost.c @@ -827,8 +827,8 @@ void dbengine_init(char *hostname) { #ifdef ENABLE_DBENGINE use_direct_io = config_get_boolean(CONFIG_SECTION_DB, "dbengine use direct io", use_direct_io); - unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", MAX_PAGES_PER_EXTENT); - if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) + unsigned read_num = (unsigned)config_get_number(CONFIG_SECTION_DB, "dbengine pages per extent", DEFAULT_PAGES_PER_EXTENT); + if (read_num > 0 && read_num <= MAX_PAGES_PER_EXTENT) rrdeng_pages_per_extent = read_num; else { nd_log(NDLS_DAEMON, NDLP_WARNING, -- cgit v1.2.3 From 5a9f16f9da0e8b772420904e5cd934af63e48eac Mon Sep 17 00:00:00 2001 From: netdatabot Date: Tue, 9 Apr 2024 00:16:58 +0000 Subject: [ci skip] Update changelog and version for nightly build: v1.45.0-123-nightly.
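For context on the dbengine change above: with MAX_PAGES_PER_EXTENT raised to 109 and DEFAULT_PAGES_PER_EXTENT kept at 64, dbengine_init() now accepts a configured "dbengine pages per extent" anywhere in the range 1 to 109; an out-of-range value triggers the nd_log() warning and leaves rrdeng_pages_per_extent at its compiled-in default of 64. A minimal netdata.conf sketch that opts into the new maximum (assuming CONFIG_SECTION_DB resolves to the [db] section, which this patch does not show):

[db]
    dbengine pages per extent = 109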
--- CHANGELOG.md | 19 +++++++++---------- packaging/version | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7d9cdf381..a9559f9672 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,19 @@ **Merged pull requests:** +- Increase number of pages per extent in dbengine [\#17343](https://github.com/netdata/netdata/pull/17343) ([stelfrag](https://github.com/stelfrag)) +- fix invalid var in prepare\_offline\_install\_source\(\) [\#17342](https://github.com/netdata/netdata/pull/17342) ([ilyam8](https://github.com/ilyam8)) +- remove unused install\_go.sh [\#17339](https://github.com/netdata/netdata/pull/17339) ([ilyam8](https://github.com/ilyam8)) +- fix proc-power-supply charts family [\#17338](https://github.com/netdata/netdata/pull/17338) ([ilyam8](https://github.com/ilyam8)) +- Bump github.com/likexian/whois from 1.15.1 to 1.15.2 in /src/go/collectors/go.d.plugin [\#17337](https://github.com/netdata/netdata/pull/17337) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump github.com/vmware/govmomi from 0.36.2 to 0.36.3 in /src/go/collectors/go.d.plugin [\#17336](https://github.com/netdata/netdata/pull/17336) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump github.com/prometheus/common from 0.51.1 to 0.52.2 in /src/go/collectors/go.d.plugin [\#17335](https://github.com/netdata/netdata/pull/17335) ([dependabot[bot]](https://github.com/apps/dependabot)) +- go.d: schema windows: fix url placeholder scheme [\#17326](https://github.com/netdata/netdata/pull/17326) ([ilyam8](https://github.com/ilyam8)) - go.d: schemas: add missing "body" and "method" [\#17325](https://github.com/netdata/netdata/pull/17325) ([ilyam8](https://github.com/ilyam8)) - remove old overview infrastructure and add home tab doc [\#17323](https://github.com/netdata/netdata/pull/17323) ([Ancairon](https://github.com/Ancairon)) - Drop generic bitmap implementation. [\#17322](https://github.com/netdata/netdata/pull/17322) ([vkalintiris](https://github.com/vkalintiris)) - set min thread stack size to 1 MB [\#17317](https://github.com/netdata/netdata/pull/17317) ([ilyam8](https://github.com/ilyam8)) +- Call with resize true when dictionary has DICT\_OPTION\_INDEX\_HASHTABLE [\#17316](https://github.com/netdata/netdata/pull/17316) ([stelfrag](https://github.com/stelfrag)) - Drop legacy dbengine support [\#17315](https://github.com/netdata/netdata/pull/17315) ([stelfrag](https://github.com/stelfrag)) - Fix assorted issues in the Docker build process. 
[\#17312](https://github.com/netdata/netdata/pull/17312) ([Ferroin](https://github.com/Ferroin)) - dyncfg function on parents should not require any access rights [\#17310](https://github.com/netdata/netdata/pull/17310) ([ktsaou](https://github.com/ktsaou)) @@ -395,16 +404,6 @@ - health: add httpcheck bad header alert [\#16736](https://github.com/netdata/netdata/pull/16736) ([ilyam8](https://github.com/ilyam8)) - update default netdata.conf used for native packages [\#16734](https://github.com/netdata/netdata/pull/16734) ([ilyam8](https://github.com/ilyam8)) - fix missing CPU frequency [\#16732](https://github.com/netdata/netdata/pull/16732) ([ilyam8](https://github.com/ilyam8)) -- Fix handling of hardening flags with Clang [\#16731](https://github.com/netdata/netdata/pull/16731) ([Ferroin](https://github.com/Ferroin)) -- fix excessive "maximum number of cgroups reached" log messages [\#16730](https://github.com/netdata/netdata/pull/16730) ([ilyam8](https://github.com/ilyam8)) -- Regenerate integrations.js [\#16728](https://github.com/netdata/netdata/pull/16728) ([netdatabot](https://github.com/netdatabot)) -- update ebpf-socket function name and columns [\#16727](https://github.com/netdata/netdata/pull/16727) ([ilyam8](https://github.com/ilyam8)) -- Fix --distro-override parameter name in docs [\#16726](https://github.com/netdata/netdata/pull/16726) ([moschlar](https://github.com/moschlar)) -- update go.d.plugin to v0.58.0 [\#16725](https://github.com/netdata/netdata/pull/16725) ([ilyam8](https://github.com/ilyam8)) -- Add GHA workflow to upload kickstart script to our repo server. [\#16724](https://github.com/netdata/netdata/pull/16724) ([Ferroin](https://github.com/Ferroin)) -- Add Netdata Mobile App to issue template config [\#16723](https://github.com/netdata/netdata/pull/16723) ([ilyam8](https://github.com/ilyam8)) -- fix clock resolution detection [\#16720](https://github.com/netdata/netdata/pull/16720) ([ktsaou](https://github.com/ktsaou)) -- cgroups: don't multiply cgroup\_check\_for\_new\_every by update\_every [\#16719](https://github.com/netdata/netdata/pull/16719) ([ilyam8](https://github.com/ilyam8)) ## [v1.44.3](https://github.com/netdata/netdata/tree/v1.44.3) (2024-02-12) diff --git a/packaging/version b/packaging/version index 9f0ddbfc3c..32acd2e22f 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.45.0-113-nightly +v1.45.0-123-nightly -- cgit v1.2.3 From 3acc67a1c7dea77a7e2160c705ef00ab0294cb0d Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Tue, 9 Apr 2024 13:45:55 +0300 Subject: add intel_gpu_top collector (#17344) --- src/go/collectors/go.d.plugin/README.md | 1 + src/go/collectors/go.d.plugin/config/go.d.conf | 1 + .../go.d.plugin/config/go.d/intelgpu.conf | 6 + src/go/collectors/go.d.plugin/modules/init.go | 1 + .../go.d.plugin/modules/intelgpu/charts.go | 92 ++++++++++ .../go.d.plugin/modules/intelgpu/collect.go | 76 ++++++++ .../modules/intelgpu/config_schema.json | 37 ++++ .../go.d.plugin/modules/intelgpu/exec.go | 140 +++++++++++++++ .../go.d.plugin/modules/intelgpu/init.go | 22 +++ .../go.d.plugin/modules/intelgpu/intelgpu.go | 110 ++++++++++++ .../go.d.plugin/modules/intelgpu/intelgpu_test.go | 200 +++++++++++++++++++++ .../go.d.plugin/modules/intelgpu/metadata.yaml | 115 ++++++++++++ .../modules/intelgpu/testdata/config.json | 4 + .../modules/intelgpu/testdata/config.yaml | 2 + .../go.d.plugin/modules/intelgpu/testdata/igt.json | 80 +++++++++ 15 files changed, 887 insertions(+) create mode 100644 
src/go/collectors/go.d.plugin/config/go.d/intelgpu.conf create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/charts.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/collect.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/config_schema.json create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/exec.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/init.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu_test.go create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/metadata.yaml create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.json create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.yaml create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/testdata/igt.json diff --git a/src/go/collectors/go.d.plugin/README.md b/src/go/collectors/go.d.plugin/README.md index fbebb2fd3a..3423c69686 100644 --- a/src/go/collectors/go.d.plugin/README.md +++ b/src/go/collectors/go.d.plugin/README.md @@ -77,6 +77,7 @@ see the appropriate collector readme. | [haproxy](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/haproxy) | HAProxy | | [hdfs](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/hdfs) | HDFS | | [httpcheck](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/httpcheck) | Any HTTP Endpoint | +| [intelgpu](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/intelgpu) | Intel integrated GPU | | [isc_dhcpd](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/isc_dhcpd) | ISC DHCP | | [k8s_kubelet](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/k8s_kubelet) | Kubelet | | [k8s_kubeproxy](https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/k8s_kubeproxy) | Kube-proxy | diff --git a/src/go/collectors/go.d.plugin/config/go.d.conf b/src/go/collectors/go.d.plugin/config/go.d.conf index 9f9d29e0fc..bcd5455bc0 100644 --- a/src/go/collectors/go.d.plugin/config/go.d.conf +++ b/src/go/collectors/go.d.plugin/config/go.d.conf @@ -40,6 +40,7 @@ modules: # haproxy: yes # hdfs: yes # httpcheck: yes +# intelgpu: yes # isc_dhcpd: yes # k8s_kubelet: yes # k8s_kubeproxy: yes diff --git a/src/go/collectors/go.d.plugin/config/go.d/intelgpu.conf b/src/go/collectors/go.d.plugin/config/go.d/intelgpu.conf new file mode 100644 index 0000000000..abaea2d4d0 --- /dev/null +++ b/src/go/collectors/go.d.plugin/config/go.d/intelgpu.conf @@ -0,0 +1,6 @@ +## All available configuration options, their descriptions and default values: +## https://github.com/netdata/netdata/tree/master/src/go/collectors/go.d.plugin/modules/intelgpu#readme + +#jobs: +# - name: intelgpu +# binary_path: /usr/bin/intel_gpu_top diff --git a/src/go/collectors/go.d.plugin/modules/init.go b/src/go/collectors/go.d.plugin/modules/init.go index 37f6c80d4a..bcfd39f3e2 100644 --- a/src/go/collectors/go.d.plugin/modules/init.go +++ b/src/go/collectors/go.d.plugin/modules/init.go @@ -30,6 +30,7 @@ import ( _ "github.com/netdata/netdata/go/go.d.plugin/modules/haproxy" _ "github.com/netdata/netdata/go/go.d.plugin/modules/hdfs" _ "github.com/netdata/netdata/go/go.d.plugin/modules/httpcheck" + _ "github.com/netdata/netdata/go/go.d.plugin/modules/intelgpu" _ 
"github.com/netdata/netdata/go/go.d.plugin/modules/isc_dhcpd" _ "github.com/netdata/netdata/go/go.d.plugin/modules/k8s_kubelet" _ "github.com/netdata/netdata/go/go.d.plugin/modules/k8s_kubeproxy" diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/charts.go b/src/go/collectors/go.d.plugin/modules/intelgpu/charts.go new file mode 100644 index 0000000000..752e210c74 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/charts.go @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + "fmt" + "strings" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" +) + +const ( + prioGPUFrequency = module.Priority + iota + prioGPUPower + prioGPUEngineBusy +) + +var charts = module.Charts{ + intelGPUFrequencyChart.Copy(), + intelGPUPowerGPUChart.Copy(), +} + +var intelGPUFrequencyChart = module.Chart{ + ID: "igpu_frequency", + Title: "Intel GPU frequency", + Units: "MHz", + Fam: "frequency", + Ctx: "intelgpu.frequency", + Type: module.Line, + Priority: prioGPUFrequency, + Dims: module.Dims{ + {ID: "frequency_actual", Name: "frequency", Div: precision}, + }, +} + +var intelGPUPowerGPUChart = module.Chart{ + ID: "igpu_power_gpu", + Title: "Intel GPU power", + Units: "Watts", + Fam: "power", + Ctx: "intelgpu.power", + Type: module.Line, + Priority: prioGPUPower, + Dims: module.Dims{ + {ID: "power_gpu", Name: "gpu", Div: precision}, + {ID: "power_package", Name: "package", Div: precision}, + }, +} + +var intelGPUEngineBusyPercChartTmpl = module.Chart{ + ID: "igpu_engine_%s_busy_percentage", + Title: "Intel GPU engine busy time percentage", + Units: "percentage", + Fam: "engines", + Ctx: "intelgpu.engine_busy_perc", + Type: module.Line, + Priority: prioGPUEngineBusy, + Dims: module.Dims{ + {ID: "engine_%s_busy", Name: "busy", Div: precision}, + }, +} + +func (ig *IntelGPU) addEngineCharts(engine string) { + chart := intelGPUEngineBusyPercChartTmpl.Copy() + + s := strings.ToLower(engine) + s = strings.ReplaceAll(s, "/", "_") + + chart.ID = fmt.Sprintf(chart.ID, s) + chart.Labels = []module.Label{ + {Key: "engine", Value: engineDisplayName(engine)}, + {Key: "engine_instance", Value: engine}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, engine) + } + + if err := ig.Charts().Add(chart); err != nil { + ig.Warning(err) + } +} + +func engineDisplayName(engine string) string { + // https://gitlab.freedesktop.org/drm/igt-gpu-tools/-/blob/master/tools/intel_gpu_top.c#L431 + engines := []string{"Render/3D", "Blitter", "VideoEnhance", "Video", "Compute"} + for _, name := range engines { + if strings.HasPrefix(engine, name) { + return name + } + } + return "unknown" +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/collect.go b/src/go/collectors/go.d.plugin/modules/intelgpu/collect.go new file mode 100644 index 0000000000..38e8b305af --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/collect.go @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + "encoding/json" + "errors" + "fmt" +) + +type ( + gpuSummaryStats struct { + Frequency struct { + Actual float64 `json:"actual"` + } `json:"frequency"` + Power struct { + GPU float64 `json:"gpu"` + Package float64 `json:"package"` + } `json:"power"` + Engines map[string]struct { + Busy float64 `json:"busy"` + } `json:"engines"` + } +) + +const precision = 100 + +func (ig *IntelGPU) collect() (map[string]int64, error) { + if ig.exec == nil { + return nil, errors.New("collector not initialized") + } + + stats, err 
:= ig.getGPUSummaryStats() + if err != nil { + return nil, err + } + + mx := make(map[string]int64) + + mx["frequency_actual"] = int64(stats.Frequency.Actual * precision) + mx["power_gpu"] = int64(stats.Power.GPU * precision) + mx["power_package"] = int64(stats.Power.Package * precision) + + for name, es := range stats.Engines { + if !ig.engines[name] { + ig.addEngineCharts(name) + ig.engines[name] = true + } + + key := fmt.Sprintf("engine_%s_busy", name) + mx[key] = int64(es.Busy * precision) + } + + return mx, nil +} +func (ig *IntelGPU) getGPUSummaryStats() (*gpuSummaryStats, error) { + bs, err := ig.exec.queryGPUSummaryJson() + if err != nil { + return nil, err + } + + if len(bs) == 0 { + return nil, errors.New("query returned empty response") + } + + var stats gpuSummaryStats + if err := json.Unmarshal(bs, &stats); err != nil { + return nil, err + } + + if len(stats.Engines) == 0 { + return nil, errors.New("query returned unexpected response") + } + + return &stats, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/config_schema.json b/src/go/collectors/go.d.plugin/modules/intelgpu/config_schema.json new file mode 100644 index 0000000000..4224129f04 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/config_schema.json @@ -0,0 +1,37 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Intel GPU collector configuration.", + "type": "object", + "properties": { + "update_every": { + "title": "Update every", + "description": "Data collection interval, measured in seconds.", + "type": "integer", + "minimum": 1, + "default": 1 + }, + "binary_path": { + "title": "Binary path", + "description": "Path to the `intel_gpu_top` binary.", + "type": "string", + "default": "/usr/bin/intel_gpu_top" + } + }, + "required": [ + "binary_path" + ], + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "binary_path": { + "ui:help": "If an absolute path is provided, the collector will use it directly; otherwise, it will search for the binary in directories specified in the PATH environment variable." 
+ } + } +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/exec.go b/src/go/collectors/go.d.plugin/modules/intelgpu/exec.go new file mode 100644 index 0000000000..836c6f58a7 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/exec.go @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + "bufio" + "bytes" + "errors" + "os/exec" + "strconv" + "sync" + "time" +) + +func newIntelGpuTopExec(binPath string, updateEvery int) (*intelGpuTopExec, error) { + topExec := &intelGpuTopExec{ + binPath: binPath, + updateEvery: updateEvery, + } + + if err := topExec.run(); err != nil { + return nil, err + } + + return topExec, nil +} + +type intelGpuTopExec struct { + binPath string + updateEvery int + + cmd *exec.Cmd + done chan struct{} + + mux sync.Mutex + lastSample string +} + +func (e *intelGpuTopExec) run() error { + refresh := 900 + if e.updateEvery > 1 { + refresh = e.updateEvery*1000 - 500 // milliseconds + } + + cmd := exec.Command(e.binPath, "-J", "-s", strconv.Itoa(refresh)) + + r, err := cmd.StdoutPipe() + if err != nil { + return err + } + + if err := cmd.Start(); err != nil { + return err + } + + firstSample := make(chan struct{}, 1) + done := make(chan struct{}) + e.cmd = cmd + e.done = done + + go func() { + defer close(done) + sc := bufio.NewScanner(r) + var buf bytes.Buffer + var n int + + for sc.Scan() { + if n++; n > 1000 { + break + } + + text := sc.Text() + + if buf.Cap() == 0 && text != "{" || text == "" { + continue + } + + if text == "}," { + text = "}" + } + + buf.WriteString(text + "\n") + + if text[0] == '}' { + e.mux.Lock() + e.lastSample = buf.String() + e.mux.Unlock() + + select { + case firstSample <- struct{}{}: + default: + } + + buf.Reset() + n = 0 + } + } + }() + + select { + case <-e.done: + _ = e.stop() + return errors.New("process exited before the first sample was collected") + case <-time.After(time.Second * 3): + _ = e.stop() + return errors.New("timed out waiting for first sample") + case <-firstSample: + return nil + } +} + +func (e *intelGpuTopExec) queryGPUSummaryJson() ([]byte, error) { + select { + case <-e.done: + return nil, errors.New("process has already exited") + default: + } + + e.mux.Lock() + defer e.mux.Unlock() + + return []byte(e.lastSample), nil +} + +func (e *intelGpuTopExec) stop() error { + if e.cmd == nil || e.cmd.Process == nil { + return nil + } + + _ = e.cmd.Process.Kill() + _, _ = e.cmd.Process.Wait() + e.cmd = nil + + select { + case <-e.done: + return nil + case <-time.After(time.Second * 2): + return errors.New("timed out waiting for process to exit") + } +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/init.go b/src/go/collectors/go.d.plugin/modules/intelgpu/init.go new file mode 100644 index 0000000000..a398b0fa1f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/init.go @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + "fmt" + "os" + "os/exec" +) + +func (ig *IntelGPU) initIntelGPUTopExec() (intelGpuTop, error) { + binPath := ig.BinaryPath + if _, err := os.Stat(binPath); os.IsNotExist(err) { + path, err := exec.LookPath(ig.binName) + if err != nil { + return nil, fmt.Errorf("error on lookup '%s': %v", ig.binName, err) + } + binPath = path + } + + return newIntelGpuTopExec(binPath, ig.UpdateEvery) +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu.go b/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu.go new file mode 100644 index 0000000000..5b355811ba --- 
/dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu.go @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + _ "embed" + "errors" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" +) + +//go:embed "config_schema.json" +var configSchema string + +func init() { + module.Register("intelgpu", module.Creator{ + Create: func() module.Module { return New() }, + JobConfigSchema: configSchema, + }) +} + +func New() *IntelGPU { + return &IntelGPU{ + Config: Config{ + BinaryPath: "/usr/bin/intel_gpu_top", + }, + binName: "intel_gpu_top", + charts: charts.Copy(), + engines: make(map[string]bool), + } +} + +type Config struct { + UpdateEvery int `yaml:"update_every" json:"update_every"` + BinaryPath string `yaml:"binary_path" json:"binary_path"` +} + +type ( + IntelGPU struct { + module.Base + Config `yaml:",inline" json:""` + + charts *module.Charts + + exec intelGpuTop + binName string + + engines map[string]bool + } + intelGpuTop interface { + queryGPUSummaryJson() ([]byte, error) + stop() error + } +) + +func (ig *IntelGPU) Configuration() any { + return ig.Config +} + +func (ig *IntelGPU) Init() error { + topExec, err := ig.initIntelGPUTopExec() + if err != nil { + ig.Error(err) + return err + } + + ig.exec = topExec + + return nil +} + +func (ig *IntelGPU) Check() error { + mx, err := ig.collect() + if err != nil { + ig.Error(err) + return err + } + + if len(mx) == 0 { + return errors.New("no metrics collected") + } + + return nil +} + +func (ig *IntelGPU) Charts() *module.Charts { + return ig.charts +} + +func (ig *IntelGPU) Collect() map[string]int64 { + mx, err := ig.collect() + if err != nil { + ig.Error(err) + } + + if len(mx) == 0 { + return nil + } + + return mx +} + +func (ig *IntelGPU) Cleanup() { + if ig.exec != nil { + if err := ig.exec.stop(); err != nil { + ig.Error(err) + } + ig.exec = nil + } +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu_test.go b/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu_test.go new file mode 100644 index 0000000000..99ca604da0 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/intelgpu_test.go @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package intelgpu + +import ( + "errors" + "os" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var ( + dataConfigJSON, _ = os.ReadFile("testdata/config.json") + dataConfigYAML, _ = os.ReadFile("testdata/config.yaml") + + dataIntelTopGpuJSON, _ = os.ReadFile("testdata/igt.json") +) + +func Test_testDataIsValid(t *testing.T) { + for name, data := range map[string][]byte{ + "dataConfigJSON": dataConfigJSON, + "dataConfigYAML": dataConfigYAML, + "dataIntelTopGpuJSON": dataIntelTopGpuJSON, + } { + require.NotNil(t, data, name) + } +} + +func TestIntelGPU_Init(t *testing.T) { + tests := map[string]struct { + prepare func(igt *IntelGPU) + wantFail bool + }{ + "fails if can't find intel_gpu_top": { + wantFail: true, + prepare: func(igt *IntelGPU) { + igt.binName += "!!!" 
+ }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + igt := New() + + test.prepare(igt) + + if test.wantFail { + assert.Error(t, igt.Init()) + } else { + assert.NoError(t, igt.Init()) + } + }) + } +} + +func TestIntelGPU_Check(t *testing.T) { + tests := map[string]struct { + prepareMock func() *mockIntelGpuTop + wantFail bool + }{ + "success case": { + prepareMock: prepareMockOK, + wantFail: false, + }, + "fail on error": { + prepareMock: prepareMockErrOnGPUSummaryJson, + wantFail: true, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + igt := New() + mock := test.prepareMock() + igt.exec = mock + + if test.wantFail { + assert.Error(t, igt.Check()) + } else { + assert.NoError(t, igt.Check()) + } + }) + } +} + +func TestIntelGPU_Collect(t *testing.T) { + tests := map[string]struct { + prepareMock func() *mockIntelGpuTop + wantMetrics map[string]int64 + }{ + "success case": { + prepareMock: prepareMockOK, + wantMetrics: map[string]int64{ + "engine_Blitter/0_busy": 0, + "engine_Render/3D/0_busy": 9609, + "engine_Video/0_busy": 7295, + "engine_Video/1_busy": 7740, + "engine_VideoEnhance/0_busy": 0, + "frequency_actual": 125308, + "power_gpu": 323, + "power_package": 1665, + }, + }, + "fail on error": { + prepareMock: prepareMockErrOnGPUSummaryJson, + wantMetrics: nil, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + igt := New() + mock := test.prepareMock() + igt.exec = mock + + mx := igt.Collect() + + assert.Equal(t, test.wantMetrics, mx) + if len(test.wantMetrics) > 0 { + assert.Len(t, *igt.Charts(), len(charts)+len(igt.engines)) + } + }) + } +} + +func TestIntelGPU_Cleanup(t *testing.T) { + tests := map[string]struct { + prepare func() *IntelGPU + }{ + "not initialized exec": { + prepare: func() *IntelGPU { + return New() + }, + }, + "after check": { + prepare: func() *IntelGPU { + igt := New() + igt.exec = prepareMockOK() + _ = igt.Check() + return igt + }, + }, + "after collect": { + prepare: func() *IntelGPU { + igt := New() + igt.exec = prepareMockOK() + _ = igt.Collect() + return igt + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + igt := test.prepare() + + mock, ok := igt.exec.(*mockIntelGpuTop) + + assert.NotPanics(t, igt.Cleanup) + + if ok { + assert.True(t, mock.stopCalled) + } + }) + } +} + +func prepareMockOK() *mockIntelGpuTop { + return &mockIntelGpuTop{ + gpuSummaryJson: dataIntelTopGpuJSON, + } +} + +func prepareMockErrOnGPUSummaryJson() *mockIntelGpuTop { + return &mockIntelGpuTop{ + errOnQueryGPUSummaryJson: true, + } +} + +type mockIntelGpuTop struct { + errOnQueryGPUSummaryJson bool + gpuSummaryJson []byte + + stopCalled bool +} + +func (m *mockIntelGpuTop) queryGPUSummaryJson() ([]byte, error) { + if m.errOnQueryGPUSummaryJson { + return nil, errors.New("error on mock.queryGPUSummaryJson()") + } + return m.gpuSummaryJson, nil +} + +func (m *mockIntelGpuTop) stop() error { + m.stopCalled = true + return nil +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/metadata.yaml b/src/go/collectors/go.d.plugin/modules/intelgpu/metadata.yaml new file mode 100644 index 0000000000..4290df65af --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/metadata.yaml @@ -0,0 +1,115 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-intelgpu + plugin_name: go.d.plugin + module_name: intelgpu + monitored_instance: + name: Intel GPU + link: https://www.intel.com/ + icon_filename: intel.svg + categories: + - 
data-collection.hardware-devices-and-sensors + keywords: + - intel + - gpu + - hardware + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: | + This collector monitors Intel integrated GPUs performance metrics using + the [intel_gpu_top](https://manpages.debian.org/testing/intel-gpu-tools/intel_gpu_top.1.en.html) CLI tool. + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: true + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: [] + configuration: + file: + name: go.d/intelgpu.conf + options: + description: | + The following options can be defined globally: update_every. + folding: + title: Config options + enabled: true + list: + - name: update_every + description: Data collection frequency. + default_value: 1 + required: false + - name: binary_path + description: Path to the intel_gpu_top binary. If an absolute path is provided, the collector will use it directly; otherwise, it will search for the binary in directories specified in the PATH environment variable. + default_value: /usr/bin/intel_gpu_top + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Custom binary path + description: The executable is not in the directories specified in the PATH environment variable. + config: | + jobs: + - name: intelgpu + binary_path: /usr/local/sbin/intel_gpu_top + troubleshooting: + problems: + list: [] + alerts: [] + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: global + description: These metrics refer to the Intel GPU. + labels: [] + metrics: + - name: intelgpu.frequency + description: Intel GPU frequency + unit: MHz + chart_type: line + dimensions: + - name: frequency + - name: intelgpu.power + description: Intel GPU power + unit: Watts + chart_type: line + dimensions: + - name: gpu + - name: package + - name: engine + description: These metrics refer to the GPU hardware engine. + labels: + - name: engine + description: Engine name (Render/3D, Blitter, VideoEnhance, Video, Compute). + - name: engine_instance + description: Engine instance (e.g. Render/3D/0, Video/0, Video/1).
+ metrics: + - name: intelgpu.engine_busy_perc + description: Intel GPU engine busy time percentage + unit: percentage + chart_type: line + dimensions: + - name: busy diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.json b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.json new file mode 100644 index 0000000000..723a66ff24 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.json @@ -0,0 +1,4 @@ +{ + "update_every": 123, + "binary_path": "ok" +} diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.yaml b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.yaml new file mode 100644 index 0000000000..7d0ab9437f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/config.yaml @@ -0,0 +1,2 @@ +update_every: 123 +binary_path: "ok" diff --git a/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/igt.json b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/igt.json new file mode 100644 index 0000000000..4d43cbc5f9 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/intelgpu/testdata/igt.json @@ -0,0 +1,80 @@ +{ + "period": { + "duration": 1055.794145, + "unit": "ms" + }, + "frequency": { + "requested": 1449.146131, + "actual": 1253.085184, + "unit": "MHz" + }, + "interrupts": { + "count": 1757.918443, + "unit": "irq/s" + }, + "rc6": { + "value": 0.000000, + "unit": "%" + }, + "power": { + "GPU": 3.233528, + "Package": 16.658620, + "unit": "W" + }, + "engines": { + "Render/3D/0": { + "busy": 96.092944, + "sema": 0.000000, + "wait": 0.000000, + "unit": "%" + }, + "Blitter/0": { + "busy": 0.000000, + "sema": 0.000000, + "wait": 0.000000, + "unit": "%" + }, + "Video/0": { + "busy": 72.950675, + "sema": 0.000000, + "wait": 0.000000, + "unit": "%" + }, + "Video/1": { + "busy": 77.402254, + "sema": 0.000000, + "wait": 0.000000, + "unit": "%" + }, + "VideoEnhance/0": { + "busy": 0.000000, + "sema": 0.000000, + "wait": 0.000000, + "unit": "%" + } + }, + "clients": { + "4292239459": { + "name": "ffmpeg", + "pid": "2727837", + "engine-classes": { + "Render/3D": { + "busy": "101.396726", + "unit": "%" + }, + "Blitter": { + "busy": "0.000000", + "unit": "%" + }, + "Video": { + "busy": "159.292435", + "unit": "%" + }, + "VideoEnhance": { + "busy": "0.000000", + "unit": "%" + } + } + } + } +} -- cgit v1.2.3 From 758f63e9a4c51e8024018dd9e01e12608bf730a1 Mon Sep 17 00:00:00 2001 From: Fotis Voutsas Date: Tue, 9 Apr 2024 14:01:21 +0300 Subject: add try except (#17352) --- integrations/gen_docs_integrations.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/integrations/gen_docs_integrations.py b/integrations/gen_docs_integrations.py index caceb9cc9c..f869cb9d7f 100644 --- a/integrations/gen_docs_integrations.py +++ b/integrations/gen_docs_integrations.py @@ -335,8 +335,11 @@ def make_symlinks(symlink_dict): takes a dictionary with directories that have a 1:1 relationship between their README and the integration (only one) inside the "integrations" folder. 
""" for element in symlink_dict: - # Remove the README to prevent it being a normal file - Path(f'{element}/README.md').unlink() + try: + # Remove the README to prevent it being a normal file + Path(f'{element}/README.md').unlink() + except FileNotFoundError: + continue # and then make a symlink to the actual markdown Path(f'{element}/README.md').symlink_to(symlink_dict[element]) -- cgit v1.2.3 From 54a2ac3520e71278c5e3e50b60c0672eff057aee Mon Sep 17 00:00:00 2001 From: Netdata bot <43409846+netdatabot@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:11:08 +0300 Subject: Regenerate integrations.js (#17353) Co-authored-by: ilyam8 <22274335+ilyam8@users.noreply.github.com> --- integrations/integrations.js | 2 +- integrations/integrations.json | 2 +- src/collectors/COLLECTORS.md | 2 + .../modules/intelgpu/integrations/intel_gpu.md | 173 +++++++++++++++++++++ 4 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 src/go/collectors/go.d.plugin/modules/intelgpu/integrations/intel_gpu.md diff --git a/integrations/integrations.js b/integrations/integrations.js index 8d9950164c..86641e5fce 100644 --- a/integrations/integrations.js +++ b/integrations/integrations.js @@ -2,4 +2,4 @@ // It gets generated by integrations/gen_integrations.py in the Netdata repo export const categories = [{"id": "deploy", "name": "Deploy", "description": "", "most_popular": true, "priority": 1, "children": [{"id": "deploy.operating-systems", "name": "Operating Systems", "description": "", "most_popular": true, "priority": 1, "children": []}, {"id": "deploy.docker-kubernetes", "name": "Docker & Kubernetes", "description": "", "most_popular": true, "priority": 2, "children": []}, {"id": "deploy.provisioning-systems", "parent": "deploy", "name": "Provisioning Systems", "description": "", "most_popular": false, "priority": -1, "children": []}]}, {"id": "data-collection", "name": "Data Collection", "description": "", "most_popular": true, "priority": 2, "children": [{"id": "data-collection.other", "name": "Other", "description": "", "most_popular": false, "priority": -1, "collector_default": true, "children": []}, {"id": "data-collection.ebpf", "name": "eBPF", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.freebsd", "name": "FreeBSD", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.containers-and-vms", "name": "Containers and VMs", "description": "", "most_popular": true, "priority": 6, "children": []}, {"id": "data-collection.database-servers", "name": "Databases", "description": "", "most_popular": true, "priority": 1, "children": []}, {"id": "data-collection.kubernetes", "name": "Kubernetes", "description": "", "most_popular": true, "priority": 7, "children": []}, {"id": "data-collection.notifications", "name": "Incident Management", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.service-discovery-registry", "name": "Service Discovery / Registry", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.web-servers-and-web-proxies", "name": "Web Servers and Web Proxies", "description": "", "most_popular": true, "priority": 2, "children": []}, {"id": "data-collection.cloud-provider-managed", "name": "Cloud Provider Managed", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.windows-systems", "name": "Windows Systems", "description": "", "most_popular": false, 
"priority": -1, "children": []}, {"id": "data-collection.apm", "name": "APM", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.hardware-devices-and-sensors", "name": "Hardware Devices and Sensors", "description": "", "most_popular": true, "priority": 4, "children": []}, {"id": "data-collection.macos-systems", "name": "macOS Systems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.message-brokers", "name": "Message Brokers", "description": "", "most_popular": true, "priority": 3, "children": []}, {"id": "data-collection.provisioning-systems", "name": "Provisioning Systems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.search-engines", "name": "Search Engines", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems", "name": "Linux Systems", "description": "", "most_popular": true, "priority": 5, "children": [{"id": "data-collection.linux-systems.system-metrics", "name": "System", "description": "", "most_popular": true, "priority": 1, "children": []}, {"id": "data-collection.linux-systems.memory-metrics", "name": "Memory", "description": "", "most_popular": true, "priority": 3, "children": []}, {"id": "data-collection.linux-systems.cpu-metrics", "name": "CPU", "description": "", "most_popular": true, "priority": 2, "children": []}, {"id": "data-collection.linux-systems.pressure-metrics", "name": "Pressure", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.network-metrics", "name": "Network", "description": "", "most_popular": true, "priority": 5, "children": []}, {"id": "data-collection.linux-systems.ipc-metrics", "name": "IPC", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.disk-metrics", "name": "Disk", "description": "", "most_popular": true, "priority": 4, "children": []}, {"id": "data-collection.linux-systems.firewall-metrics", "name": "Firewall", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.power-supply-metrics", "name": "Power Supply", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.filesystem-metrics", "name": "Filesystem", "description": "", "most_popular": false, "priority": -1, "children": [{"id": "data-collection.linux-systems.filesystem-metrics.zfs", "name": "ZFS", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.filesystem-metrics.btrfs", "name": "BTRFS", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.linux-systems.filesystem-metrics.nfs", "name": "NFS", "description": "", "most_popular": false, "priority": -1, "children": []}]}, {"id": "data-collection.linux-systems.kernel-metrics", "name": "Kernel", "description": "", "most_popular": false, "priority": -1, "children": []}]}, {"id": "data-collection.networking-stack-and-network-interfaces", "name": "Networking Stack and Network Interfaces", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.synthetic-checks", "name": "Synthetic Checks", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.ci-cd-systems", "name": "CICD Platforms", "description": "", 
"most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.ups", "name": "UPS", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.freebsd-systems", "name": "FreeBSD Systems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.logs-servers", "name": "Logs Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.security-systems", "name": "Security Systems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.observability", "name": "Observability", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.gaming", "name": "Gaming", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.iot-devices", "name": "IoT Devices", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.media-streaming-servers", "name": "Media Services", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.authentication-and-authorization", "name": "Authentication and Authorization", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.project-management", "name": "Project Management", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.application-servers", "name": "Application Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.dns-and-dhcp-servers", "name": "DNS and DHCP Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.mail-servers", "name": "Mail Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.processes-and-system-services", "name": "Processes and System Services", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.storage-mount-points-and-filesystems", "name": "Storage, Mount Points and Filesystems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.systemd", "name": "Systemd", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.telephony-servers", "name": "Telephony Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.vpns", "name": "VPNs", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.blockchain-servers", "name": "Blockchain Servers", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.distributed-computing-systems", "name": "Distributed Computing Systems", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.generic-data-collection", "name": "Generic Data Collection", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.p2p", "name": "P2P", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.snmp-and-networked-devices", "name": "SNMP and Networked Devices", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.system-clock-and-ntp", "name": "System Clock and NTP", "description": "", 
"most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.nas", "name": "NAS", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.api-gateways", "name": "API Gateways", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.task-queues", "name": "Task Queues", "description": "", "most_popular": false, "priority": -1, "children": []}, {"id": "data-collection.ftp-servers", "name": "FTP Servers", "description": "", "most_popular": false, "priority": -1, "children": []}]}, {"id": "logs", "name": "Logs", "description": "Monitoring logs on your infrastructure", "most_popular": true, "priority": 3, "children": []}, {"id": "export", "name": "exporters", "description": "Exporter Integrations", "most_popular": true, "priority": 5, "children": []}, {"id": "notify", "name": "notifications", "description": "Notification Integrations", "most_popular": true, "priority": 4, "children": [{"id": "notify.agent", "name": "Agent Dispatched Notifications", "description": "", "most_popular": true, "priority": 2, "children": []}, {"id": "notify.cloud", "name": "Centralized Cloud Notifications", "description": "", "most_popular": true, "priority": 1, "children": []}]}] -export const integrations = [{"meta": {"plugin_name": "apps.plugin", "module_name": "apps", "monitored_instance": {"name": "Applications", "link": "", "categories": ["data-collection.processes-and-system-services"], "icon_filename": "applications.svg"}, "related_resources": {"integrations": {"list": []}}, "info_provided_to_referring_integrations": {"description": ""}, "keywords": ["applications", "processes", "os", "host monitoring"], "most_popular": false}, "overview": "# Applications\n\nPlugin: apps.plugin\nModule: apps\n\n## Overview\n\nMonitor Applications for optimal software performance and resource usage.\n\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\nNo action required.\n\n### Configuration\n\n#### File\n\nThere is no configuration file.\n#### Options\n\n\n\nThere are no configuration options.\n\n#### Examples\nThere are no configuration examples.\n\n", "troubleshooting": "", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per applications group\n\nThese metrics refer to the application group.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| app_group | The name of the group defined in the configuration. 
|\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| app.cpu_utilization | user, system | percentage |\n| app.cpu_guest_utilization | guest | percentage |\n| app.cpu_context_switches | voluntary, involuntary | switches/s |\n| app.mem_usage | rss | MiB |\n| app.mem_private_usage | mem | MiB |\n| app.vmem_usage | vmem | MiB |\n| app.mem_page_faults | minor, major | pgfaults/s |\n| app.swap_usage | swap | MiB |\n| app.disk_physical_io | reads, writes | KiB/s |\n| app.disk_logical_io | reads, writes | KiB/s |\n| app.processes | processes | processes |\n| app.threads | threads | threads |\n| app.fds_open_limit | limit | percentage |\n| app.fds_open | files, sockets, pipes, inotifies, event, timer, signal, eventpolls, other | fds |\n| app.uptime | uptime | seconds |\n| app.uptime_summary | min, avg, max | seconds |\n\n", "integration_type": "collector", "id": "apps.plugin-apps-Applications", "edit_link": "https://github.com/netdata/netdata/blob/master/src/collectors/apps.plugin/metadata.yaml", "related_resources": ""}, {"meta": {"plugin_name": "apps.plugin", "module_name": "groups", "monitored_instance": {"name": "User Groups", "link": "", "categories": ["data-collection.processes-and-system-services"], "icon_filename": "user.svg"}, "related_resources": {"integrations": {"list": []}}, "info_provided_to_referring_integrations": {"description": ""}, "keywords": ["groups", "processes", "user auditing", "authorization", "os", "host monitoring"], "most_popular": false}, "overview": "# User Groups\n\nPlugin: apps.plugin\nModule: groups\n\n## Overview\n\nThis integration monitors resource utilization on a user groups context.\n\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\nNo action required.\n\n### Configuration\n\n#### File\n\nThere is no configuration file.\n#### Options\n\n\n\nThere are no configuration options.\n\n#### Examples\nThere are no configuration examples.\n\n", "troubleshooting": "", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per user group\n\nThese metrics refer to the user group.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| user_group | The name of the user group. 
|\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| usergroup.cpu_utilization | user, system | percentage |\n| usergroup.cpu_guest_utilization | guest | percentage |\n| usergroup.cpu_context_switches | voluntary, involuntary | switches/s |\n| usergroup.mem_usage | rss | MiB |\n| usergroup.mem_private_usage | mem | MiB |\n| usergroup.vmem_usage | vmem | MiB |\n| usergroup.mem_page_faults | minor, major | pgfaults/s |\n| usergroup.swap_usage | swap | MiB |\n| usergroup.disk_physical_io | reads, writes | KiB/s |\n| usergroup.disk_logical_io | reads, writes | KiB/s |\n| usergroup.processes | processes | processes |\n| usergroup.threads | threads | threads |\n| usergroup.fds_open_limit | limit | percentage |\n| usergroup.fds_open | files, sockets, pipes, inotifies, event, timer, signal, eventpolls, other | fds |\n| usergroup.uptime | uptime | seconds |\n| usergroup.uptime_summary | min, avg, max | seconds |\n\n", "integration_type": "collector", "id": "apps.plugin-groups-User_Groups", "edit_link": "https://github.com/netdata/netdata/blob/master/src/collectors/apps.plugin/metadata.yaml", "related_resources": ""}, {"meta": {"plugin_name": "apps.plugin", "module_name": "users", "monitored_instance": {"name": "Users", "link": "", "categories": ["data-collection.processes-and-system-services"], "icon_filename": "users.svg"}, "related_resources": {"integrations": {"list": []}}, "info_provided_to_referring_integrations": {"description": ""}, "keywords": ["users", "processes", "os", "host monitoring"], "most_popular": false}, "overview": "# Users\n\nPlugin: apps.plugin\nModule: users\n\n## Overview\n\nThis integration monitors resource utilization on a user context.\n\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\nNo action required.\n\n### Configuration\n\n#### File\n\nThere is no configuration file.\n#### Options\n\n\n\nThere are no configuration options.\n\n#### Examples\nThere are no configuration examples.\n\n", "troubleshooting": "", "alerts": "## Alerts\n\nThere are no alerts configured by default for this integration.\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per user\n\nThese metrics refer to the user.\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| user | The name of the user. 
|\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| user.cpu_utilization | user, system | percentage |\n| user.cpu_guest_utilization | guest | percentage |\n| user.cpu_context_switches | voluntary, involuntary | switches/s |\n| user.mem_usage | rss | MiB |\n| user.mem_private_usage | mem | MiB |\n| user.vmem_usage | vmem | MiB |\n| user.mem_page_faults | minor, major | pgfaults/s |\n| user.swap_usage | swap | MiB |\n| user.disk_physical_io | reads, writes | KiB/s |\n| user.disk_logical_io | reads, writes | KiB/s |\n| user.processes | processes | processes |\n| user.threads | threads | threads |\n| user.fds_open_limit | limit | percentage |\n| user.fds_open | files, sockets, pipes, inotifies, event, timer, signal, eventpolls, other | fds |\n| user.uptime | uptime | seconds |\n| user.uptime_summary | min, avg, max | seconds |\n\n", "integration_type": "collector", "id": "apps.plugin-users-Users", "edit_link": "https://github.com/netdata/netdata/blob/master/src/collectors/apps.plugin/metadata.yaml", "related_resources": ""}, {"meta": {"plugin_name": "cgroups.plugin", "module_name": "/sys/fs/cgroup", "monitored_instance": {"name": "Containers", "link": "", "categories": ["data-collection.containers-and-vms"], "icon_filename": "container.svg"}, "related_resources": {"integrations": {"list": []}}, "info_provided_to_referring_integrations": {"description": ""}, "keywords": ["containers"], "most_popular": true}, "overview": "# Containers\n\nPlugin: cgroups.plugin\nModule: /sys/fs/cgroup\n\n## Overview\n\nMonitor Containers for performance, resource usage, and health status.\n\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\nNo action required.\n\n### Configuration\n\n#### File\n\nThere is no configuration file.\n#### Options\n\n\n\nThere are no configuration options.\n\n#### Examples\nThere are no configuration examples.\n\n", "troubleshooting": "", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |\n| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |\n| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |\n| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |\n", "metrics": "## Metrics\n\nMetrics grouped by *scope*.\n\nThe scope defines the 
instance that the metric belongs to. An instance is uniquely identified by a set of labels.\n\n\n\n### Per cgroup\n\n\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| container_name | The container name or group path if name resolution fails. |\n| image | Docker/Podman container image name. |\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| cgroup.cpu_limit | used | percentage |\n| cgroup.cpu | user, system | percentage |\n| cgroup.cpu_per_core | a dimension per core | percentage |\n| cgroup.throttled | throttled | percentage |\n| cgroup.throttled_duration | duration | ms |\n| cgroup.cpu_shares | shares | shares |\n| cgroup.mem | cache, rss, swap, rss_huge, mapped_file | MiB |\n| cgroup.writeback | dirty, writeback | MiB |\n| cgroup.mem_activity | in, out | MiB/s |\n| cgroup.pgfaults | pgfault, swap | MiB/s |\n| cgroup.mem_usage | ram, swap | MiB |\n| cgroup.mem_usage_limit | available, used | MiB |\n| cgroup.mem_utilization | utilization | percentage |\n| cgroup.mem_failcnt | failures | count |\n| cgroup.io | read, write | KiB/s |\n| cgroup.serviced_ops | read, write | operations/s |\n| cgroup.throttle_io | read, write | KiB/s |\n| cgroup.throttle_serviced_ops | read, write | operations/s |\n| cgroup.queued_ops | read, write | operations |\n| cgroup.merged_ops | read, write | operations/s |\n| cgroup.cpu_some_pressure | some10, some60, some300 | percentage |\n| cgroup.cpu_some_pressure_stall_time | time | ms |\n| cgroup.cpu_full_pressure | some10, some60, some300 | percentage |\n| cgroup.cpu_full_pressure_stall_time | time | ms |\n| cgroup.memory_some_pressure | some10, some60, some300 | percentage |\n| cgroup.memory_some_pressure_stall_time | time | ms |\n| cgroup.memory_full_pressure | some10, some60, some300 | percentage |\n| cgroup.memory_full_pressure_stall_time | time | ms |\n| cgroup.io_some_pressure | some10, some60, some300 | percentage |\n| cgroup.io_some_pressure_stall_time | time | ms |\n| cgroup.io_full_pressure | some10, some60, some300 | percentage |\n| cgroup.io_full_pressure_stall_time | time | ms |\n| cgroup.pids_current | pids | pids |\n\n### Per cgroup network device\n\n\n\nLabels:\n\n| Label | Description |\n|:-----------|:----------------|\n| container_name | The container name or group path if name resolution fails. |\n| image | Docker/Podman container image name. |\n| device | The name of the host network interface linked to the container's network interface. |\n| container_device | Container network interface name. |\n| interface_type | Network interface type. Always \"virtual\" for the containers. 
|\n\nMetrics:\n\n| Metric | Dimensions | Unit |\n|:------|:----------|:----|\n| cgroup.net_net | received, sent | kilobits/s |\n| cgroup.net_packets | received, sent, multicast | pps |\n| cgroup.net_errors | inbound, outbound | errors/s |\n| cgroup.net_drops | inbound, outbound | errors/s |\n| cgroup.net_fifo | receive, transmit | errors/s |\n| cgroup.net_compressed | receive, sent | pps |\n| cgroup.net_events | frames, collisions, carrier | events/s |\n| cgroup.net_operstate | up, down, notpresent, lowerlayerdown, testing, dormant, unknown | state |\n| cgroup.net_carrier | up, down | state |\n| cgroup.net_mtu | mtu | octets |\n\n", "integration_type": "collector", "id": "cgroups.plugin-/sys/fs/cgroup-Containers", "edit_link": "https://github.com/netdata/netdata/blob/master/src/collectors/cgroups.plugin/metadata.yaml", "related_resources": ""}, {"meta": {"plugin_name": "cgroups.plugin", "module_name": "/sys/fs/cgroup", "monitored_instance": {"name": "Kubernetes Containers", "link": "https://kubernetes.io/", "icon_filename": "kubernetes.svg", "categories": ["data-collection.kubernetes"]}, "related_resources": {"integrations": {"list": []}}, "info_provided_to_referring_integrations": {"description": ""}, "keywords": ["k8s", "kubernetes", "pods", "containers"], "most_popular": true}, "overview": "# Kubernetes Containers\n\nPlugin: cgroups.plugin\nModule: /sys/fs/cgroup\n\n## Overview\n\nMonitor Containers for performance, resource usage, and health status.\n\n\n\nThis collector is supported on all platforms.\n\nThis collector supports collecting metrics from multiple instances of this integration, including remote instances.\n\n\n### Default Behavior\n\n#### Auto-Detection\n\nThis integration doesn't support auto-detection.\n\n#### Limits\n\nThe default configuration for this integration does not impose any limits on data collection.\n\n#### Performance Impact\n\nThe default configuration for this integration is not expected to impose a significant performance impact on the system.\n", "setup": "## Setup\n\n### Prerequisites\n\nNo action required.\n\n### Configuration\n\n#### File\n\nThere is no configuration file.\n#### Options\n\n\n\nThere are no configuration options.\n\n#### Examples\nThere are no configuration examples.\n\n", "troubleshooting": "", "alerts": "## Alerts\n\n\nThe following alerts are available:\n\n| Alert name | On metric | Description |\n|:------------|:----------|:------------|\n| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 1