From f6ec79cfb8e10b421655df5af1336c91cf41ded4 Mon Sep 17 00:00:00 2001 From: Emmanuel Vasilakis Date: Mon, 14 Jun 2021 13:56:23 +0300 Subject: Swap class and type attributes in stock alarm configurations (#11240) * swap type and class * edit REFERENCE.md --- health/REFERENCE.md | 55 ++++++++++------- health/health.d/adaptec_raid.conf | 8 +-- health/health.d/anomalies.conf | 8 +-- health/health.d/apcupsd.conf | 12 ++-- health/health.d/backend.conf | 12 ++-- health/health.d/bcache.conf | 8 +-- health/health.d/beanstalkd.conf | 4 +- health/health.d/bind_rndc.conf | 4 +- health/health.d/boinc.conf | 16 ++--- health/health.d/btrfs.conf | 16 ++--- health/health.d/ceph.conf | 4 +- health/health.d/cgroups.conf | 8 +-- health/health.d/cockroachdb.conf | 16 ++--- health/health.d/cpu.conf | 16 ++--- health/health.d/dbengine.conf | 16 ++--- health/health.d/disks.conf | 16 ++--- health/health.d/dns_query.conf | 4 +- health/health.d/dnsmasq_dhcp.conf | 4 +- health/health.d/dockerd.conf | 4 +- health/health.d/elasticsearch.conf | 4 +- health/health.d/entropy.conf | 4 +- health/health.d/exporting.conf | 29 +++++---- health/health.d/fping.conf | 16 ++--- health/health.d/fronius.conf | 4 +- health/health.d/gearman.conf | 4 +- health/health.d/go.d.plugin.conf | 4 +- health/health.d/haproxy.conf | 12 ++-- health/health.d/hdfs.conf | 20 +++--- health/health.d/httpcheck.conf | 32 +++++----- health/health.d/ioping.conf | 4 +- health/health.d/ipc.conf | 8 +-- health/health.d/ipfs.conf | 4 +- health/health.d/ipmi.conf | 8 +-- health/health.d/kubelet.conf | 36 +++++------ health/health.d/linux_power_supply.conf | 4 +- health/health.d/load.conf | 16 ++--- health/health.d/mdstat.conf | 16 ++--- health/health.d/megacli.conf | 20 +++--- health/health.d/memcached.conf | 12 ++-- health/health.d/memory.conf | 12 ++-- health/health.d/mysql.conf | 44 +++++++------- health/health.d/net.conf | 56 ++++++++--------- health/health.d/netfilter.conf | 4 +- health/health.d/pihole.conf | 16 ++--- 
health/health.d/portcheck.conf | 12 ++-- health/health.d/processes.conf | 4 +- health/health.d/python.d.plugin.conf | 4 +- health/health.d/ram.conf | 20 +++--- health/health.d/redis.conf | 8 +-- health/health.d/retroshare.conf | 4 +- health/health.d/riakkv.conf | 24 ++++---- health/health.d/scaleio.conf | 8 +-- health/health.d/softnet.conf | 12 ++-- health/health.d/stiebeleltron.conf | 4 +- health/health.d/swap.conf | 8 +-- health/health.d/systemdunits.conf | 40 ++++++------ health/health.d/tcp_conn.conf | 4 +- health/health.d/tcp_listen.conf | 16 ++--- health/health.d/tcp_mem.conf | 4 +- health/health.d/tcp_orphans.conf | 4 +- health/health.d/tcp_resets.conf | 16 ++--- health/health.d/timex.conf | 4 +- health/health.d/udp_errors.conf | 8 +-- health/health.d/unbound.conf | 8 +-- health/health.d/varnish.conf | 4 +- health/health.d/vcsa.conf | 32 +++++----- health/health.d/vernemq.conf | 104 ++++++++++++++++---------------- health/health.d/vsphere.conf | 44 +++++++------- health/health.d/web_log.conf | 96 ++++++++++++++--------------- health/health.d/whoisquery.conf | 4 +- health/health.d/wmi.conf | 32 +++++----- health/health.d/x509check.conf | 8 +-- health/health.d/zfs.conf | 12 ++-- 73 files changed, 591 insertions(+), 577 deletions(-) diff --git a/health/REFERENCE.md b/health/REFERENCE.md index 5ea6b7c5d6..456912ea70 100644 --- a/health/REFERENCE.md +++ b/health/REFERENCE.md @@ -59,9 +59,9 @@ Netdata parses the following lines. Beneath the table is an in-depth explanation | --------------------------------------------------- | --------------- | ------------------------------------------------------------------------------------- | | [`alarm`/`template`](#alarm-line-alarm-or-template) | yes | Name of the alarm/template. | | [`on`](#alarm-line-on) | yes | The chart this alarm should attach to. | -| [`class`](#alarm-line-class) | no | The general classification of the alarm. 
| -| [`component`](#alarm-line-component) | no | Specify the component of the class of the alarm. | -| [`type`](#alarm-line-type) | no | The type of error the alarm monitors. | +| [`class`](#alarm-line-class) | no | The general alarm classification. | +| [`type`](#alarm-line-type) | no | What area of the system the alarm monitors. | +| [`component`](#alarm-line-component) | no | Specific component of the type of the alarm. | | [`os`](#alarm-line-os) | no | Which operating systems to run this chart. | | [`hosts`](#alarm-line-hosts) | no | Which hostnames will run this alarm. | | [`plugin`](#alarm-line-plugin) | no | Restrict an alarm or template to only a certain plugin. | @@ -136,17 +136,38 @@ If you create a template using the `disk.io` context, it will apply an alarm to #### Alarm line `class` -Specify the classification of the alarm or template. +This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues on network interfaces, web servers, or database systems. Example: -Class can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` class, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example: +```yaml +class: Latency +``` + +
+Netdata's stock alarms use the following `class` attributes by default: + +| Class | +| ----------------| +| Errors | +| Latency | +| Utilization | +| Workload | + + +
+ +`class` will default to `Unknown` if the line is missing from the alarm configuration. + +#### Alarm line `type` + +Type can be used to indicate the broader area of the system that the alarm applies to. For example, under the general `Database` type, you can group together alarms that operate on various database systems, like `MySQL`, `CockroachDB`, `CouchDB` etc. Example: ```yaml -class: Database +type: Database ```
-Netdata's stock alarms use the following `class` attributes by default, but feel free to adjust for your own requirements. +Netdata's stock alarms use the following `type` attributes by default, but feel free to adjust for your own requirements. -| Class | Description | +| Type | Description | | ------------------------ | ------------------------------------------------------------------------------------------------ | | Ad Filtering | Services related to Ad Filtering (like pi-hole) | | Certificates | Certificates monitoring related | @@ -162,7 +183,7 @@ class: Database | Linux | Services specific to Linux (e.g. systemd) | | Messaging | Alerts for message passing services (e.g. vernemq) | | Netdata | Internal Netdata components monitoring | -| Other | Use as a general class of alerts | +| Other | When an alert doesn't fit in other types. | | Power Supply | Alerts from power supply related services (e.g. apcupsd) | | Search engine | Alerts for search services (e.g. elasticsearch) | | Storage | Class for alerts dealing with storage services (storage devices typically live under `System`) | @@ -174,26 +195,16 @@ class: Database
-If an alarm configuration is missing the `class` line, its value will default to `Unknown`. +If an alarm configuration is missing the `type` line, its value will default to `Unknown`. #### Alarm line `component` -Component can be used to narrow down what the previous `class` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` classification. Example: +Component can be used to narrow down what the previous `type` value specifies for each alarm or template. Continuing from the previous example, `component` might include `MySQL`, `CockroachDB`, `MongoDB`, all under the same `Database` type. Example: ```yaml component: MySQL ``` -As with the `class` line, if `component` is missing from the configuration, its value will default to `Unknown`. - -#### Alarm line `type` - -This indicates the type of error (or general problem area) that the alarm or template applies to. For example, `Latency` can be used for alarms that trigger on latency issues in network interfaces, web servers, or database systems. Example: - -```yaml -type: Latency -``` - -`type` will also (as with `class` and `component`) default to `Unknown` if the line is missing from the alarm configuration. +As with the `class` and `type` lines, if `component` is missing from the configuration, its value will default to `Unknown`. 
#### Alarm line `os` diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index b067e18401..1d823adddd 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -3,9 +3,9 @@ template: adaptec_raid_ld_status on: adaptec_raid.ld_status - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s @@ -18,9 +18,9 @@ component: RAID template: adaptec_raid_pd_state on: adaptec_raid.pd_state - class: System + class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: bool every: 10s diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index f27e39fc10..269ae544b0 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -2,9 +2,9 @@ template: anomalies_anomaly_probabilities on: anomalies.probability - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: average -2m foreach * every: 1m warn: $this > 50 @@ -14,9 +14,9 @@ component: ML template: anomalies_anomaly_flags on: anomalies.anomaly - class: Netdata + class: Errors + type: Netdata component: ML - type: Errors lookup: sum -2m foreach * every: 1m warn: $this > 10 diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 07b5c28c97..65f1a69ab9 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -2,9 +2,9 @@ template: apcupsd_10min_ups_load on: apcupsd.load - class: Power Supply + class: Utilization + type: Power Supply component: UPS - type: Utilization os: * hosts: * lookup: average -10m unaligned of percentage @@ -20,9 +20,9 @@ component: UPS # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. 
template: apcupsd_ups_charge on: apcupsd.charge - class: Power Supply + class: Errors + type: Power Supply component: UPS - type: Errors os: * hosts: * lookup: average -60s unaligned of charge @@ -36,9 +36,9 @@ component: UPS template: apcupsd_last_collected_secs on: apcupsd.load - class: Power Supply + class: Latency + type: Power Supply component: UPS device - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index 948ea551a0..91d469395e 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -1,9 +1,9 @@ # Alert that backends subsystem will be disabled soon alarm: backend_metrics_eol on: netdata.backend_metrics - class: Netdata + class: Errors + type: Netdata component: Exporting engine - type: Errors units: boolean calc: $now - $last_collected_t every: 1m @@ -16,9 +16,9 @@ component: Exporting engine alarm: backend_last_buffering on: netdata.backend_metrics - class: Netdata + class: Latency + type: Netdata component: Exporting engine - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -30,9 +30,9 @@ component: Exporting engine alarm: backend_metrics_sent on: netdata.backend_metrics - class: Netdata + class: Workload + type: Netdata component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index d75d8e19b0..49cb5ad0f6 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,9 +1,9 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - class: System + class: Errors + type: System component: Disk - type: Errors lookup: sum -1m unaligned absolute units: errors every: 1m @@ -16,9 +16,9 @@ component: Disk template: bcache_cache_dirty on: disk.bcache_cache_alloc - class: System + class: Utilization + type: System component: Disk - type: Utilization calc: $dirty + 
$metadata + $undefined units: % every: 1m diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 99c7545710..13ac8c1825 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -2,9 +2,9 @@ template: beanstalk_server_buried_jobs on: beanstalk.current_jobs - class: Messaging + class: Workload + type: Messaging component: Beanstalk - type: Workload calc: $buried units: jobs every: 10s diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index e88f87a4fa..7c09225ff6 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -1,8 +1,8 @@ template: bind_rndc_stats_file_size on: bind_rndc.stats_size - class: DNS + class: Utilization + type: DNS component: BIND - type: Utilization units: megabytes every: 60 calc: $stats_size diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 8604abee96..7d7a4fdae3 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -3,9 +3,9 @@ # Warn on any compute errors encountered. 
template: boinc_compute_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -21,9 +21,9 @@ component: BOINC # Warn on lots of upload errors template: boinc_upload_errors on: boinc.states - class: Computing + class: Errors + type: Computing component: BOINC - type: Errors os: * hosts: * families: * @@ -39,9 +39,9 @@ component: BOINC # Warn on the task queue being empty template: boinc_total_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * @@ -57,9 +57,9 @@ component: BOINC # Warn on no active tasks with a non-empty queue template: boinc_active_tasks on: boinc.tasks - class: Computing + class: Utilization + type: Computing component: BOINC - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index d3200a7eee..8d197aa8d2 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -1,9 +1,9 @@ template: btrfs_allocated on: btrfs.disk - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -18,9 +18,9 @@ component: File system template: btrfs_data on: btrfs.data - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -35,9 +35,9 @@ component: File system template: btrfs_metadata on: btrfs.metadata - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * @@ -52,9 +52,9 @@ component: File system template: btrfs_system on: btrfs.system - class: System + class: Utilization + type: System component: File system - type: Utilization os: * hosts: * families: * diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index ed8f9b4b95..1f9da25c75 100644 --- a/health/health.d/ceph.conf +++ 
b/health/health.d/ceph.conf @@ -2,9 +2,9 @@ template: ceph_cluster_space_usage on: ceph.general_usage - class: Storage + class: Utilization + type: Storage component: Ceph - type: Utilization calc: $used * 100 / ($used + $avail) units: % every: 1m diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 068533f102..45b34806ca 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -3,9 +3,9 @@ template: cgroup_10min_cpu_usage on: cgroup.cpu_limit - class: Cgroups + class: Utilization + type: Cgroups component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned @@ -19,9 +19,9 @@ component: CPU template: cgroup_ram_in_use on: cgroup.mem_usage - class: Cgroups + class: Utilization + type: Cgroups component: Memory - type: Utilization os: linux hosts: * calc: ($ram) * 100 / $memory_limit diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index 88011b42af..3d95d4f29d 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -3,9 +3,9 @@ template: cockroachdb_used_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_used_percent units: % every: 10s @@ -17,9 +17,9 @@ component: CockroachDB template: cockroachdb_used_usable_storage_capacity on: cockroachdb.storage_used_capacity_percentage - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $capacity_usable_used_percent units: % every: 10s @@ -33,9 +33,9 @@ component: CockroachDB template: cockroachdb_unavailable_ranges on: cockroachdb.ranges_replication_problem - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $ranges_unavailable units: num every: 10s @@ -48,9 +48,9 @@ component: CockroachDB template: cockroachdb_open_file_descriptors_limit on: 
cockroachdb.process_file_descriptors - class: Database + class: Utilization + type: Database component: CockroachDB - type: Utilization calc: $sys_fd_open/$sys_fd_softlimit * 100 units: % every: 10s diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index d112157683..ad69528253 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -3,9 +3,9 @@ template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of user,system,softirq,irq,guest @@ -19,9 +19,9 @@ component: CPU template: 10min_cpu_iowait on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: linux hosts: * lookup: average -10m unaligned of iowait @@ -35,9 +35,9 @@ component: CPU template: 20min_steal_cpu on: system.cpu - class: System + class: Latency + type: System component: CPU - type: Latency os: linux hosts: * lookup: average -20m unaligned of steal @@ -52,9 +52,9 @@ component: CPU ## FreeBSD template: 10min_cpu_usage on: system.cpu - class: System + class: Utilization + type: System component: CPU - type: Utilization os: freebsd hosts: * lookup: average -10m unaligned of user,system,interrupt diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 79c156ab83..65c41b8462 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -3,9 +3,9 @@ alarm: 10min_dbengine_global_fs_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of fs_errors @@ -18,9 +18,9 @@ component: DB engine alarm: 10min_dbengine_global_io_errors on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of io_errors @@ -33,9 +33,9 @@ 
component: DB engine alarm: 10min_dbengine_global_flushing_warnings on: netdata.dbengine_global_errors - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events @@ -49,9 +49,9 @@ component: DB engine alarm: 10min_dbengine_global_flushing_errors on: netdata.dbengine_long_term_page_stats - class: Netdata + class: Errors + type: Netdata component: DB engine - type: Errors os: linux freebsd macos hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 0b88106181..5daff61a14 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -11,9 +11,9 @@ template: disk_space_usage on: disk.space - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -28,9 +28,9 @@ component: Disk template: disk_inode_usage on: disk.inodes - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: !/dev !/dev/* !/run !/run/* * @@ -136,9 +136,9 @@ component: Disk template: 10min_disk_utilization on: disk.util - class: System + class: Utilization + type: System component: Disk - type: Utilization os: linux freebsd hosts: * families: * @@ -158,9 +158,9 @@ component: Disk template: 10min_disk_backlog on: disk.backlog - class: System + class: Latency + type: System component: Disk - type: Latency os: linux hosts: * families: * diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index 1fbb2c5981..ec4937c0a8 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -3,9 +3,9 @@ template: dns_query_time_query_time on: dns_query_time.query_time - class: DNS + class: Latency + type: DNS component: DNS - type: Latency lookup: average -10s unaligned foreach * units: ms 
every: 10s diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index 10d139f772..010b945992 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -2,9 +2,9 @@ template: dnsmasq_dhcp_dhcp_range_utilization on: dnsmasq_dhcp.dhcp_range_utilization - class: DHCP + class: Utilization + type: DHCP component: Dnsmasq - type: Utilization every: 10s units: % calc: $used diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf index ba866f81ba..220ddd664b 100644 --- a/health/health.d/dockerd.conf +++ b/health/health.d/dockerd.conf @@ -1,8 +1,8 @@ template: docker_unhealthy_containers on: docker.unhealthy_containers - class: Containers + class: Errors + type: Containers component: Docker - type: Errors units: unhealthy containers every: 10s lookup: average -10s diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index 05d576c39e..dfec133b82 100644 --- a/health/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf @@ -3,9 +3,9 @@ template: elasticsearch_last_collected on: elasticsearch.cluster_health_status - class: Search engine + class: Latency + type: Search engine component: Elasticsearch - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 0478fa0bec..13b0fcde4d 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -5,9 +5,9 @@ alarm: lowest_entropy on: system.entropy - class: System + class: Utilization + type: System component: Cryptography - type: Utilization os: linux hosts: * lookup: min -5m unaligned diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 4430f3fd82..06f398c6e4 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -1,22 +1,25 @@ -template: exporting_last_buffering -families: * - on: exporting_data_size - calc: $now - $last_collected_t - 
units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful buffering of exporting data - to: dba + template: exporting_last_buffering + families: * + on: exporting_data_size + class: Latency + type: Netdata +component: Exporting engine + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of exporting data + to: dba template: exporting_metrics_sent families: * on: exporting_data_size - class: Netdata + class: Workload + type: Netdata component: Exporting engine - type: Workload units: % calc: abs($sent) * 100 / abs($buffered) every: 10s diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 120fe8f281..bb22419faa 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -2,9 +2,9 @@ template: fping_last_collected_secs families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -17,9 +17,9 @@ component: Network template: fping_host_reachable families: * on: fping.latency - class: Other + class: Errors + type: Other component: Network - type: Errors calc: $average != nan units: up/down every: 10s @@ -31,9 +31,9 @@ component: Network template: fping_host_latency families: * on: fping.latency - class: Other + class: Latency + type: Other component: Network - type: Latency lookup: average -10s unaligned of average units: ms every: 10s @@ -48,9 +48,9 @@ component: Network template: fping_packet_loss families: * on: 
fping.quality - class: System + class: Errors + type: System component: Network - type: Errors lookup: average -10m unaligned of returned calc: 100 - $this green: 1 diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf index 81aafaa60a..853bd7fbc2 100644 --- a/health/health.d/fronius.conf +++ b/health/health.d/fronius.conf @@ -1,9 +1,9 @@ template: fronius_last_collected_secs families: * on: fronius.power - class: Power Supply + class: Latency + type: Power Supply component: Solar - type: Latency calc: $now - $last_collected_t every: 10s units: seconds ago diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index 289e5fbb58..d04c1fdb91 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -1,9 +1,9 @@ template: gearman_workers_queued on: gearman.single_job - class: Computing + class: Latency + type: Computing component: Gearman - type: Latency lookup: average -10m unaligned match-names of Queued units: workers every: 10s diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf index ecd79c208f..8bf84a9769 100644 --- a/health/health.d/go.d.plugin.conf +++ b/health/health.d/go.d.plugin.conf @@ -3,9 +3,9 @@ template: go.d_job_last_collected_secs on: netdata.go_plugin_execution_time - class: Netdata + class: Error + type: Netdata component: go.d.plugin - type: Error module: * calc: $now - $last_collected_t units: seconds ago diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf index 9f6b1c5776..c1375351e5 100644 --- a/health/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf @@ -1,8 +1,8 @@ template: haproxy_backend_server_status on: haproxy_hs.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: Errors units: failed servers every: 10s lookup: average -10s @@ -12,9 +12,9 @@ component: HAProxy template: haproxy_backend_status on: haproxy_hb.down - class: Web Proxy + class: Errors + type: Web Proxy component: HAProxy - type: 
Errors units: failed backend every: 10s lookup: average -10s @@ -24,9 +24,9 @@ component: HAProxy template: haproxy_last_collected on: haproxy_hb.down - class: Web Proxy + class: Latency + type: Web Proxy component: HAProxy - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index c67bf11dae..ca8df31b92 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -3,9 +3,9 @@ template: hdfs_capacity_usage on: hdfs.capacity - class: Storage + class: Utilization + type: Storage component: HDFS - type: Utilization calc: ($used) * 100 / ($used + $remaining) units: % every: 10s @@ -20,9 +20,9 @@ component: HDFS template: hdfs_missing_blocks on: hdfs.blocks - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $missing units: missing blocks every: 10s @@ -34,9 +34,9 @@ component: HDFS template: hdfs_stale_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $stale units: dead nodes every: 10s @@ -48,9 +48,9 @@ component: HDFS template: hdfs_dead_nodes on: hdfs.data_nodes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $dead units: dead nodes every: 10s @@ -64,9 +64,9 @@ component: HDFS template: hdfs_num_failed_volumes on: hdfs.num_failed_volumes - class: Storage + class: Errors + type: Storage component: HDFS - type: Errors calc: $fsds_num_failed_volumes units: failed volumes every: 10s diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 39fc95a2c8..599c47acc1 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -3,9 +3,9 @@ template: httpcheck_web_service_up families: * on: httpcheck.status - class: Web Server + class: Utilization + type: Web Server component: HTTP endpoint - type: Utilization lookup: average -1m unaligned percentage of success calc: ($this < 75) ? 
(0) : ($this) every: 5s @@ -16,9 +16,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_content families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_content every: 10s units: % @@ -32,9 +32,9 @@ component: HTTP endpoint template: httpcheck_web_service_bad_status families: * on: httpcheck.status - class: Web Server + class: Workload + type: Web Server component: HTTP endpoint - type: Workload lookup: average -5m unaligned percentage of bad_status every: 10s units: % @@ -48,9 +48,9 @@ component: HTTP endpoint template: httpcheck_web_service_timeouts families: * on: httpcheck.status - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -59,9 +59,9 @@ component: HTTP endpoint template: httpcheck_no_web_service_connections families: * on: httpcheck.status - class: Other + class: Errors + type: Other component: HTTP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection every: 10s units: % @@ -71,9 +71,9 @@ component: HTTP endpoint template: httpcheck_web_service_unreachable families: * on: httpcheck.status - class: Web Server + class: Errors + type: Web Server component: HTTP endpoint - type: Errors calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? 
($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s @@ -87,9 +87,9 @@ component: HTTP endpoint template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime - class: Other + class: Latency + type: Other component: HTTP endpoint - type: Latency lookup: average -1h unaligned of time every: 30s units: ms @@ -98,9 +98,9 @@ component: HTTP endpoint template: httpcheck_web_service_slow families: * on: httpcheck.responsetime - class: Web Server + class: Latency + type: Web Server component: HTTP endpoint - type: Latency lookup: average -3m unaligned of time units: ms every: 10s diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 57ce4e8666..ee4befbea6 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,9 +1,9 @@ template: ioping_disk_latency families: * on: ioping.latency - class: System + class: Latency + type: System component: Disk - type: Latency lookup: average -10s unaligned of average units: ms every: 10s diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index 6eaf7abe9d..c178a410aa 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -3,9 +3,9 @@ alarm: semaphores_used on: system.ipc_semaphores - class: System + class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $semaphores * 100 / $ipc_semaphores_max @@ -19,9 +19,9 @@ component: IPC alarm: semaphore_arrays_used on: system.ipc_semaphore_arrays - class: System + class: Utilization + type: System component: IPC - type: Utilization os: linux hosts: * calc: $arrays * 100 / $ipc_semaphores_arrays_max diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf index 6268f40927..a514ddfd03 100644 --- a/health/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf @@ -1,9 +1,9 @@ template: ipfs_datastore_usage on: ipfs.repo_size - class: Data Sharing + class: Utilization + type: Data Sharing component: IPFS - type: 
Utilization calc: $size * 100 / $avail units: % every: 10s diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index d4fdc6c793..feadba1b76 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -1,8 +1,8 @@ alarm: ipmi_sensors_states on: ipmi.sensors_states - class: System + class: Errors + type: System component: IPMI - type: Errors calc: $warning + $critical units: sensors every: 10s @@ -14,9 +14,9 @@ component: IPMI alarm: ipmi_events on: ipmi.events - class: System + class: Utilization + type: System component: IPMI - type: Utilization calc: $events units: events every: 10s diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index 4d3c45f971..c2778cc5ee 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -6,9 +6,9 @@ template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors calc: $kubelet_node_config_error units: bool every: 10s @@ -22,9 +22,9 @@ component: Kubelet template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors units: failed requests every: 10s warn: $this > 0 @@ -37,9 +37,9 @@ component: Kubelet template: kubelet_operations_error lookup: sum -1m on: k8s_kubelet.kubelet_operations_errors - class: Kubernetes + class: Errors + type: Kubernetes component: Kubelet - type: Errors units: errors every: 10s warn: $this > (($status >= $WARNING) ? 
(0) : (20)) @@ -64,9 +64,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s @@ -74,9 +74,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s @@ -92,9 +92,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s @@ -102,9 +102,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s @@ -120,9 +120,9 @@ component: Kubelet template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s @@ -130,9 +130,9 @@ component: Kubelet template: kubelet_10s_pleg_relist_latency_quantile_099 
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Kubernetes + class: Latency + type: Kubernetes component: Kubelet - type: Latency lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index e28c246a31..c0bc6de8a1 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -2,9 +2,9 @@ template: linux_power_supply_capacity on: powersupply.capacity - class: Power Supply + class: Utilization + type: Power Supply component: Battery - type: Utilization calc: $capacity units: % every: 10s diff --git a/health/health.d/load.conf b/health/health.d/load.conf index e811f6ee20..0bd872f85d 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -6,9 +6,9 @@ # minute, with a special case for a single CPU of setting the trigger at 2. alarm: load_cpu_number on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? 
( 2 ) : ( $active_processors ) @@ -22,9 +22,9 @@ component: Load alarm: load_average_15 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load15 @@ -37,9 +37,9 @@ component: Load alarm: load_average_5 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load5 @@ -52,9 +52,9 @@ component: Load alarm: load_average_1 on: system.load - class: System + class: Utilization + type: System component: Load - type: Utilization os: linux hosts: * lookup: max -1m unaligned of load1 diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index 67483b2018..cedaa000eb 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -1,8 +1,8 @@ template: mdstat_last_collected on: md.disks - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s @@ -13,9 +13,9 @@ component: RAID template: mdstat_disks on: md.disks - class: System + class: Errors + type: System component: RAID - type: Errors units: failed devices every: 10s calc: $down @@ -26,9 +26,9 @@ component: RAID template: mdstat_mismatch_cnt on: md.mismatch_cnt - class: System + class: Errors + type: System component: RAID - type: Errors families: !*(raid1) !*(raid10) * units: unsynchronized blocks calc: $count @@ -40,9 +40,9 @@ component: RAID template: mdstat_nonredundant_last_collected on: md.nonredundant - class: System + class: Latency + type: System component: RAID - type: Latency calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 1b6502f621..9fbcfdb92d 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -3,9 +3,9 @@ template: megacli_adapter_state on: megacli.adapter_degraded - class: System + 
class: Errors + type: System component: RAID - type: Errors lookup: max -10s foreach * units: boolean every: 10s @@ -18,9 +18,9 @@ component: RAID template: megacli_pd_predictive_failures on: megacli.pd_predictive_failure - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: predictive failures every: 10s @@ -31,9 +31,9 @@ component: RAID template: megacli_pd_media_errors on: megacli.pd_media_error - class: System + class: Errors + type: System component: RAID - type: Errors lookup: sum -10s foreach * units: media errors every: 10s @@ -46,9 +46,9 @@ component: RAID template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: percent every: 10s @@ -59,9 +59,9 @@ component: RAID template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count - class: System + class: Workload + type: System component: RAID - type: Workload lookup: average -10s units: cycles every: 10s diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index 1efad98a05..2a2fe4b82b 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -3,9 +3,9 @@ template: memcached_cache_memory_usage on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: $used * 100 / ($used + $available) units: % every: 10s @@ -20,9 +20,9 @@ component: Memcached template: memcached_cache_fill_rate on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) units: KB/hour @@ -34,9 +34,9 @@ component: Memcached template: memcached_out_of_cache_space_time on: memcached.cache - class: KV Storage + class: Utilization + type: KV Storage component: Memcached - type: Utilization calc: 
($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index ab651315f3..010cbbd7b8 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -3,9 +3,9 @@ alarm: 1hour_ecc_memory_correctable on: mem.ecc_ce - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -18,9 +18,9 @@ component: Memory alarm: 1hour_ecc_memory_uncorrectable on: mem.ecc_ue - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * lookup: sum -10m unaligned @@ -33,9 +33,9 @@ component: Memory alarm: 1hour_memory_hw_corrupted on: mem.hwcorrupt - class: System + class: Errors + type: System component: Memory - type: Errors os: linux hosts: * calc: $HardwareCorrupted diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index a30e1b3bc3..34452d9831 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -3,9 +3,9 @@ template: mysql_10s_slow_queries on: mysql.queries - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s of slow_queries units: slow queries every: 10s @@ -21,9 +21,9 @@ component: MySQL template: mysql_10s_table_locks_immediate on: mysql.table_locks - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: sum -10s absolute of immediate units: immediate locks every: 10s @@ -32,9 +32,9 @@ component: MySQL template: mysql_10s_table_locks_waited on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency lookup: sum -10s absolute of waited units: waited locks every: 10s @@ -43,9 +43,9 @@ component: MySQL template: mysql_10s_waited_locks_ratio on: mysql.table_locks - class: Database + class: Latency + type: Database component: MySQL - type: Latency calc: ( 
($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 units: % every: 10s @@ -61,9 +61,9 @@ component: MySQL template: mysql_connections on: mysql.connections_active - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $active * 100 / $limit units: % every: 10s @@ -79,9 +79,9 @@ component: MySQL template: mysql_replication on: mysql.slave_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 units: ok/failed every: 10s @@ -92,9 +92,9 @@ component: MySQL template: mysql_replication_lag on: mysql.slave_behind - class: Database + class: Latency + type: Database component: MySQL - type: Errors calc: $seconds units: seconds every: 10s @@ -111,9 +111,9 @@ component: MySQL template: mysql_galera_cluster_size_max_2m on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization lookup: max -2m absolute units: nodes every: 10s @@ -122,9 +122,9 @@ component: MySQL template: mysql_galera_cluster_size on: mysql.galera_cluster_size - class: Database + class: Utilization + type: Database component: MySQL - type: Utilization calc: $nodes units: nodes every: 10s @@ -138,9 +138,9 @@ component: MySQL template: mysql_galera_cluster_state on: mysql.galera_cluster_state - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $state every: 10s warn: $this == 2 OR $this == 3 @@ -155,9 +155,9 @@ component: MySQL template: mysql_galera_cluster_status on: mysql.galera_cluster_status - class: Database + class: Errors + type: Database component: MySQL - type: Errors calc: $wsrep_cluster_status every: 10s crit: $mysql_galera_cluster_state != nan AND $this != 0 diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 
db480031ce..028ca7b81b 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -6,9 +6,9 @@ template: interface_speed on: net.net - class: System + class: Latency + type: System component: Network - type: Latency os: * hosts: * families: * @@ -19,9 +19,9 @@ component: Network template: 1m_received_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -36,9 +36,9 @@ component: Network template: 1m_sent_traffic_overflow on: net.net - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * families: * @@ -63,9 +63,9 @@ component: Network template: inbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -76,9 +76,9 @@ component: Network template: outbound_packets_dropped on: net.drops - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* * @@ -89,9 +89,9 @@ component: Network template: inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * @@ -106,9 +106,9 @@ component: Network template: outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: !net* !wl* * @@ -123,9 +123,9 @@ component: Network template: wifi_inbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* @@ -140,9 +140,9 @@ component: Network template: wifi_outbound_packets_dropped_ratio on: net.packets - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: wl* @@ -160,9 +160,9 @@ component: Network template: 
interface_inbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -176,9 +176,9 @@ component: Network template: interface_outbound_errors on: net.errors - class: System + class: Errors + type: System component: Network - type: Errors os: freebsd hosts: * families: * @@ -200,9 +200,9 @@ component: Network template: 10min_fifo_errors on: net.fifo - class: System + class: Errors + type: System component: Network - type: Errors os: linux hosts: * families: * @@ -225,9 +225,9 @@ component: Network template: 1m_received_packets_rate on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * @@ -238,9 +238,9 @@ component: Network template: 10s_received_packets_storm on: net.packets - class: System + class: Workload + type: System component: Network - type: Workload os: linux freebsd hosts: * families: * diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 35c89caf7a..7de383fa2b 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -3,9 +3,9 @@ alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets - class: System + class: Workload + type: System component: Network - type: Workload os: linux hosts: * lookup: max -10s unaligned of connections diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 429ff17849..f9a40e240f 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -3,9 +3,9 @@ template: pihole_blocked_queries on: pihole.dns_queries_percentage - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: % calc: $blocked @@ -21,9 +21,9 @@ component: Pi-hole template: pihole_blocklist_last_update on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: 
seconds calc: $ago @@ -36,9 +36,9 @@ component: Pi-hole template: pihole_blocklist_gravity_file on: pihole.blocklist_last_update - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $file_exists @@ -52,9 +52,9 @@ component: Pi-hole template: pihole_status on: pihole.unwanted_domains_blocking_status - class: Ad Filtering + class: Errors + type: Ad Filtering component: Pi-hole - type: Errors every: 10s units: boolean calc: $enabled diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index d864b7b82b..8cbd7729c8 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -3,9 +3,9 @@ template: portcheck_service_reachable families: * on: portcheck.status - class: Other + class: Workload + type: Other component: TCP endpoint - type: Workload lookup: average -1m unaligned percentage of success calc: ($this < 75) ? (0) : ($this) every: 5s @@ -16,9 +16,9 @@ component: TCP endpoint template: portcheck_connection_timeouts families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of timeout every: 10s units: % @@ -31,9 +31,9 @@ component: TCP endpoint template: portcheck_connection_fails families: * on: portcheck.status - class: Other + class: Errors + type: Other component: TCP endpoint - type: Errors lookup: average -5m unaligned percentage of no_connection,failed every: 10s units: % diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index b44a24c0b0..2929ee3d45 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -2,9 +2,9 @@ alarm: active_processes on: system.active_processes - class: System + class: Workload + type: System component: Processes - type: Workload hosts: * calc: $active * 100 / $pidmax units: % diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf 
index 7a3ebe1d26..f3abc588fe 100644 --- a/health/health.d/python.d.plugin.conf +++ b/health/health.d/python.d.plugin.conf @@ -3,9 +3,9 @@ template: python.d_job_last_collected_secs on: netdata.pythond_runtime - class: Netdata + class: Errors + type: Netdata component: python.d.plugin - type: Error module: * calc: $now - $last_collected_t units: seconds ago diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 450c548171..64c4a96c8c 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -3,9 +3,9 @@ alarm: used_ram_to_ignore on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux freebsd hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) @@ -15,9 +15,9 @@ component: Memory alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * # calc: $used * 100 / ($used + $cached + $free) @@ -32,9 +32,9 @@ component: Memory alarm: ram_available on: mem.available - class: System + class: Utilization + type: System component: Memory - type: Utilization os: linux hosts: * calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) @@ -61,9 +61,9 @@ component: Memory ## FreeBSD alarm: ram_in_use on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) @@ -77,9 +77,9 @@ component: Memory alarm: ram_available on: system.ram - class: System + class: Utilization + type: System component: Memory - type: Utilization os: freebsd hosts: * calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) diff
--git a/health/health.d/redis.conf b/health/health.d/redis.conf index d597f0be91..dfb771e8ca 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -2,9 +2,9 @@ template: redis_bgsave_broken families: * on: redis.bgsave_health - class: KV Storage + class: Errors + type: KV Storage component: Redis - type: Errors every: 10s crit: $rdb_last_bgsave_status != 0 units: ok/failed @@ -15,9 +15,9 @@ component: Redis template: redis_bgsave_slow families: * on: redis.bgsave_now - class: KV Storage + class: Latency + type: KV Storage component: Redis - type: Latency every: 10s warn: $rdb_bgsave_in_progress > 600 crit: $rdb_bgsave_in_progress > 1200 diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index 6b3ab9dc3a..14aa76b4c3 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -3,9 +3,9 @@ template: retroshare_dht_working on: retroshare.dht - class: Data Sharing + class: Utilization + type: Data Sharing component: Retroshare - type: Utilization calc: $dht_size_all units: peers every: 1m diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index b390840843..261fd48c6a 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -2,9 +2,9 @@ # Warn if a list keys operation is running. 
template: riakkv_list_keys_active on: riak.core.fsm_active - class: Database + class: Utilization + type: Database component: Riak KV - type: Utilization calc: $list_fsm_active units: state machines every: 10s @@ -17,9 +17,9 @@ component: Riak KV # KV GET template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s @@ -29,9 +29,9 @@ component: Riak KV template: riakkv_kv_get_slow on: riak.kv.latency.get - class: Database + class: Latency + type: Database component: Riak KV - type: Latency calc: $mean lookup: average -3m unaligned of time units: ms @@ -47,9 +47,9 @@ component: Riak KV # KV PUT template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put - class: Database + class: Latency + type: Database