Bugfix on alerts generation for yamls (#15420)

author: Fotis Voutsas <fotis@netdata.cloud> 2023-07-18 11:00:23 +0300
committer: GitHub <noreply@github.com> 2023-07-18 11:00:23 +0300
commit: 25586826d42107952ead3348dbce0c0223adf61d (patch)
tree: d6af5389beeed5e459b463e2258ce699159bbd7c /collectors/proc.plugin
parent: 489c3f5c66be6d3d63c3975bdbf6add861141998 (diff)
1 files changed, 117 insertions, 10 deletions
diff --git a/collectors/proc.plugin/multi_metadata.yaml b/collectors/proc.plugin/multi_metadata.yaml
index 61437dee47..3ca89969cf 100644
--- a/collectors/proc.plugin/multi_metadata.yaml
+++ b/collectors/proc.plugin/multi_metadata.yaml
@@ -71,6 +71,11 @@ modules:
         metric: system.cpu
         info: average CPU steal time over the last 20 minutes
         os: "linux"
+      - name: 10min_cpu_usage
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
+        metric: system.cpu
+        info: average CPU utilization over the last 10 minutes (excluding nice)
+        os: "freebsd"
     metrics:
       folding:
         title: Metrics
@@ -221,7 +226,12 @@ modules:
     troubleshooting:
       problems:
         list: []
-    alerts: []
+    alerts:
+      - name: lowest_entropy
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/entropy.conf
+        metric: system.entropy
+        info: minimum number of entries in the random numbers pool in the last 5 minutes
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -659,6 +669,15 @@ modules:
         metric: system.load
         info: system five-minute load average
         os: "linux"
+      - name: load_average_1
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
+        metric: system.load
+        info: system one-minute load average
+        os: "linux"
+      - name: active_processes
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf
+        metric: system.active_processes
+        info: system process IDs (PID) space utilization
     metrics:
       folding:
         title: Metrics
@@ -986,6 +1005,12 @@ modules:
           or net.core.netdev_budget_usecs with work remaining over the last minute
           (this can be a cause for dropped packets)
         os: "linux"
+      - name: 10min_netisr_backlog_exceeded
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
+        metric: system.softnet_stat
+        info: average number of drops in the last minute due to exceeding sysctl net.route.netisr_maxqlen
+          (this can be a cause for dropped packets)
+        os: "freebsd"
     metrics:
       folding:
         title: Metrics
@@ -1092,6 +1117,22 @@ modules:
         info: percentage of estimated amount of RAM available for userspace processes,
           without causing swapping
         os: "linux"
+      - name: ram_available
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
+        metric: mem.available
+        info: percentage of estimated amount of RAM available for userspace processes,
+          without causing swapping
+        os: "freebsd"
+      - name: used_swap
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf
+        metric: system.swap
+        info: swap memory utilization
+        os: "linux freebsd"
+      - name: 1hour_memory_hw_corrupted
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
+        metric: mem.hwcorrupt
+        info: amount of memory corrupted due to a hardware failure
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -1677,6 +1718,11 @@ modules:
         metric: system.ipc_semaphores
         info: IPC semaphore utilization
         os: "linux"
+      - name: semaphore_arrays_used
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf
+        metric: system.ipc_semaphore_arrays
+        info: IPC semaphore arrays utilization
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -1779,19 +1825,28 @@ modules:
       problems:
         list: []
     alerts:
+      - name: 10min_disk_backlog
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
+        metric: disk.backlog
+        info: average backlog size of the ${label:device} disk over the last 10 minutes
+        os: "linux"
       - name: 10min_disk_utilization
         link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
         metric: disk.util
         info: average percentage of time ${label:device} disk was busy over the last
           10 minutes
         os: "linux freebsd"
+      - name: bcache_cache_dirty
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf
+        metric: disk.bcache_cache_alloc
+        info: percentage of cache space used for dirty data and metadata (this usually
+          means your SSD cache is too small)
       - name: bcache_cache_errors
         link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf
         metric: disk.bcache_cache_read_races
         info: number of times data was read from the cache, the bucket was reused
           and invalidated in the last 10 minutes (when this occurs the data is reread
           from the backing device)
-        os: "freebsd"
     metrics:
       folding:
         title: Metrics
@@ -2051,19 +2106,20 @@ modules:
         link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
         metric: md.disks
         info: number of seconds since the last successful data collection
-        os: "*"
       - name: mdstat_disks
         link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
         metric: md.disks
         info: number of devices in the down state for the ${label:device} ${label:raid_level}
           array. Any number > 0 indicates that the array is degraded.
-        os: "*"
       - name: mdstat_mismatch_cnt
         link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
         metric: md.mismatch_cnt
         info: number of unsynchronized blocks for the ${label:device} ${label:raid_level}
           array
-        os: "*"
+      - name: mdstat_nonredundant_last_collected
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
+        metric: md.nonredundant
+        info: number of seconds since the last successful data collection
     metrics:
       folding:
         title: Metrics
@@ -2231,6 +2287,13 @@ modules:
         info: average number of packets received by the network interface ${label:device}
           over the last minute
         os: "linux freebsd"
+      - name: 10s_received_packets_storm
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
+        metric: net.packets
+        info: ratio of average number of received packets for the network interface
+          ${label:device} over the last 10 seconds, compared to the rate over the
+          last minute
+        os: "linux freebsd"
       - name: interface_inbound_errors
         link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
         metric: net.errors
@@ -2708,6 +2771,11 @@ modules:
         info: average number of dropped packets in the TCP accept queue over the last
           minute
         os: "linux"
+      - name: tcp_connections
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf
+        metric: ipv4.tcpsock
+        info: IPv4 TCP connections utilization
+        os: "linux"
       - name: 1m_ipv4_tcp_resets_sent
         link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
         metric: ipv4.tcphandshake
@@ -2725,11 +2793,23 @@ modules:
         metric: ipv4.tcphandshake
         info: average number of received TCP RESETS over the last minute
         os: "linux freebsd"
+      - name: 10s_ipv4_tcp_resets_received
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
+        metric: ipv4.tcphandshake
+        info: average number of received TCP RESETS over the last 10 seconds. This
+          can be an indication that a service this host needs has crashed. Netdata
+          will not send a clear notification for this alarm.
+        os: "linux freebsd"
       - name: 1m_ipv4_udp_receive_buffer_errors
         link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
         metric: ipv4.udperrors
         info: average number of UDP receive buffer errors over the last minute
         os: "linux freebsd"
+      - name: 1m_ipv4_udp_send_buffer_errors
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
+        metric: ipv4.udperrors
+        info: average number of UDP send buffer errors over the last minute
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -3264,7 +3344,17 @@ modules:
     troubleshooting:
       problems:
         list: []
-    alerts: []
+    alerts:
+      - name: tcp_orphans
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_orphans.conf
+        metric: ipv4.sockstat_tcp_sockets
+        info: orphan IPv4 TCP sockets utilization
+        os: "linux"
+      - name: tcp_memory
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_mem.conf
+        metric: ipv4.sockstat_tcp_mem
+        info: TCP memory utilization
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -3902,7 +3992,12 @@ modules:
     troubleshooting:
       problems:
         list: []
-    alerts: []
+    alerts:
+      - name: netfilter_conntrack_full
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/netfilter.conf
+        metric: netfilter.conntrack_sockets
+        info: netfilter connection tracker table size utilization
+        os: "linux"
     metrics:
       folding:
         title: Metrics
@@ -4106,7 +4201,10 @@ modules:
         link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
         metric: zfspool.state
         info: ZFS pool ${label:pool} state is degraded
-        os: "*"
+      - name: zfs_pool_state_crit
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
+        metric: zfspool.state
+        info: ZFS pool ${label:pool} state is faulted or unavail
     metrics:
       folding:
         title: Metrics
@@ -4191,7 +4289,6 @@ modules:
         link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
         metric: zfs.memory_ops
         info: number of times ZFS had to limit the ARC growth in the last 10 minutes
-        os: "*"
     metrics:
       folding:
         title: Metrics
@@ -4490,6 +4587,11 @@ modules:
         metric: btrfs.device_errors
         info: number of encountered BTRFS corruption errors
         os: "*"
+      - name: btrfs_device_generation_errors
+        link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
+        metric: btrfs.device_errors
+        info: number of encountered BTRFS generation errors
+        os: "*"
     metrics:
       folding:
         title: Metrics
@@ -4632,7 +4734,12 @@ modules:
     troubleshooting:
       problems:
         list: []
-    alerts: []
+    alerts:
+      - name: linux_power_supply_capacity
+        link: |-
+          https://github.com/netdata/netdata/blob/master/health/health.d/linux_power_supply.conf
+        metric: powersupply.capacity
+        info: percentage of remaining power supply capacity
     metrics:
       folding:
         title: Metrics
author	Fotis Voutsas <fotis@netdata.cloud>	2023-07-18 11:00:23 +0300
committer	GitHub <noreply@github.com>	2023-07-18 11:00:23 +0300
commit	25586826d42107952ead3348dbce0c0223adf61d (patch)
tree	d6af5389beeed5e459b463e2258ce699159bbd7c /collectors/proc.plugin
parent	489c3f5c66be6d3d63c3975bdbf6add861141998 (diff)