From 8b78803a4edbb52764f5d2ade7c11c2d746545a1 Mon Sep 17 00:00:00 2001 From: Ilya Mashchenko Date: Fri, 2 Jul 2021 14:25:15 +0300 Subject: [docs] fix prometheus node cpu alert rule (#11309) --- backends/prometheus/README.md | 68 +++++++++++++++++++++--------------------- exporting/prometheus/README.md | 68 +++++++++++++++++++++--------------------- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/backends/prometheus/README.md b/backends/prometheus/README.md index 10275fa205..0346aa666a 100644 --- a/backends/prometheus/README.md +++ b/backends/prometheus/README.md @@ -131,40 +131,40 @@ add a _- "nodes.yml"_ entry under the _rule_files:_ section in the example prome ```yaml groups: -- name: nodes - - rules: - - alert: node_high_cpu_usage_70 - expr: avg(rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m])) by (job) > 70 - for: 1m - annotations: - description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.' - summary: CPU alert for container node '{{ $labels.job }}' - - - alert: node_high_memory_usage_70 - expr: 100 / sum(netdata_system_ram_MB_average) by (job) - * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30 - for: 1m - annotations: - description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.' - summary: Memory alert for container node '{{ $labels.job }}' - - - alert: node_low_root_filesystem_space_20 - expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job) - * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20 - for: 1m - annotations: - description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.' - summary: Root filesystem alert for container node '{{ $labels.job }}' - - - alert: node_root_filesystem_fill_rate_6h - expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0 - for: 1h - labels: - severity: critical - annotations: - description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h. - summary: Disk fill alert for Swarm node '{{ $labels.job }}' + - name: nodes + + rules: + - alert: node_high_cpu_usage_70 + expr: sum(sum_over_time(netdata_system_cpu_percentage_average{dimension=~"(user|system|softirq|irq|guest)"}[10m])) by (job) / sum(count_over_time(netdata_system_cpu_percentage_average{dimension="idle"}[10m])) by (job) > 70 + for: 1m + annotations: + description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.' + summary: CPU alert for container node '{{ $labels.job }}' + + - alert: node_high_memory_usage_70 + expr: 100 / sum(netdata_system_ram_MB_average) by (job) + * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30 + for: 1m + annotations: + description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.' + summary: Memory alert for container node '{{ $labels.job }}' + + - alert: node_low_root_filesystem_space_20 + expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job) + * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20 + for: 1m + annotations: + description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.' + summary: Root filesystem alert for container node '{{ $labels.job }}' + + - alert: node_root_filesystem_fill_rate_6h + expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0 + for: 1h + labels: + severity: critical + annotations: + description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h. + summary: Disk fill alert for Swarm node '{{ $labels.job }}' ``` #### Install prometheus.service diff --git a/exporting/prometheus/README.md b/exporting/prometheus/README.md index d718a366eb..27f4b10d44 100644 --- a/exporting/prometheus/README.md +++ b/exporting/prometheus/README.md @@ -134,40 +134,40 @@ add a _- "nodes.yml"_ entry under the _rule_files:_ section in the example prome ```yaml groups: -- name: nodes - - rules: - - alert: node_high_cpu_usage_70 - expr: avg(rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m])) by (job) > 70 - for: 1m - annotations: - description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.' - summary: CPU alert for container node '{{ $labels.job }}' - - - alert: node_high_memory_usage_70 - expr: 100 / sum(netdata_system_ram_MB_average) by (job) - * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30 - for: 1m - annotations: - description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.' - summary: Memory alert for container node '{{ $labels.job }}' - - - alert: node_low_root_filesystem_space_20 - expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job) - * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20 - for: 1m - annotations: - description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.' - summary: Root filesystem alert for container node '{{ $labels.job }}' - - - alert: node_root_filesystem_fill_rate_6h - expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0 - for: 1h - labels: - severity: critical - annotations: - description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h. - summary: Disk fill alert for Swarm node '{{ $labels.job }}' + - name: nodes + + rules: + - alert: node_high_cpu_usage_70 + expr: sum(sum_over_time(netdata_system_cpu_percentage_average{dimension=~"(user|system|softirq|irq|guest)"}[10m])) by (job) / sum(count_over_time(netdata_system_cpu_percentage_average{dimension="idle"}[10m])) by (job) > 70 + for: 1m + annotations: + description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.' + summary: CPU alert for container node '{{ $labels.job }}' + + - alert: node_high_memory_usage_70 + expr: 100 / sum(netdata_system_ram_MB_average) by (job) + * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30 + for: 1m + annotations: + description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.' + summary: Memory alert for container node '{{ $labels.job }}' + + - alert: node_low_root_filesystem_space_20 + expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job) + * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20 + for: 1m + annotations: + description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.' + summary: Root filesystem alert for container node '{{ $labels.job }}' + + - alert: node_root_filesystem_fill_rate_6h + expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0 + for: 1h + labels: + severity: critical + annotations: + description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h. + summary: Disk fill alert for Swarm node '{{ $labels.job }}' ``` #### Install prometheus.service -- cgit v1.2.3