summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Ilya Mashchenko <ilya@netdata.cloud> 2021-07-02 14:25:15 +0300
committer: GitHub <noreply@github.com> 2021-07-02 14:25:15 +0300
commit: 8b78803a4edbb52764f5d2ade7c11c2d746545a1 (patch)
tree: 91262481b816e206e724bfd42ce1d2bd1f45a289
parent: 1bc24efdcabbb33055d4f12148ecca790760166c (diff)
[docs] fix prometheus node cpu alert rule (#11309)
-rw-r--r-- backends/prometheus/README.md 68
-rw-r--r-- exporting/prometheus/README.md 68
2 files changed, 68 insertions, 68 deletions
diff --git a/backends/prometheus/README.md b/backends/prometheus/README.md
index 10275fa205..0346aa666a 100644
--- a/backends/prometheus/README.md
+++ b/backends/prometheus/README.md
@@ -131,40 +131,40 @@ add a _- "nodes.yml"_ entry under the _rule_files:_ section in the example prome
```yaml
groups:
-- name: nodes
-
- rules:
- - alert: node_high_cpu_usage_70
- expr: avg(rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m])) by (job) > 70
- for: 1m
- annotations:
- description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.'
- summary: CPU alert for container node '{{ $labels.job }}'
-
- - alert: node_high_memory_usage_70
- expr: 100 / sum(netdata_system_ram_MB_average) by (job)
- * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30
- for: 1m
- annotations:
- description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.'
- summary: Memory alert for container node '{{ $labels.job }}'
-
- - alert: node_low_root_filesystem_space_20
- expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job)
- * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20
- for: 1m
- annotations:
- description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.'
- summary: Root filesystem alert for container node '{{ $labels.job }}'
-
- - alert: node_root_filesystem_fill_rate_6h
- expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0
- for: 1h
- labels:
- severity: critical
- annotations:
- description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h.
- summary: Disk fill alert for Swarm node '{{ $labels.job }}'
+ - name: nodes
+
+ rules:
+ - alert: node_high_cpu_usage_70
+ expr: sum(sum_over_time(netdata_system_cpu_percentage_average{dimension=~"(user|system|softirq|irq|guest)"}[10m])) by (job) / sum(count_over_time(netdata_system_cpu_percentage_average{dimension="idle"}[10m])) by (job) > 70
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.'
+ summary: CPU alert for container node '{{ $labels.job }}'
+
+ - alert: node_high_memory_usage_70
+ expr: 100 / sum(netdata_system_ram_MB_average) by (job)
+ * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.'
+ summary: Memory alert for container node '{{ $labels.job }}'
+
+ - alert: node_low_root_filesystem_space_20
+ expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job)
+ * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.'
+ summary: Root filesystem alert for container node '{{ $labels.job }}'
+
+ - alert: node_root_filesystem_fill_rate_6h
+ expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0
+ for: 1h
+ labels:
+ severity: critical
+ annotations:
+ description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h.
+ summary: Disk fill alert for Swarm node '{{ $labels.job }}'
```
#### Install prometheus.service
diff --git a/exporting/prometheus/README.md b/exporting/prometheus/README.md
index d718a366eb..27f4b10d44 100644
--- a/exporting/prometheus/README.md
+++ b/exporting/prometheus/README.md
@@ -134,40 +134,40 @@ add a _- "nodes.yml"_ entry under the _rule_files:_ section in the example prome
```yaml
groups:
-- name: nodes
-
- rules:
- - alert: node_high_cpu_usage_70
- expr: avg(rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m])) by (job) > 70
- for: 1m
- annotations:
- description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.'
- summary: CPU alert for container node '{{ $labels.job }}'
-
- - alert: node_high_memory_usage_70
- expr: 100 / sum(netdata_system_ram_MB_average) by (job)
- * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30
- for: 1m
- annotations:
- description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.'
- summary: Memory alert for container node '{{ $labels.job }}'
-
- - alert: node_low_root_filesystem_space_20
- expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job)
- * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20
- for: 1m
- annotations:
- description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.'
- summary: Root filesystem alert for container node '{{ $labels.job }}'
-
- - alert: node_root_filesystem_fill_rate_6h
- expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0
- for: 1h
- labels:
- severity: critical
- annotations:
- description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h.
- summary: Disk fill alert for Swarm node '{{ $labels.job }}'
+ - name: nodes
+
+ rules:
+ - alert: node_high_cpu_usage_70
+ expr: sum(sum_over_time(netdata_system_cpu_percentage_average{dimension=~"(user|system|softirq|irq|guest)"}[10m])) by (job) / sum(count_over_time(netdata_system_cpu_percentage_average{dimension="idle"}[10m])) by (job) > 70
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.'
+ summary: CPU alert for container node '{{ $labels.job }}'
+
+ - alert: node_high_memory_usage_70
+ expr: 100 / sum(netdata_system_ram_MB_average) by (job)
+ * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.'
+ summary: Memory alert for container node '{{ $labels.job }}'
+
+ - alert: node_low_root_filesystem_space_20
+ expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job)
+ * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20
+ for: 1m
+ annotations:
+ description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.'
+ summary: Root filesystem alert for container node '{{ $labels.job }}'
+
+ - alert: node_root_filesystem_fill_rate_6h
+ expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0
+ for: 1h
+ labels:
+ severity: critical
+ annotations:
+ description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h.
+ summary: Disk fill alert for Swarm node '{{ $labels.job }}'
```
#### Install prometheus.service