summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--database/rrdcalc.h1
-rw-r--r--health/README.md3
-rw-r--r--health/health.c27
-rw-r--r--health/health.h1
-rw-r--r--health/health_json.c4
-rw-r--r--tests/Makefile.am3
-rw-r--r--tests/alarm_repetition/alarm.sh.in86
-rw-r--r--tests/alarm_repetition/netdata.conf_with_repetition57
-rw-r--r--tests/alarm_repetition/netdata.conf_without_repetition57
-rw-r--r--tests/alarm_repetition/ram_with_repetition.conf64
-rw-r--r--tests/alarm_repetition/ram_without_repetition.conf63
12 files changed, 361 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore
index 52a108f7fd..1222d46ea0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,6 +166,7 @@ gmon.txt
sitespeed-result/
tests/acls/acl.sh
tests/urls/request.sh
+tests/alarm_repetition/alarm.sh
# tests and temp files
python.d/python-modules-installer.sh
diff --git a/database/rrdcalc.h b/database/rrdcalc.h
index 3400f711cf..f0c34b5439 100644
--- a/database/rrdcalc.h
+++ b/database/rrdcalc.h
@@ -27,6 +27,7 @@
#define RRDCALC_FLAG_RUNNABLE 0x00000040
#define RRDCALC_FLAG_DISABLED 0x00000080
#define RRDCALC_FLAG_SILENCED 0x00000100
+#define RRDCALC_FLAG_RUN_ONCE 0x00000200
#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
diff --git a/health/README.md b/health/README.md
index 848c1bc3ba..35f037bce6 100644
--- a/health/README.md
+++ b/health/README.md
@@ -347,7 +347,8 @@ delay: [[[up U] [down D] multiplier M] max X]
#### Alarm line `repeat`
-Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration.
+Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION`, which can be found in the health stock configuration. When one of these intervals is bigger than 0, Netdata will activate the repeat notification for `CRITICAL`, `CLEAR` and `WARNING` messages.
+
Format:
diff --git a/health/health.c b/health/health.c
index 1ee1a37226..1460b5ba48 100644
--- a/health/health.c
+++ b/health/health.c
@@ -216,9 +216,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have not executed this alarm notification in the past
// so, don't send CLEAR notifications
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
- debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- goto done;
+ if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
+ debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
+ , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
}
}
}
@@ -872,10 +874,21 @@ void *health_main(void *ptr) {
for(rc = host->alarms; rc ; rc = rc->next) {
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc))) {
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->warn_repeat_every;
- else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->crit_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
+ if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
+ if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
+ repeat_every = rc->crit_repeat_every;
+ } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
+ repeat_every = rc->warn_repeat_every;
+ }
+ }
+ }
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
rc->last_repeat = now;
@@ -890,6 +903,10 @@ void *health_main(void *ptr) {
)
);
ae->last_repeat = rc->last_repeat;
+ if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
+ ae->flags |= HEALTH_ENTRY_RUN_ONCE;
+ }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_log_free_one_nochecks_nounlink(ae);
diff --git a/health/health.h b/health/health.h
index 6c000bf458..8e4d0f7cb3 100644
--- a/health/health.h
+++ b/health/health.h
@@ -23,6 +23,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004
#define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008
#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010
+#define HEALTH_ENTRY_RUN_ONCE 0x00000020
#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
diff --git a/health/health_json.c b/health/health_json.c
index f6ff1b1a74..8a088d034a 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -43,6 +43,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R
"\t\t\"updates_id\": %u,\n"
"\t\t\"value_string\": \"%s\",\n"
"\t\t\"old_value_string\": \"%s\",\n"
+ "\t\t\"last_repeat\": \"%lu\",\n"
"\t\t\"silenced\": \"%s\",\n"
, host->hostname
, ae->unique_id
@@ -71,6 +72,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R
, ae->updates_id
, ae->new_value_string
, ae->old_value_string
+ , (unsigned long)ae->last_repeat
, (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
);
@@ -143,6 +145,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"warn_repeat_every\": \"%u\",\n"
"\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
+ "\t\t\t\"last_repeat\": \"%lu\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
, rc->name
@@ -170,6 +173,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->warn_repeat_every
, rc->crit_repeat_every
, value_string
+ , (unsigned long)rc->last_repeat
);
if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0aa5af247f..179b04864b 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -7,6 +7,7 @@ CLEANFILES = \
health_mgmtapi/health-cmdapi-test.sh \
acls/acl.sh \
urls/request.sh \
+ alarm_repetition/alarm.sh \
$(NULL)
include $(top_srcdir)/build/subst.inc
@@ -26,12 +27,14 @@ dist_noinst_DATA = \
health_mgmtapi/health-cmdapi-test.sh.in \
acls/acl.sh.in \
urls/request.sh.in \
+ alarm_repetition/alarm.sh.in \
$(NULL)
dist_plugins_SCRIPTS = \
health_mgmtapi/health-cmdapi-test.sh \
acls/acl.sh \
urls/request.sh \
+ alarm_repetition/alarm.sh \
$(NULL)
dist_noinst_SCRIPTS = \
diff --git a/tests/alarm_repetition/alarm.sh.in b/tests/alarm_repetition/alarm.sh.in
new file mode 100644
index 0000000000..8555e0a3c0
--- /dev/null
+++ b/tests/alarm_repetition/alarm.sh.in
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+#The health directory to put the alarms
+HEALTHDIR="@configdir_POST@/health.d/"
+
+#output directory
+OUTDIR="workdir/"
+
+#URL queried to download the list of active alarms
+MURL="http://localhost:19999/api/v1/alarms?active"
+
+#ANSI colour codes used when printing status messages
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NOCOLOR='\033[0m'
+
+MYCDIR="$(pwd)"
+CONFFILE="$MYCDIR/netdata.conf"
+
+change_alarm_file() {
+ if [ -f "$1" ]; then
+ rm "$1"
+ fi
+
+ #copy keeping the permissions
+ cp -a "$2" "$3"
+}
+
+netdata_test_download() {
+ OPT="-e"
+ if [ "$3" == "I" ]; then
+ OPT="-v"
+ fi
+
+ grep "HTTP/1.1 200 OK" "$1" 2>/dev/null 1>/dev/null
+ TEST="$?"
+ if [ "$TEST" -ne "0" ]; then
+ echo -e "${RED} Error to get the alarms"
+ killall netdata
+ exit 1
+ fi
+
+ COUNT=$(grep -w "\"last_repeat\":" "$2" | grep -c "$OPT" "\"0\"")
+ if [ "$COUNT" -eq "0" ]; then
+ echo -e "${RED} Netdata gave an unexpected result when alarm repetition is $4"
+ killall netdata
+ exit 1
+ fi
+
+ echo -e "${GREEN} I got the expected result"
+}
+
+# get_the_logs NAME MODE LABEL
+# Fetches $MURL with curl, storing the response in $OUTDIR/NAME.out and
+# curl's stderr (verbose trace) in $OUTDIR/NAME.err, then passes both files
+# together with MODE and LABEL to netdata_test_download.
+get_the_logs() {
+    local body_file="$OUTDIR/$1.out"
+    local trace_file="$OUTDIR/$1.err"
+
+    curl -v -k --create-dirs -o "$body_file" "$MURL" 2> "$trace_file"
+    netdata_test_download "$trace_file" "$body_file" "$2" "$3"
+}
+
+process_data() {
+ SEC=120
+ netdata -c "$CONFFILE" -D &
+ NETDATAPID=$!
+ echo -e "${NOCOLOR}Sleeping during $SEC seconds to create alarm entries"
+ sleep $SEC
+ get_the_logs "$1" "$2" "$3"
+ kill $NETDATAPID
+}
+
+# Prepare the output directory; a failure is reported but is not fatal.
+mkdir "$OUTDIR"
+CREATEDIR="$?"
+if [ "$CREATEDIR" -ne "0" ]; then
+    echo -e "${RED}Cannot create the output directory, it already exists. The test will overwrite previous results.${NOCOLOR}"
+fi
+
+# Pass 1: repetition disabled, expect last_repeat entries equal to "0".
+# NOTE(review): "./0" looks like a placeholder for "nothing to remove" - confirm.
+change_alarm_file "./0" "ram_without_repetition.conf" "$HEALTHDIR/ram.conf"
+cp -a netdata.conf_without_repetition netdata.conf
+process_data "ram_without" "K" "not activated."
+rm netdata.conf
+
+# Pass 2: repetition enabled, expect last_repeat entries different from "0".
+change_alarm_file "$HEALTHDIR/ram.conf" "ram_with_repetition.conf" "$HEALTHDIR/ram.conf"
+cp -a netdata.conf_with_repetition netdata.conf
+process_data "ram_with" "I" "activated."
+rm netdata.conf
+
+# Report success (resetting the terminal colour) and clean up.
+echo -e "${GREEN} all the tests were successful${NOCOLOR}"
+rm "$HEALTHDIR/ram.conf"
+rm -rf "$OUTDIR"
diff --git a/tests/alarm_repetition/netdata.conf_with_repetition b/tests/alarm_repetition/netdata.conf_with_repetition
new file mode 100644
index 0000000000..5e02288dbf
--- /dev/null
+++ b/tests/alarm_repetition/netdata.conf_with_repetition
@@ -0,0 +1,57 @@
+# netdata configuration
+#
+# You can download the latest version of this file, using:
+#
+# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+# or
+# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+#
+# You can uncomment and change any of the options below.
+# The value shown in the commented settings, is the default value.
+#
+
+# global netdata configuration
+
+[global]
+ #run as user = netdata
+
+[web]
+ #ssl key = /etc/netdata/ssl/key2048.pem
+ #ssl certificate = /etc/netdata/ssl/cert2048.pem
+ mode = static-threaded
+ # listen backlog = 4096
+ default port = 19999
+ #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock
+ # web files owner = netdata
+ # web files group = netdata
+ #accept a streaming request every seconds = 2
+
+[plugins]
+ proc = yes
+ diskspace = no
+ cgroups = no
+ tc = no
+ idlejitter = no
+ enable running new plugins = no
+ check for new plugins every = 60
+ go.d = no
+ node.d = no
+ charts.d = no
+ nfacct = no
+ python.d = no
+ apps = no
+ fping = no
+ cups = no
+
+[health]
+ enabled = yes
+ in memory max health log entries = 1000
+ default repeat warning = 4s
+ default repeat critical = 2s
+
+[registry]
+ enabled = yes
+ allow from = *
+
+[cloud]
+ cloud base url = https://netdata.cloud
diff --git a/tests/alarm_repetition/netdata.conf_without_repetition b/tests/alarm_repetition/netdata.conf_without_repetition
new file mode 100644
index 0000000000..80513ecb79
--- /dev/null
+++ b/tests/alarm_repetition/netdata.conf_without_repetition
@@ -0,0 +1,57 @@
+# netdata configuration
+#
+# You can download the latest version of this file, using:
+#
+# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+# or
+# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+#
+# You can uncomment and change any of the options below.
+# The value shown in the commented settings, is the default value.
+#
+
+# global netdata configuration
+
+[global]
+ #run as user = netdata
+
+[web]
+ #ssl key = /etc/netdata/ssl/key2048.pem
+ #ssl certificate = /etc/netdata/ssl/cert2048.pem
+ mode = static-threaded
+ # listen backlog = 4096
+ default port = 19999
+ #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock
+ # web files owner = netdata
+ # web files group = netdata
+ #accept a streaming request every seconds = 2
+
+[plugins]
+ proc = yes
+ diskspace = no
+ cgroups = no
+ tc = no
+ idlejitter = no
+ enable running new plugins = no
+ check for new plugins every = 60
+ go.d = no
+ node.d = no
+ charts.d = no
+ nfacct = no
+ python.d = no
+ apps = no
+ fping = no
+ cups = no
+
+[health]
+ enabled = yes
+ in memory max health log entries = 1000
+ #default repeat warning = 4s
+ #default repeat critical = 2s
+
+[registry]
+ enabled = yes
+ allow from = *
+
+[cloud]
+ cloud base url = https://netdata.cloud
diff --git a/tests/alarm_repetition/ram_with_repetition.conf b/tests/alarm_repetition/ram_with_repetition.conf
new file mode 100644
index 0000000000..c215a71d78
--- /dev/null
+++ b/tests/alarm_repetition/ram_with_repetition.conf
@@ -0,0 +1,64 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: used_ram_to_ignore
+ on: system.ram
+ os: linux freebsd
+ hosts: *
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+
+ alarm: ram_in_use
+ on: system.ram
+ os: linux
+ hosts: *
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ units: %
+ every: 1s
+ warn: $this > 1
+ crit: $this > 5
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM used
+ to: sysadmin #alarms
+ repeat: warning 30s critical 60s
+
+ alarm: ram_available
+ on: mem.available
+ os: linux
+ hosts: *
+ calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
+
+## FreeBSD
+alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+units: %
+every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
diff --git a/tests/alarm_repetition/ram_without_repetition.conf b/tests/alarm_repetition/ram_without_repetition.conf
new file mode 100644
index 0000000000..edfc492e0e
--- /dev/null
+++ b/tests/alarm_repetition/ram_without_repetition.conf
@@ -0,0 +1,63 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: used_ram_to_ignore
+ on: system.ram
+ os: linux freebsd
+ hosts: *
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+
+ alarm: ram_in_use
+ on: system.ram
+ os: linux
+ hosts: *
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ units: %
+ every: 1s
+ warn: $this > 1
+ crit: $this > 5
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM used
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: mem.available
+ os: linux
+ hosts: *
+ calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
+
+## FreeBSD
+alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+units: %
+every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms