summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorthiagoftsm <thiagoftsm@gmail.com>2019-09-03 18:30:26 +0000
committerGitHub <noreply@github.com>2019-09-03 18:30:26 +0000
commit0798426c7dbfc290b1f6f6e397063a09ee8a3447 (patch)
tree943caf03981652bdbd7cc3faead2205ee66ef748
parent4d975c271258eaaef6b4aceabb2ff5f3212be879 (diff)
Fix clear notification missing (#6638)
* alarm_clear: Mapping In this PR I mapped all the necessary steps to discover the solution for the ISSUE 6581 * alarm_clear: Documentation and fixes This commit fixes the problems that were present in Netdata and it also updates the documentation of the functions and Netdata. * alarm_clear: shell script The original implementation did not have a shell script, here I begin to fix this * alarm_clear: shell script It is necessary to verify why make is not producing the same binary as cmake and finish the changes in the script * alarm_clear: adjust in health.c I rewrote the health.c to be more readable, but I discovered the problem I had in the last few hours was due to a kernel update * alarm_clear: script changes In this commit I am bringing the final version of the script that tests the alarm repetition * alarm_clear: script fix and remove comments In this commit I am fixing the shellcheck errors and removing some debug messages that were present in the code while I was developing * alarm_clear: Format The health.c had wrong tabulation, this PR brings back the pattern of space as tab for this file * alarm_clear: Script The script was using killall, which is no longer present in all Linux distributions; this commit removes it and brings a new way to stop Netdata * alarm_clear: return to previous tabulation I am bringing back the old tabulation here and I will create a new PR exclusively for this * alarm_clear: Remove comments I am removing comments from this PR to keep the focus on the major problem * alarm_clear: Remove comments 2 I forgot one comment * alarm_clear: New variable I am appending a new variable in the check before the rebase, because the health.c changed in other file has a direct relationship with what I did here until now * alarm_clear: Fix clear repetition With this last commit, I am bringing a new way to raise the clear alarm, but it no longer repeats with this fix: it is displayed one time when it is cleared and it will display the message again if,
and only if, the alarm was raised.
-rw-r--r--.gitignore1
-rw-r--r--database/rrdcalc.h1
-rw-r--r--health/README.md3
-rw-r--r--health/health.c27
-rw-r--r--health/health.h1
-rw-r--r--health/health_json.c4
-rw-r--r--tests/Makefile.am3
-rw-r--r--tests/alarm_repetition/alarm.sh.in86
-rw-r--r--tests/alarm_repetition/netdata.conf_with_repetition57
-rw-r--r--tests/alarm_repetition/netdata.conf_without_repetition57
-rw-r--r--tests/alarm_repetition/ram_with_repetition.conf64
-rw-r--r--tests/alarm_repetition/ram_without_repetition.conf63
12 files changed, 361 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore
index 52a108f7fd..1222d46ea0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,6 +166,7 @@ gmon.txt
sitespeed-result/
tests/acls/acl.sh
tests/urls/request.sh
+tests/alarm_repetition/alarm.sh
# tests and temp files
python.d/python-modules-installer.sh
diff --git a/database/rrdcalc.h b/database/rrdcalc.h
index 3400f711cf..f0c34b5439 100644
--- a/database/rrdcalc.h
+++ b/database/rrdcalc.h
@@ -27,6 +27,7 @@
#define RRDCALC_FLAG_RUNNABLE 0x00000040
#define RRDCALC_FLAG_DISABLED 0x00000080
#define RRDCALC_FLAG_SILENCED 0x00000100
+#define RRDCALC_FLAG_RUN_ONCE 0x00000200
#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
diff --git a/health/README.md b/health/README.md
index 848c1bc3ba..35f037bce6 100644
--- a/health/README.md
+++ b/health/README.md
@@ -347,7 +347,8 @@ delay: [[[up U] [down D] multiplier M] max X]
#### Alarm line `repeat`
-Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration.
+Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. When either of these intervals is greater than 0, Netdata will activate the repeat notification for `CRITICAL`, `CLEAR` and `WARNING` messages.
+`
Format:
diff --git a/health/health.c b/health/health.c
index 1ee1a37226..1460b5ba48 100644
--- a/health/health.c
+++ b/health/health.c
@@ -216,9 +216,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have not executed this alarm notification in the past
// so, don't send CLEAR notifications
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
- debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
- goto done;
+ if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
+ debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
+ , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
}
}
}
@@ -872,10 +874,21 @@ void *health_main(void *ptr) {
for(rc = host->alarms; rc ; rc = rc->next) {
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc))) {
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->warn_repeat_every;
- else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
repeat_every = rc->crit_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
+ if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
+ if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
+ repeat_every = rc->crit_repeat_every;
+ } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
+ repeat_every = rc->warn_repeat_every;
+ }
+ }
+ }
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
rc->last_repeat = now;
@@ -890,6 +903,10 @@ void *health_main(void *ptr) {
)
);
ae->last_repeat = rc->last_repeat;
+ if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
+ ae->flags |= HEALTH_ENTRY_RUN_ONCE;
+ }
+ rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_log_free_one_nochecks_nounlink(ae);
diff --git a/health/health.h b/health/health.h
index 6c000bf458..8e4d0f7cb3 100644
--- a/health/health.h
+++ b/health/health.h
@@ -23,6 +23,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004
#define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008
#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010
+#define HEALTH_ENTRY_RUN_ONCE 0x00000020
#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
diff --git a/health/health_json.c b/health/health_json.c
index f6ff1b1a74..8a088d034a 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -43,6 +43,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R
"\t\t\"updates_id\": %u,\n"
"\t\t\"value_string\": \"%s\",\n"
"\t\t\"old_value_string\": \"%s\",\n"
+ "\t\t\"last_repeat\": \"%lu\",\n"
"\t\t\"silenced\": \"%s\",\n"
, host->hostname
, ae->unique_id
@@ -71,6 +72,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R
, ae->updates_id
, ae->new_value_string
, ae->old_value_string
+ , (unsigned long)ae->last_repeat
, (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
);
@@ -143,6 +145,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"warn_repeat_every\": \"%u\",\n"
"\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
+ "\t\t\t\"last_repeat\": \"%lu\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
, rc->name
@@ -170,6 +173,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->warn_repeat_every
, rc->crit_repeat_every
, value_string
+ , (unsigned long)rc->last_repeat
);
if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0aa5af247f..179b04864b 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -7,6 +7,7 @@ CLEANFILES = \
health_mgmtapi/health-cmdapi-test.sh \
acls/acl.sh \
urls/request.sh \
+ alarm_repetition/alarm.sh \
$(NULL)
include $(top_srcdir)/build/subst.inc
@@ -26,12 +27,14 @@ dist_noinst_DATA = \
health_mgmtapi/health-cmdapi-test.sh.in \
acls/acl.sh.in \
urls/request.sh.in \
+ alarm_repetition/alarm.sh.in \
$(NULL)
dist_plugins_SCRIPTS = \
health_mgmtapi/health-cmdapi-test.sh \
acls/acl.sh \
urls/request.sh \
+ alarm_repetition/alarm.sh \
$(NULL)
dist_noinst_SCRIPTS = \
diff --git a/tests/alarm_repetition/alarm.sh.in b/tests/alarm_repetition/alarm.sh.in
new file mode 100644
index 0000000000..8555e0a3c0
--- /dev/null
+++ b/tests/alarm_repetition/alarm.sh.in
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+#The health directory to put the alarms
+HEALTHDIR="@configdir_POST@/health.d/"
+
+#output directory
+OUTDIR="workdir/"
+
+#url to do download
+MURL="http://localhost:19999/api/v1/alarms?active"
+
+#error messages
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NOCOLOR='\033[0m'
+
+MYCDIR="$(pwd)"
+CONFFILE="$MYCDIR/netdata.conf"
+
+change_alarm_file() {
+ if [ -f "$1" ]; then
+ rm "$1"
+ fi
+
+ #copy keeping the permissions
+ cp -a "$2" "$3"
+}
+
+netdata_test_download() {
+ OPT="-e"
+ if [ "$3" == "I" ]; then
+ OPT="-v"
+ fi
+
+ grep "HTTP/1.1 200 OK" "$1" 2>/dev/null 1>/dev/null
+ TEST="$?"
+ if [ "$TEST" -ne "0" ]; then
+ echo -e "${RED} Error to get the alarms"
+ killall netdata
+ exit 1
+ fi
+
+ COUNT=$(grep -w "\"last_repeat\":" "$2" | grep -c "$OPT" "\"0\"")
+ if [ "$COUNT" -eq "0" ]; then
+ echo -e "${RED} Netdata gave an unexpected result when alarm repetition is $4"
+ killall netdata
+ exit 1
+ fi
+
+ echo -e "${GREEN} I got the expected result"
+}
+
+get_the_logs() {
+ curl -v -k --create-dirs -o "$OUTDIR/$1.out" "$MURL" 2> "$OUTDIR/$1.err"
+ netdata_test_download "$OUTDIR/$1.err" "$OUTDIR/$1.out" "$2" "$3"
+}
+
+process_data() {
+ SEC=120
+ netdata -c "$CONFFILE" -D &
+ NETDATAPID=$!
+ echo -e "${NOCOLOR}Sleeping during $SEC seconds to create alarm entries"
+ sleep $SEC
+ get_the_logs "$1" "$2" "$3"
+ kill $NETDATAPID
+}
+
+mkdir "$OUTDIR"
+CREATEDIR="$?"
+if [ "$CREATEDIR" -ne "0" ]; then
+ echo -e "${RED}Cannot create the output directory, it already exists. The test will overwrite previous results."
+fi
+
+change_alarm_file "./0" "ram_without_repetition.conf" "$HEALTHDIR/ram.conf"
+cp -a netdata.conf_without_repetition netdata.conf
+process_data "ram_without" "K" "not activated."
+rm netdata.conf
+
+change_alarm_file "$HEALTHDIR/ram.conf" "ram_with_repetition.conf" "$HEALTHDIR/ram.conf"
+cp -a netdata.conf_with_repetition netdata.conf
+process_data "ram_with" "I" "activated."
+rm netdata.conf
+
+echo -e "${GREEN} all the tests were sucessful"
+rm "$HEALTHDIR/ram.conf"
+rm -rf $OUTDIR
diff --git a/tests/alarm_repetition/netdata.conf_with_repetition b/tests/alarm_repetition/netdata.conf_with_repetition
new file mode 100644
index 0000000000..5e02288dbf
--- /dev/null
+++ b/tests/alarm_repetition/netdata.conf_with_repetition
@@ -0,0 +1,57 @@
+# netdata configuration
+#
+# You can download the latest version of this file, using:
+#
+# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+# or
+# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+#
+# You can uncomment and change any of the options below.
+# The value shown in the commented settings, is the default value.
+#
+
+# global netdata configuration
+
+[global]
+ #run as user = netdata
+
+[web]
+ #ssl key = /etc/netdata/ssl/key2048.pem
+ #ssl certificate = /etc/netdata/ssl/cert2048.pem
+ mode = static-threaded
+ # listen backlog = 4096
+ default port = 19999
+ #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock
+ # web files owner = netdata
+ # web files group = netdata
+ #accept a streaming request every seconds = 2
+
+[plugins]
+ proc = yes
+ diskspace = no
+ cgroups = no
+ tc = no
+ idlejitter = no
+ enable running new plugins = no
+ check for new plugins every = 60
+ go.d = no
+ node.d = no
+ charts.d = no
+ nfacct = no
+ python.d = no
+ apps = no
+ fping = no
+ cups = no
+
+[health]
+ enabled = yes
+ in memory max health log entries = 1000
+ default repeat warning = 4s
+ default repeat critical = 2s
+
+[registry]
+ enabled = yes
+ allow from = *
+
+[cloud]
+ cloud base url = https://netdata.cloud
diff --git a/tests/alarm_repetition/netdata.conf_without_repetition b/tests/alarm_repetition/netdata.conf_without_repetition
new file mode 100644
index 0000000000..80513ecb79
--- /dev/null
+++ b/tests/alarm_repetition/netdata.conf_without_repetition
@@ -0,0 +1,57 @@
+# netdata configuration
+#
+# You can download the latest version of this file, using:
+#
+# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+# or
+# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf
+#
+# You can uncomment and change any of the options below.
+# The value shown in the commented settings, is the default value.
+#
+
+# global netdata configuration
+
+[global]
+ #run as user = netdata
+
+[web]
+ #ssl key = /etc/netdata/ssl/key2048.pem
+ #ssl certificate = /etc/netdata/ssl/cert2048.pem
+ mode = static-threaded
+ # listen backlog = 4096
+ default port = 19999
+ #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock
+ # web files owner = netdata
+ # web files group = netdata
+ #accept a streaming request every seconds = 2
+
+[plugins]
+ proc = yes
+ diskspace = no
+ cgroups = no
+ tc = no
+ idlejitter = no
+ enable running new plugins = no
+ check for new plugins every = 60
+ go.d = no
+ node.d = no
+ charts.d = no
+ nfacct = no
+ python.d = no
+ apps = no
+ fping = no
+ cups = no
+
+[health]
+ enabled = yes
+ in memory max health log entries = 1000
+ #default repeat warning = 4s
+ #default repeat critical = 2s
+
+[registry]
+ enabled = yes
+ allow from = *
+
+[cloud]
+ cloud base url = https://netdata.cloud
diff --git a/tests/alarm_repetition/ram_with_repetition.conf b/tests/alarm_repetition/ram_with_repetition.conf
new file mode 100644
index 0000000000..c215a71d78
--- /dev/null
+++ b/tests/alarm_repetition/ram_with_repetition.conf
@@ -0,0 +1,64 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: used_ram_to_ignore
+ on: system.ram
+ os: linux freebsd
+ hosts: *
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+
+ alarm: ram_in_use
+ on: system.ram
+ os: linux
+ hosts: *
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ units: %
+ every: 1s
+ warn: $this > 1
+ crit: $this > 5
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM used
+ to: sysadmin #alarms
+ repeat: warning 30s critical 60s
+
+ alarm: ram_available
+ on: mem.available
+ os: linux
+ hosts: *
+ calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
+
+## FreeBSD
+alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+units: %
+every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
diff --git a/tests/alarm_repetition/ram_without_repetition.conf b/tests/alarm_repetition/ram_without_repetition.conf
new file mode 100644
index 0000000000..edfc492e0e
--- /dev/null
+++ b/tests/alarm_repetition/ram_without_repetition.conf
@@ -0,0 +1,63 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: used_ram_to_ignore
+ on: system.ram
+ os: linux freebsd
+ hosts: *
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+
+ alarm: ram_in_use
+ on: system.ram
+ os: linux
+ hosts: *
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ units: %
+ every: 1s
+ warn: $this > 1
+ crit: $this > 5
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM used
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: mem.available
+ os: linux
+ hosts: *
+ calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms
+
+## FreeBSD
+alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+units: %
+every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin #alarms
+
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin #alarms