diff options
Diffstat (limited to 'health')
68 files changed, 7988 insertions, 0 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
new file mode 100644
index 0000000000..6f09b2e25f
--- /dev/null
+++ b/health/Makefile.am
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+AUTOMAKE_OPTIONS = subdir-objects
+MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
+
+# build products removed by "make clean"
+CLEANFILES = \
+    alarm-notify.sh \
+    $(NULL)
+
+# presumably provides the rule that turns *.in templates into their outputs
+include $(top_srcdir)/build/subst.inc
+SUFFIXES = .in
+
+dist_libconfig_DATA = \
+    health_alarm_notify.conf \
+    health_email_recipients.conf \
+    $(NULL)
+
+# alarm-notify.sh is a build product (see CLEANFILES; its template
+# alarm-notify.sh.in is distributed below), so it is installed but must
+# not be shipped in the distribution tarball.
+nodist_plugins_SCRIPTS = \
+    alarm-notify.sh \
+    $(NULL)
+
+dist_plugins_SCRIPTS = \
+    alarm-email.sh \
+    alarm-test.sh \
+    $(NULL)
+
+# distributed, but not installed
+dist_noinst_DATA = \
+    alarm-notify.sh.in \
+    README.md \
+    $(NULL)
+
+# stock health configuration, installed under $(libconfigdir)/health.d
+healthconfigdir=$(libconfigdir)/health.d
+dist_healthconfig_DATA = \
+    health.d/apache.conf \
+    health.d/apcupsd.conf \
+    health.d/backend.conf \
+    health.d/bcache.conf \
+    health.d/beanstalkd.conf \
+    health.d/bind_rndc.conf \
+    health.d/boinc.conf \
+    health.d/btrfs.conf \
+    health.d/ceph.conf \
+    health.d/cpu.conf \
+    health.d/couchdb.conf \
+    health.d/disks.conf \
+    health.d/dockerd.conf \
+    health.d/elasticsearch.conf \
+    health.d/entropy.conf \
+    health.d/fping.conf \
+    health.d/fronius.conf \
+    health.d/haproxy.conf \
+    health.d/httpcheck.conf \
+    health.d/ipc.conf \
+    health.d/ipfs.conf \
+    health.d/ipmi.conf \
+    health.d/isc_dhcpd.conf \
+    health.d/lighttpd.conf \
+    health.d/linux_power_supply.conf \
+    health.d/load.conf \
+    health.d/mdstat.conf \
+    health.d/megacli.conf \
+    health.d/memcached.conf \
+    health.d/memory.conf \
+    health.d/mongodb.conf \
+    health.d/mysql.conf \
+    health.d/named.conf \
+    health.d/net.conf \
+    health.d/netfilter.conf \
+    health.d/nginx.conf \
+    health.d/nginx_plus.conf \
+    health.d/portcheck.conf \
+    health.d/postgres.conf \
+    health.d/qos.conf \
+    health.d/ram.conf \
+    health.d/redis.conf \
+    health.d/retroshare.conf \
+    health.d/softnet.conf \
+    health.d/squid.conf \
+    health.d/stiebeleltron.conf \
+    health.d/swap.conf \
+    health.d/tcp_conn.conf \
+    health.d/tcp_listen.conf \
+    health.d/tcp_mem.conf \
+    health.d/tcp_orphans.conf \
+    health.d/tcp_resets.conf \
+    health.d/udp_errors.conf \
+    health.d/varnish.conf \
+    health.d/web_log.conf \
+    health.d/zfs.conf \
+    $(NULL)
diff --git a/health/README.md b/health/README.md
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/health/README.md
diff --git a/health/alarm-email.sh b/health/alarm-email.sh
new file mode 100755
index 0000000000..69c4c3f8df
--- /dev/null
+++ b/health/alarm-email.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# OBSOLETE - REPLACED WITH
+# alarm-notify.sh
+
+# Hand over to alarm-notify.sh located next to this script, forwarding all
+# arguments. The path is quoted so that directories containing whitespace
+# still work; exec makes the exit status that of alarm-notify.sh.
+exec "${0/alarm-email.sh/alarm-notify.sh}" "${@}"
diff --git a/health/alarm-notify.sh.in b/health/alarm-notify.sh.in
new file mode 100755
index 0000000000..4aef3a521a
--- /dev/null
+++ b/health/alarm-notify.sh.in
@@ -0,0 +1,2378 @@
+#!/usr/bin/env bash
+
+# netdata
+# real-time performance and health monitoring, done right!
+# (C) 2017 Costa Tsaousis <costa@tsaousis.gr>
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# Script to send alarm notifications for netdata
+#
+# Features:
+# - multiple notification methods
+# - multiple roles per alarm
+# - multiple recipients per role
+# - severity filtering per recipient
+#
+# Supported notification methods:
+# - emails by @ktsaou
+# - slack.com notifications by @ktsaou
+# - alerta.io notifications by @kattunga
+# - discordapp.com notifications by @lowfive
+# - pushover.net notifications by @ktsaou
+# - pushbullet.com push notifications by Tiago Peralta @tperalta82 #1070
+# - telegram.org notifications by @hashworks #1002
+# - twilio.com notifications by Levi Blaney @shadycuz #1211
+# - kafka notifications by @ktsaou #1342
+# - pagerduty.com notifications by Jim Cooley @jimcooley #1373
+# - messagebird.com notifications by @tech_no_logical #1453
+# - hipchat notifications by @ktsaou #1561
+# - fleep notifications by @Ferroin
+# - custom notifications by @ktsaou
+# - syslog messages by @Ferroin
+# - Microsoft Team
notification by @tioumen
+
+# -----------------------------------------------------------------------------
+# testing notifications
+
+
+# "<script> test [ROLE]" or "<script> ROLE test": re-invoke this script three
+# times (WARNING, CRITICAL, CLEAR) with a synthetic alarm for the given role.
+if [ \( "${1}" = "test" -o "${2}" = "test" \) -a "${#}" -le 2 ]
+then
+    # the role is whichever of the two arguments is not the word "test"
+    if [ "${2}" = "test" ]
+    then
+        recipient="${1}"
+    else
+        recipient="${2}"
+    fi
+
+    [ -z "${recipient}" ] && recipient="sysadmin"
+
+    id=1
+    last="CLEAR"
+    test_res=0
+    for x in "WARNING" "CRITICAL" "CLEAR"
+    do
+        echo >&2
+        echo >&2 "# SENDING TEST ${x} ALARM TO ROLE: ${recipient}"
+
+        "${0}" "${recipient}" "$(hostname)" 1 1 "${id}" "$(date +%s)" "test_alarm" "test.chart" "test.family" "${x}" "${last}" 100 90 "${0}" 1 $((0 + id)) "units" "this is a test alarm to verify notifications work" "new value" "old value"
+        if [ $? -ne 0 ]
+        then
+            echo >&2 "# FAILED"
+            test_res=1
+        else
+            echo >&2 "# OK"
+        fi
+
+        # the status just sent becomes the "previous status" of the next one
+        last="${x}"
+        id=$((id + 1))
+    done
+
+    # non-zero if any of the three test notifications failed
+    exit $test_res
+fi
+
+export PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin"
+export LC_ALL=C
+
+# -----------------------------------------------------------------------------
+
+PROGRAM_NAME="$(basename "${0}")"
+
+# print the current local time, as used in log lines
+logdate() {
+    date "+%Y-%m-%d %H:%M:%S"
+}
+
+# log STATUS MESSAGE...
+# write one timestamped line, tagged with the program name, to stderr
+log() {
+    local status="${1}"
+    shift
+
+    echo >&2 "$(logdate): ${PROGRAM_NAME}: ${status}: ${*}"
+
+}
+
+warning() {
+    log WARNING "${@}"
+}
+
+error() {
+    log ERROR "${@}"
+}
+
+info() {
+    log INFO "${@}"
+}
+
+# log the message and terminate the script with exit code 1
+fatal() {
+    log FATAL "${@}"
+    exit 1
+}
+
+# debug logging is enabled by exporting NETDATA_ALARM_NOTIFY_DEBUG=1
+debug=${NETDATA_ALARM_NOTIFY_DEBUG-0}
+debug() {
+    [ "${debug}" = "1" ] && log DEBUG "${@}"
+}
+
+# docurl CURL_ARGS...
+# run ${curl} ${curl_options} with the given arguments and print the HTTP
+# response code on stdout. Returns the exit status of curl, or 1 when
+# ${curl} is not set. In debug mode the command line and the response body
+# are also dumped to stderr.
+docurl() {
+    if [ -z "${curl}" ]
+    then
+        error "\${curl} is unset."
+        return 1
+    fi
+
+    if [ "${debug}" = "1" ]
+    then
+        echo >&2 "--- BEGIN curl command ---"
+        printf >&2 "%q " ${curl} "${@}"
+        echo >&2
+        echo >&2 "--- END curl command ---"
+
+        # declare first, assign afterwards: with 'local var=$(cmd)' the value
+        # of $? is the exit status of 'local' (always 0), which used to hide
+        # curl failures in debug mode
+        local out code ret
+        out=$(mktemp /tmp/netdata-health-alarm-notify-XXXXXXXX)
+        code=$(${curl} ${curl_options} --write-out %{http_code} --output "${out}" --silent --show-error "${@}")
+        ret=$?
+
+        echo >&2 "--- BEGIN received response ---"
+        cat >&2 "${out}"
+        echo >&2
+        echo >&2 "--- END received response ---"
+        echo >&2 "RECEIVED HTTP RESPONSE CODE: ${code}"
+        rm "${out}"
+        echo "${code}"
+        return ${ret}
+    fi
+
+    ${curl} ${curl_options} --write-out %{http_code} --output /dev/null --silent --show-error "${@}"
+    return $?
+}
+
+# -----------------------------------------------------------------------------
+# this is to be overwritten by the config file
+
+custom_sender() {
+    info "not sending custom notification for ${status} of '${host}.${chart}.${name}'"
+}
+
+
+# -----------------------------------------------------------------------------
+
+# check for BASH v4+ (required for associative arrays)
+[ $(( ${BASH_VERSINFO[0]} )) -lt 4 ] && \
+    fatal "BASH version 4 or later is required (this is ${BASH_VERSION})."
+
+# -----------------------------------------------------------------------------
+# defaults to allow running this script by hand
+
+[ -z "${NETDATA_USER_CONFIG_DIR}" ] && NETDATA_USER_CONFIG_DIR="@configdir_POST@"
+[ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@"
+[ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@"
+[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io"
+
+# -----------------------------------------------------------------------------
+# parse command line parameters
+
+roles="${1}" # the roles that should be notified for this event
+host="${2}" # the host generated this event
+unique_id="${3}" # the unique id of this event
+alarm_id="${4}" # the unique id of the alarm that generated this event
+event_id="${5}" # the incremental id of the event, for this alarm id
+when="${6}" # the timestamp this event occurred
+name="${7}" # the name of the alarm, as given in netdata health.d entries
+chart="${8}" # the name of the chart (type.id)
+family="${9}" # the family of the chart
+status="${10}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+old_status="${11}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+value="${12}" # the current value of the alarm
+old_value="${13}" # the previous value of the alarm
+src="${14}" # the line number and file the alarm has been configured
+duration="${15}" # the duration in seconds of the previous alarm state
+non_clear_duration="${16}" # the total duration in seconds this is/was non-clear
+units="${17}" # the units of the value
+info="${18}" # a short description of the alarm
+value_string="${19}" # friendly value (with units)
+old_value_string="${20}" # friendly old value (with units)
+
+# -----------------------------------------------------------------------------
+# find a suitable hostname to use, if netdata did not supply a hostname
+
+this_host=$(hostname -s 2>/dev/null)
+[ -z "${host}" ] && host="${this_host}"
+
+# -----------------------------------------------------------------------------
+# screen statuses we don't need to send a notification
+
+# don't do anything if this is not WARNING, CRITICAL or CLEAR
+if [ "${status}" != "WARNING" -a "${status}" != "CRITICAL" -a "${status}" != "CLEAR" ]
+then
+    info "not sending notification for ${status} of '${host}.${chart}.${name}'"
+    exit 1
+fi
+
+# don't do anything if this is CLEAR, but it was not WARNING or CRITICAL
+if [ "${old_status}" != "WARNING" -a "${old_status}" != "CRITICAL" -a "${status}" = "CLEAR" ]
+then
+    info "not sending notification for ${status} of '${host}.${chart}.${name}' (last status was ${old_status})"
+    exit 1
+fi
+
+# -----------------------------------------------------------------------------
+# load configuration
+
+# By default fetch images from the global public registry.
+# This is required by default, since all notification methods need to download
+# images via the Internet, and private registries might not be reachable.
+# This can be overwritten at the configuration file.
+images_base_url="https://registry.my-netdata.io"
+
+# curl options to use
+curl_options=""
+
+# needed commands
+# if empty they will be searched in the system path
+curl=
+sendmail=
+
+# enable / disable features
+# NOTE(review): rocketchat configs are declared below, but no SEND_ROCKETCHAT
+# default is set here - confirm it is initialised elsewhere.
+SEND_SLACK="YES"
+SEND_MSTEAM="YES"
+SEND_ALERTA="YES"
+SEND_FLOCK="YES"
+SEND_DISCORD="YES"
+SEND_PUSHOVER="YES"
+SEND_TWILIO="YES"
+SEND_HIPCHAT="YES"
+SEND_MESSAGEBIRD="YES"
+SEND_KAVENEGAR="YES"
+SEND_TELEGRAM="YES"
+SEND_EMAIL="YES"
+SEND_PUSHBULLET="YES"
+SEND_KAFKA="YES"
+SEND_PD="YES"
+SEND_FLEEP="YES"
+SEND_IRC="YES"
+SEND_AWSSNS="YES"
+SEND_SYSLOG="NO"
+SEND_CUSTOM="YES"
+
+# slack configs
+SLACK_WEBHOOK_URL=
+DEFAULT_RECIPIENT_SLACK=
+declare -A role_recipients_slack=()
+
+# Microsoft Team configs
+MSTEAM_WEBHOOK_URL=
+DEFAULT_RECIPIENT_MSTEAM=
+declare -A role_recipients_msteam=()
+
+# rocketchat configs
+ROCKETCHAT_WEBHOOK_URL=
+DEFAULT_RECIPIENT_ROCKETCHAT=
+declare -A role_recipients_rocketchat=()
+
+# alerta configs
+ALERTA_WEBHOOK_URL=
+ALERTA_API_KEY=
+DEFAULT_RECIPIENT_ALERTA=
+declare -A role_recipients_alerta=()
+
+# flock configs
+FLOCK_WEBHOOK_URL=
+DEFAULT_RECIPIENT_FLOCK=
+declare -A role_recipients_flock=()
+
+# discord configs
+DISCORD_WEBHOOK_URL=
+DEFAULT_RECIPIENT_DISCORD=
+declare -A role_recipients_discord=()
+
+# pushover configs
+PUSHOVER_APP_TOKEN=
+DEFAULT_RECIPIENT_PUSHOVER=
+declare -A role_recipients_pushover=()
+
+# pushbullet configs
+PUSHBULLET_ACCESS_TOKEN=
+PUSHBULLET_SOURCE_DEVICE=
+DEFAULT_RECIPIENT_PUSHBULLET=
+declare -A role_recipients_pushbullet=()
+
+# twilio configs
+TWILIO_ACCOUNT_SID=
+TWILIO_ACCOUNT_TOKEN=
+TWILIO_NUMBER=
+DEFAULT_RECIPIENT_TWILIO=
+declare -A role_recipients_twilio=()
+
+# hipchat configs
+HIPCHAT_SERVER=
+HIPCHAT_AUTH_TOKEN=
+DEFAULT_RECIPIENT_HIPCHAT=
+declare -A role_recipients_hipchat=()
+
+# messagebird configs
+MESSAGEBIRD_ACCESS_KEY=
+MESSAGEBIRD_NUMBER=
+DEFAULT_RECIPIENT_MESSAGEBIRD=
+declare -A role_recipients_messagebird=()
+
+# kavenegar configs
+# (same shape as every other method: the default recipient is a plain string,
+# the per-role recipients are an associative array)
+KAVENEGAR_API_KEY=""
+KAVENEGAR_SENDER=""
+DEFAULT_RECIPIENT_KAVENEGAR=
+declare -A role_recipients_kavenegar=()
+
+# telegram configs
+TELEGRAM_BOT_TOKEN=
+DEFAULT_RECIPIENT_TELEGRAM=
+declare -A role_recipients_telegram=()
+
+# kafka configs
+KAFKA_URL=
+KAFKA_SENDER_IP=
+
+# pagerduty.com configs
+PD_SERVICE_KEY=
+DEFAULT_RECIPIENT_PD=
+declare -A role_recipients_pd=()
+
+# fleep.io configs
+FLEEP_SENDER="${host}"
+DEFAULT_RECIPIENT_FLEEP=
+declare -A role_recipients_fleep=()
+
+# Amazon SNS configs
+DEFAULT_RECIPIENT_AWSSNS=
+AWSSNS_MESSAGE_FORMAT=
+declare -A role_recipients_awssns=()
+
+# syslog configs
+SYSLOG_FACILITY=
+declare -A role_recipients_syslog=()
+
+# custom configs
+DEFAULT_RECIPIENT_CUSTOM=
+declare -A role_recipients_custom=()
+
+# email configs
+EMAIL_SENDER=
+DEFAULT_RECIPIENT_EMAIL="root"
+EMAIL_CHARSET=$(locale charmap 2>/dev/null)
+EMAIL_THREADING=
+declare -A role_recipients_email=()
+
+# irc configs
+IRC_NICKNAME=
+IRC_REALNAME=
+DEFAULT_RECIPIENT_IRC=
+IRC_NETWORK=
+declare -A role_recipients_irc=()
+
+# load the stock and user configuration files
+# these will overwrite the variables above
+
+# the user file is sourced last, so its settings win over the stock ones
+for CONFIG in "${NETDATA_STOCK_CONFIG_DIR}/health_alarm_notify.conf" "${NETDATA_USER_CONFIG_DIR}/health_alarm_notify.conf"
+do
+    if [ -f "${CONFIG}" ]
+    then
+        debug "Loading config file '${CONFIG}'..."
+        source "${CONFIG}"
+        [ $? -ne 0 ] && error "Failed to load config file '${CONFIG}'."
+    else
+        warning "Cannot find file '${CONFIG}'."
+    fi
+done
+
+# If we didn't autodetect the character set for e-mail and it wasn't
+# set by the user, we need to set it to a reasonable default. UTF-8
+# should be correct for almost all modern UNIX systems.
+# (quoted, so that a value containing whitespace cannot break the test)
+if [ -z "${EMAIL_CHARSET}" ]
+then
+    EMAIL_CHARSET="UTF-8"
+fi
+
+# -----------------------------------------------------------------------------
+# filter a recipient based on alarm event severity |