summaryrefslogtreecommitdiffstats
path: root/health/notifications
diff options
context:
space:
mode:
authorthiagoftsm <thiagoftsm@gmail.com>2020-09-21 13:19:05 +0000
committerGitHub <noreply@github.com>2020-09-21 13:19:05 +0000
commit5156fb013fc024d7f62611bed50c4c154033f01a (patch)
treea237c3d4cbc479c629bd727b1c5daeef1282326e /health/notifications
parentbc66c462f33880349504324e1dbc315769abedb8 (diff)
Stackpulse integration (#9965)
Add integration with Stackpulse.
Diffstat (limited to 'health/notifications')
-rw-r--r--health/notifications/Makefile.am1
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in68
-rwxr-xr-xhealth/notifications/health_alarm_notify.conf21
-rw-r--r--health/notifications/stackpulse/Makefile.inc12
-rw-r--r--health/notifications/stackpulse/README.md81
5 files changed, 180 insertions, 3 deletions
diff --git a/health/notifications/Makefile.am b/health/notifications/Makefile.am
index a0f0d82d6a..4af9f7782e 100644
--- a/health/notifications/Makefile.am
+++ b/health/notifications/Makefile.am
@@ -41,6 +41,7 @@ include pushover/Makefile.inc
include rocketchat/Makefile.inc
include slack/Makefile.inc
include smstools3/Makefile.inc
+include stackpulse/Makefile.inc
include syslog/Makefile.inc
include telegram/Makefile.inc
include twilio/Makefile.inc
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 2d7f9e5aab..78641f07d0 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -36,6 +36,7 @@
# - RocketChat notifications by @Hermsi1337 #3777
# - Google Hangouts Chat notifications by @EnzoAkira and @hendrikhofstadt
# - Dynatrace Event by @illumine
+# - Stackpulse Event by @thiagoftsm
# -----------------------------------------------------------------------------
# testing notifications
@@ -382,6 +383,8 @@ DYNATRACE_ANNOTATION_TYPE=
DYNATRACE_EVENT=
SEND_DYNATRACE=
+# stackpulse configs
+STACKPULSE_WEBHOOK=
# load the stock and user configuration files
# these will overwrite the variables above
@@ -532,6 +535,9 @@ filter_recipient_by_criticality() {
# check matrix
{ [ -z "${MATRIX_HOMESERVER}" ] || [ -z "${MATRIX_ACCESSTOKEN}" ]; } && SEND_MATRIX="NO"
+# check stackpulse
+[ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO"
+
if [ "${SEND_PUSHOVER}" = "YES" ] ||
[ "${SEND_SLACK}" = "YES" ] ||
[ "${SEND_ROCKETCHAT}" = "YES" ] ||
@@ -552,7 +558,8 @@ if [ "${SEND_PUSHOVER}" = "YES" ] ||
[ "${SEND_MATRIX}" = "YES" ] ||
[ "${SEND_CUSTOM}" = "YES" ] ||
[ "${SEND_MSTEAM}" = "YES" ] ||
- [ "${SEND_DYNATRACE}" = "YES" ]; then
+ [ "${SEND_DYNATRACE}" = "YES" ] ||
+ [ "${SEND_STACKPULSE}" = "YES" ]; then
# if we need curl, check for the curl command
if [ -z "${curl}" ]; then
curl="$(command -v curl 2>/dev/null)"
@@ -580,6 +587,7 @@ if [ "${SEND_PUSHOVER}" = "YES" ] ||
SEND_MATRIX="NO"
SEND_CUSTOM="NO"
SEND_DYNATRACE="NO"
+ SEND_STACKPULSE="NO"
fi
fi
@@ -708,7 +716,8 @@ for method in "${SEND_EMAIL}" \
"${SEND_SYSLOG}" \
"${SEND_SMS}" \
"${SEND_MSTEAM}" \
- "${SEND_DYNATRACE}"; do
+ "${SEND_DYNATRACE}" \
+ "${SEND_STACKPULSE}" ; do
if [ "${method}" == "YES" ]; then
proceed=1
@@ -1968,8 +1977,10 @@ EOF
return 1
}
+
# -----------------------------------------------------------------------------
# Dynatrace sender
+
send_dynatrace() {
[ "${SEND_DYNATRACE}" != "YES" ] && return 1
@@ -2016,6 +2027,51 @@ EOF
fi
}
+
+# -----------------------------------------------------------------------------
+# Stackpulse sender
+
+send_stackpulse() {
+ local payload httpcode oldv currv
+ [ "${SEND_STACKPULSE}" != "YES" ] && return 1
+
+ # We are sending null when values are nan to avoid errors while JSON message is parsed
+ [ "${old_value}" != "nan" ] && oldv="${old_value}" || oldv="null"
+ [ "${value}" != "nan" ] && currv="${value}" || currv="null"
+
+ payload=$(cat <<EOF
+ {
+ "Node" : "${host}",
+ "Chart" : "${chart}",
+ "OldValue" : ${oldv},
+ "Value" : ${currv},
+ "Units" : "${units}",
+ "OldStatus" : "${old_status}",
+ "Status" : "${status}",
+ "Alarm" : "${name}",
+ "Date": ${when},
+ "Duration": ${duration},
+ "NonClearDuration": ${non_clear_duration},
+ "Description" : "${status_message}, ${info}",
+ "CalcExpression" : "${calc_expression}",
+ "CalcParamValues" : "${calc_param_values}",
+ "TotalWarnings" : "${total_warnings}",
+ "TotalCritical" : "${total_critical}",
+ "ID" : ${alarm_id}
+ }
+EOF
+)
+
+ httpcode=$(docurl -X POST -H "Content-Type: application/json" -d "${payload}" ${STACKPULSE_WEBHOOK})
+ if [ "${httpcode}" = "200" ]; then
+ info "sent stackpulse notification for: ${host} ${chart}.${name} is ${status}"
+ else
+ error "failed to send stackpulse notification for: ${host} ${chart}.${name} is ${status}, with HTTP response status code ${httpcode}."
+ return 1
+ fi
+
+ return 0
+}
# -----------------------------------------------------------------------------
# prepare the content of the notification
@@ -2535,6 +2591,11 @@ SENT_DYNATRACE=$?
# -----------------------------------------------------------------------------
+# send the EVENT to Stackpulse
+send_stackpulse
+SENT_STACKPULSE=$?
+
+# -----------------------------------------------------------------------------
# let netdata know
for state in "${SENT_EMAIL}" \
"${SENT_PUSHOVER}" \
@@ -2561,7 +2622,8 @@ for state in "${SENT_EMAIL}" \
"${SENT_SYSLOG}" \
"${SENT_SMS}" \
"${SENT_MSTEAM}" \
- "${SENT_DYNATRACE}"; do
+ "${SENT_DYNATRACE}" \
+ "${SENT_STACKPULSE}" ; do
if [ "${state}" -eq 0 ]; then
# we sent something
exit 0
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index ac9e8ee543..5615683452 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -270,6 +270,15 @@ DYNATRACE_EVENT="CUSTOM_INFO"
DEFAULT_RECIPIENT_DYNATRACE=""
#------------------------------------------------------------------------------
+# Stackpulse global notification options
+SEND_STACKPULSE="YES"
+
+# Webhook
+STACKPULSE_WEBHOOK=""
+
+DEFAULT_RECIPIENT_STACKPULSE=""
+
+#------------------------------------------------------------------------------
# hangouts (google hangouts chat) global notification options
# enable/disable sending hangouts notifications
@@ -937,6 +946,8 @@ role_recipients_dynatrace[sysadmin]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[sysadmin]="${DEFAULT_RECIPIENT_MATRIX}"
+role_recipients_stackpulse[sysadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
# -----------------------------------------------------------------------------
# DNS related alarms
@@ -990,6 +1001,8 @@ role_recipients_dynatrace[domainadmin]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[domainadmin]="${DEFAULT_RECIPIENT_MATRIX}"
+role_recipients_stackpulse[domainadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
# -----------------------------------------------------------------------------
# database servers alarms
# mysql, redis, memcached, postgres, etc
@@ -1044,6 +1057,8 @@ role_recipients_dynatrace[dba]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[dba]="${DEFAULT_RECIPIENT_MATRIX}"
+role_recipients_stackpulse[dba]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
# -----------------------------------------------------------------------------
# web servers alarms
# apache, nginx, lighttpd, etc
@@ -1098,6 +1113,8 @@ role_recipients_dynatrace[webmaster]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[webmaster]="${DEFAULT_RECIPIENT_MATRIX}"
+role_recipients_stackpulse[webmaster]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
# -----------------------------------------------------------------------------
# proxy servers alarms
# squid, etc
@@ -1152,6 +1169,8 @@ role_recipients_dynatrace[proxyadmin]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[proxyadmin]="${DEFAULT_RECIPIENT_MATRIX}"
+role_recipients_stackpulse[proxyadmin]="${DEFAULT_RECIPIENT_STACKPULSE}"
+
# -----------------------------------------------------------------------------
# peripheral devices
# UPS, photovoltaics, etc
@@ -1203,3 +1222,5 @@ role_recipients_sms[sitemgr]="${DEFAULT_RECIPIENT_SMS}"
role_recipients_dynatrace[sitemgr]="${DEFAULT_RECIPIENT_DYNATRACE}"
role_recipients_matrix[sitemgr]="${DEFAULT_RECIPIENT_MATRIX}"
+
+role_recipients_stackpulse[sitemgr]="${DEFAULT_RECIPIENT_STACKPULSE}"
diff --git a/health/notifications/stackpulse/Makefile.inc b/health/notifications/stackpulse/Makefile.inc
new file mode 100644
index 0000000000..eabcb4bcf0
--- /dev/null
+++ b/health/notifications/stackpulse/Makefile.inc
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_noinst_DATA += \
+ stackpulse/README.md \
+ stackpulse/Makefile.inc \
+ $(NULL)
+
diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md
new file mode 100644
index 0000000000..4f9300ca6d
--- /dev/null
+++ b/health/notifications/stackpulse/README.md
@@ -0,0 +1,81 @@
+<!--
+title: "Send notifications to StackPulse"
+description: "Send alerts to your StackPulse Netdata integration any time an anomaly or performance issue strikes a node in your infrastructure."
+sidebar_label: "StackPulse"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/stackpulse/README.md
+-->
+
+# Send notifications to StackPulse
+
+[StackPulse](https://stackpulse.com/) is a software-as-a-service platform for site reliability engineering. It helps
+SREs, DevOps engineers, and software developers reduce toil and alert fatigue while improving the reliability of
+software services by managing, analyzing, and automating incident response activities.
+
+Sending Netdata alarm notifications to StackPulse allows you to create smart automated response workflows
+(StackPulse playbooks) that will help you drive down your MTTD and MTTR by performing any of the following:
+
+- Enriching the incident with data from multiple sources
+- Performing triage actions and analyzing their results
+- Orchestrating incident management and notification flows
+- Performing automatic and semi-automatic remediation actions
+- Analyzing incident data and remediation patterns to improve reliability of your services
+
+To send notifications to StackPulse, follow these steps:
+
+1. Create a Netdata integration in the `StackPulse Administration Portal`, and copy the `Endpoint` URL.
+
+![Creating a Netdata integration in StackPulse](https://user-images.githubusercontent.com/49162938/93023348-d9455a80-f5dd-11ea-8e05-67d07dce93e4.png)
+
+2. On your node, navigate to `/etc/netdata/` and run the following command:
+
+```sh
+$ ./edit-config health_alarm_notify.conf
+```
+
+3. Set the `STACKPULSE_WEBHOOK` variable to the `Endpoint` URL you copied earlier:
+
+```
+SEND_STACKPULSE="YES"
+STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID"
+```
+
+4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you
+   can see the associated notification on your StackPulse Administration Portal.
+
+## React to alarms with playbooks
+
+StackPulse allows users to create `Playbooks` giving additional information about events that happen in specific
+scenarios. For example, you could create a Playbook that responds to a "low disk space" alarm by compressing and
+cleaning up storage partitions with dynamic data.
+
+![image](https://user-images.githubusercontent.com/49162938/93207961-4c201400-f74b-11ea-94d1-42a29d007b62.png)
+
+![The StackPulse Administration Portal with a Netdata
+alarm](https://user-images.githubusercontent.com/49162938/93208199-bfc22100-f74b-11ea-83c4-728be23dcf4d.png)
+### Create Playbooks for Netdata alarms
+
+To create a Playbook, you need to access the StackPulse Administration Portal. After the initial setup, you need to
+access the **TRIGGER** tab to define the scenarios used to trigger the event. The following variables are available:
+
+- `Node`: The host that generated the event.
+- `Chart`: The name of the chart.
+- `OldValue` : The previous value of the alarm.
+- `Value`: The current value of the alarm.
+- `Units` : The units of the value.
+- `OldStatus` : The previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL.
+- `Status`: The current alarm status; it accepts the same values as `OldStatus`.
+- `Alarm` : The name of the alarm, as given in Netdata's health.d entries.
+- `Date` : The timestamp this event occurred.
+- `Duration` : The duration in seconds of the previous alarm state.
+- `NonClearDuration` : The total duration in seconds this is/was non-clear.
+- `Description` : A short description of the alarm copied from the alarm definition.
+- `CalcExpression` : The expression that was evaluated to trigger the alarm.
+- `CalcParamValues` : The values of the parameters in the expression, at the time of the evaluation.
+- `TotalWarnings` : Total number of alarms in WARNING state.
+- `TotalCritical` : Total number of alarms in CRITICAL state.
+- `ID` : The unique id of the alarm that generated this event.
+
+For more details on how to create a scenario, take a look at the [StackPulse documentation](https://docs.stackpulse.io).
+
+[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fopsgenie%2FREADME%2FDonations-netdata-has-received&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>)