diff options
author | thiagoftsm <thiagoftsm@gmail.com> | 2020-09-21 13:19:05 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-21 13:19:05 +0000 |
commit | 5156fb013fc024d7f62611bed50c4c154033f01a (patch) | |
tree | a237c3d4cbc479c629bd727b1c5daeef1282326e /health/notifications | |
parent | bc66c462f33880349504324e1dbc315769abedb8 (diff) |
Stackpulse integration (#9965)
Add integration with Stackpulse.
Diffstat (limited to 'health/notifications')
-rw-r--r-- | health/notifications/Makefile.am | 1 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 68 | ||||
-rwxr-xr-x | health/notifications/health_alarm_notify.conf | 21 | ||||
-rw-r--r-- | health/notifications/stackpulse/Makefile.inc | 12 | ||||
-rw-r--r-- | health/notifications/stackpulse/README.md | 81 |
5 files changed, 180 insertions, 3 deletions
diff --git a/health/notifications/Makefile.am b/health/notifications/Makefile.am index a0f0d82d6a..4af9f7782e 100644 --- a/health/notifications/Makefile.am +++ b/health/notifications/Makefile.am @@ -41,6 +41,7 @@ include pushover/Makefile.inc include rocketchat/Makefile.inc include slack/Makefile.inc include smstools3/Makefile.inc +include stackpulse/Makefile.inc include syslog/Makefile.inc include telegram/Makefile.inc include twilio/Makefile.inc diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 2d7f9e5aab..78641f07d0 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -36,6 +36,7 @@ # - RocketChat notifications by @Hermsi1337 #3777 # - Google Hangouts Chat notifications by @EnzoAkira and @hendrikhofstadt # - Dynatrace Event by @illumine +# - Stackpulse Event by @thiagoftsm # ----------------------------------------------------------------------------- # testing notifications @@ -382,6 +383,8 @@ DYNATRACE_ANNOTATION_TYPE= DYNATRACE_EVENT= SEND_DYNATRACE= +# stackpulse configs +STACKPULSE_WEBHOOK= # load the stock and user configuration files # these will overwrite the variables above @@ -532,6 +535,9 @@ filter_recipient_by_criticality() { # check matrix { [ -z "${MATRIX_HOMESERVER}" ] || [ -z "${MATRIX_ACCESSTOKEN}" ]; } && SEND_MATRIX="NO" +# check stackpulse +[ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO" + if [ "${SEND_PUSHOVER}" = "YES" ] || [ "${SEND_SLACK}" = "YES" ] || [ "${SEND_ROCKETCHAT}" = "YES" ] || @@ -552,7 +558,8 @@ if [ "${SEND_PUSHOVER}" = "YES" ] || [ "${SEND_MATRIX}" = "YES" ] || [ "${SEND_CUSTOM}" = "YES" ] || [ "${SEND_MSTEAM}" = "YES" ] || - [ "${SEND_DYNATRACE}" = "YES" ]; then + [ "${SEND_DYNATRACE}" = "YES" ] || + [ "${SEND_STACKPULSE}" = "YES" ]; then # if we need curl, check for the curl command if [ -z "${curl}" ]; then curl="$(command -v curl 2>/dev/null)" @@ -580,6 +587,7 @@ if [ "${SEND_PUSHOVER}" = "YES" ] || SEND_MATRIX="NO" 
SEND_CUSTOM="NO" SEND_DYNATRACE="NO" + SEND_STACKPULSE="NO" fi fi @@ -708,7 +716,8 @@ for method in "${SEND_EMAIL}" \ "${SEND_SYSLOG}" \ "${SEND_SMS}" \ "${SEND_MSTEAM}" \ - "${SEND_DYNATRACE}"; do + "${SEND_DYNATRACE}" \ + "${SEND_STACKPULSE}" ; do if [ "${method}" == "YES" ]; then proceed=1 @@ -1968,8 +1977,10 @@ EOF return 1 } + # ----------------------------------------------------------------------------- # Dynatrace sender + send_dynatrace() { [ "${SEND_DYNATRACE}" != "YES" ] && return 1 @@ -2016,6 +2027,51 @@ EOF fi } + +# ----------------------------------------------------------------------------- +# Stackpulse sender + +send_stackpulse() { + local payload httpcode oldv currv + [ "${SEND_STACKPULSE}" != "YES" ] && return 1 + + # We are sending null when values are nan to avoid errors while JSON message is parsed + [ "${old_value}" != "nan" ] && oldv="${old_value}" || oldv="null" + [ "${value}" != "nan" ] && currv="${value}" || currv="null" + + payload=$(cat <<EOF + { + "Node" : "${host}", + "Chart" : "${chart}", + "OldValue" : ${oldv}, + "Value" : ${currv}, + "Units" : "${units}", + "OldStatus" : "${old_status}", + "Status" : "${status}", + "Alarm" : "${name}", + "Date": ${when}, + "Duration": ${duration}, + "NonClearDuration": ${non_clear_duration}, + "Description" : "${status_message}, ${info}", + "CalcExpression" : "${calc_expression}", + "CalcParamValues" : "${calc_param_values}", + "TotalWarnings" : "${total_warnings}", + "TotalCritical" : "${total_critical}", + "ID" : ${alarm_id} + } +EOF +) + + httpcode=$(docurl -X POST -H "Content-Type: application/json" -d "${payload}" ${STACKPULSE_WEBHOOK}) + if [ "${httpcode}" = "200" ]; then + info "sent stackpulse notification for: ${host} ${chart}.${name} is ${status}" + else + error "failed to send stackpulse notification for: ${host} ${chart}.${name} is ${status}, with HTTP response status code ${httpcode}." 
+ return 1 + fi + + return 0 +} # ----------------------------------------------------------------------------- # prepare the content of the notification @@ -2535,6 +2591,11 @@ SENT_DYNATRACE=$? # ----------------------------------------------------------------------------- +# send the EVENT to Stackpulse +send_stackpulse +SENT_STACKPULSE=$? + +# ----------------------------------------------------------------------------- # let netdata know for state in "${SENT_EMAIL}" \ "${SENT_PUSHOVER}" \ @@ -2561,7 +2622,8 @@ for state in "${SENT_EMAIL}" \ "${SENT_SYSLOG}" \ "${SENT_SMS}" \ "${SENT_MSTEAM}" \ - "${SENT_DYNATRACE}"; do + "${SENT_DYNATRACE}" \ + "${SENT_STACKPULSE}" ; do if [ "${state}" -eq 0 ]; then # we sent something exit 0 diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index ac9e8ee543..5615683452 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -270,6 +270,15 @@ DYNATRACE_EVENT="CUSTOM_INFO" DEFAULT_RECIPIENT_DYNATRACE="" #------------------------------------------------------------------------------ +# Stackpulse global notification options +SEND_STACKPULSE="YES" + +# Webhook +STACKPULSE_WEBHOOK="" + +DEFAULT_RECIPIENT_STACKPULSE="" + +#------------------------------------------------------------------------------ # hangouts (google hangouts chat) global notification options # enable/disable sending hangouts notifications @@ -937,6 +946,8 @@ role_recipients_dynatrace[sysadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[sysadmin]="${DEFAULT_RECIPIENT_MATRIX}" +role_recipients_stackpulse[sysadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # DNS related alarms @@ -990,6 +1001,8 @@ role_recipients_dynatrace[domainadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[domainadmin]="${DEFAULT_RECIPIENT_MATRIX}" 
+role_recipients_stackpulse[domainadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # database servers alarms # mysql, redis, memcached, postgres, etc @@ -1044,6 +1057,8 @@ role_recipients_dynatrace[dba]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[dba]="${DEFAULT_RECIPIENT_MATRIX}" +role_recipients_stackpulse[dba]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # web servers alarms # apache, nginx, lighttpd, etc @@ -1098,6 +1113,8 @@ role_recipients_dynatrace[webmaster]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[webmaster]="${DEFAULT_RECIPIENT_MATRIX}" +role_recipients_stackpulse[webmaster]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # proxy servers alarms # squid, etc @@ -1152,6 +1169,8 @@ role_recipients_dynatrace[proxyadmin]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[proxyadmin]="${DEFAULT_RECIPIENT_MATRIX}" +role_recipients_stackpulse[proxyadmin]="${DEFAULT_RECIPIENT_STACKPULSE}" + # ----------------------------------------------------------------------------- # peripheral devices # UPS, photovoltaics, etc @@ -1203,3 +1222,5 @@ role_recipients_sms[sitemgr]="${DEFAULT_RECIPIENT_SMS}" role_recipients_dynatrace[sitemgr]="${DEFAULT_RECIPIENT_DYNATRACE}" role_recipients_matrix[sitemgr]="${DEFAULT_RECIPIENT_MATRIX}" + +role_recipients_stackpulse[sitemgr]="${DEFAULT_RECIPIENT_STACKPULSE}" diff --git a/health/notifications/stackpulse/Makefile.inc b/health/notifications/stackpulse/Makefile.inc new file mode 100644 index 0000000000..eabcb4bcf0 --- /dev/null +++ b/health/notifications/stackpulse/Makefile.inc @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install 
these files +dist_noinst_DATA += \ + stackpulse/README.md \ + stackpulse/Makefile.inc \ + $(NULL) + diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md new file mode 100644 index 0000000000..4f9300ca6d --- /dev/null +++ b/health/notifications/stackpulse/README.md @@ -0,0 +1,81 @@ +<!-- +title: "Send notifications to StackPulse" +description: "Send alerts to your StackPulse Netdata integration any time an anomaly or performance issue strikes a node in your infrastructure." +sidebar_label: "StackPulse" +custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/stackpulse/README.md +--> + +# Send notifications to StackPulse + +[StackPulse](https://stackpulse.com/) is a software-as-a-service platform for site reliability engineering. +StackPulse is a Software-as-a-Service platform for Site Reliability Engineering. It helps SREs, DevOps Engineers and +Software Developers reduce toil and alert fatigue while improving reliability of software services by managing, +analyzing and automating incident response activities. + +Sending Netdata alarm notifications to StackPulse allows you to create smart automated response workflows +(StackPulse playbooks) that will help you drive down your MTTD and MTTR by performing any of the following: + +- Enriching the incident with data from multiple sources +- Performing triage actions and analyzing their results +- Orchestrating incident management and notification flows +- Performing automatic and semi-automatic remediation actions +- Analyzing incident data and remediation patterns to improve reliability of your services + +To send the notification, you need to: + +1. Create a Netdata integration in the `StackPulse Administration Portal`, and copy the `Endpoint` URL. + +![Creating a Netdata integration in StackPulse](https://user-images.githubusercontent.com/49162938/93023348-d9455a80-f5dd-11ea-8e05-67d07dce93e4.png) + +2. 
On your node, navigate to `/etc/netdata/` and run the following command: + +```sh +$ ./edit-config health_alarm_notify.conf +``` + +3. Set the `STACKPULSE_WEBHOOK` variable to the `Endpoint` URL you copied earlier: + +``` +SEND_STACKPULSE="YES" +STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID" +``` + +4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you + can see the associated notification on your StackPulse Administration Portal. + +## React to alarms with playbooks + +StackPulse allows users to create `Playbooks` giving additional information about events that happen in specific +scenarios. For example, you could create a Playbook that responds to a "low disk space" alarm by compressing and +cleaning up storage partitions with dynamic data. + +![image](https://user-images.githubusercontent.com/49162938/93207961-4c201400-f74b-11ea-94d1-42a29d007b62.png) + +![The StackPulse Administration Portal with a Netdata +alarm](https://user-images.githubusercontent.com/49162938/93208199-bfc22100-f74b-11ea-83c4-728be23dcf4d.png) +### Create Playbooks for Netdata alarms + +To create a Playbook, you need to access the StackPulse Administration Portal. After the initial setup, you need to +access the **TRIGGER** tab to define the scenarios used to trigger the event. The following variables are available: + +- `Hostname`: The host that generated the event (sent as `Node` in the webhook payload). +- `Chart`: The name of the chart. +- `OldValue` : The previous value of the alarm. +- `Value`: The current value of the alarm. +- `Units` : The units of the value. +- `OldStatus` : The previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL. +- `State`: The current alarm status (sent as `Status` in the webhook payload); the acceptable values are the same as for `OldStatus`. +- `Alarm` : The name of the alarm, as given in Netdata's health.d entries. +- `Date` : The timestamp this event occurred. +- `Duration` : The duration in seconds of the previous alarm state. 
+- `NonClearDuration` : The total duration in seconds that the alarm is/was non-clear. +- `Description` : A short description of the alarm copied from the alarm definition. +- `CalcExpression` : The expression that was evaluated to trigger the alarm. +- `CalcParamValues` : The values of the parameters in the expression, at the time of the evaluation. +- `TotalWarnings` : Total number of alarms in WARNING state. +- `TotalCritical` : Total number of alarms in CRITICAL state. +- `ID` : The unique id of the alarm that generated this event. + +For more details on how to create a scenario, take a look at the [StackPulse documentation](https://docs.stackpulse.io). + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fopsgenie%2FREADME%2FDonations-netdata-has-received&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) |