summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Costa Tsaousis (ktsaou) <costa@tsaousis.gr> 2017-02-11 15:36:45 +0200
committer: Costa Tsaousis (ktsaou) <costa@tsaousis.gr> 2017-02-11 15:36:45 +0200
commit: ce599880d6d8f74e95fb682a2d7a0d17439f15b8 (patch)
tree: 148c64c55c9ba42df98e5340ac07bfdf5f56eb2e
parent: 27d9930e82cf41b8e9b387c339dc90bc66b909c4 (diff)
make web_log alarms work above 30 reqs/minute; fix web_log plugin descriptions
-rw-r--r-- conf.d/health.d/web_log.conf 34
-rw-r--r-- python.d/web_log.chart.py 4
2 files changed, 25 insertions, 13 deletions
diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf
index 4300fe7557..fa88dc8cf5 100644
--- a/conf.d/health.d/web_log.conf
+++ b/conf.d/health.d/web_log.conf
@@ -16,6 +16,15 @@ families: *
# -----------------------------------------------------------------------------
# high level response code alarms
+template: 1m_requests
+ on: web_log.response_codes
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
template: 1m_2xx
on: web_log.response_codes
families: *
@@ -23,7 +32,11 @@ families: *
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
+ warn: ($1m_requests > 30) ? ($this > ($status >= $WARNING ) ? ( 1 ) : ( 2 )) : ( 0 )
+ crit: ($1m_requests > 30) ? ($this > ($status == $CRITICAL) ? ( 2 ) : ( 5 )) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
info: the sum of successful HTTP requests over the last minute
+ to: webmaster
template: 1m_redirects
on: web_log.response_codes
@@ -32,8 +45,8 @@ families: *
calc: $this * 100 / ( $1m_2xx + $this )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? ( 1 ) : ( 2 ))
- crit: $this > (($status == $CRITICAL) ? ( 2 ) : ( 5 ))
+ warn: ($1m_requests > 30) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($1m_requests > 30) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
info: the ratio of HTTP redirects (3xx) vs the successful requests, \
over the last minute
@@ -46,8 +59,8 @@ families: *
calc: $this * 100 / ( $1m_2xx + $this )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? ( 1 ) : ( 5 ))
- crit: $this > (($status == $CRITICAL) ? ( 5 ) : ( 10 ))
+ warn: ($1m_requests > 30) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 5 )) ) : ( 0 )
+ crit: ($1m_requests > 30) ? ($this > (($status == $CRITICAL) ? ( 5 ) : ( 10 )) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
info: the ratio of HTTP bad requests (4xx) vs the successful requests, \
over the last minute
@@ -60,8 +73,8 @@ families: *
calc: $this * 100 / ( $1m_2xx + $this )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? ( 1 ) : ( 2 ))
- crit: $this > (($status == $CRITICAL) ? ( 2 ) : ( 5 ))
+ warn: ($1m_requests > 30) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($1m_requests > 30) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
info: the ratio of HTTP internal server errors (5xx) vs the successful \
requests, over the last minute
@@ -78,7 +91,6 @@ families: *
every: 30s
info: the average time to respond to HTTP requests, over the last 10 minutes
-
template: web_slow
on: web_log.response_time
families: *
@@ -87,8 +99,8 @@ families: *
every: 10s
green: 500
red: 1000
- warn: $this > $green && $this > ($10m_response_time * 2)
- crit: $this > $red && $this > ($10m_response_time * 4)
+ warn: ($1m_requests > 30) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
+ crit: ($1m_requests > 30) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
info: the average time to respond to HTTP requests, over the last 1 minute
to: webmaster
@@ -118,8 +130,8 @@ families: *
calc: ($5m_2xx_last > 0)?($5m_2xx_now * 100 / $5m_2xx_last):(100)
units: %
every: 30s
- warn: ($5m_2xx_last > 30)?($this > 200 OR $this < 50):(0)
- crit: ($5m_2xx_last > 30)?($this > 400 OR $this < 25):(0)
+ warn: ($1m_requests > 30) ? (($5m_2xx_last > 30) ? ($this > 200 OR $this < 50) : (0) ) : ( 0 )
+ crit: ($1m_requests > 30) ? (($5m_2xx_last > 30) ? ($this > 400 OR $this < 25) : (0) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
options: no-clear-notification
info: the percentage of web requests over the last 5 minutes, \
diff --git a/python.d/web_log.chart.py b/python.d/web_log.chart.py
index 6cb594f6ae..391d446852 100644
--- a/python.d/web_log.chart.py
+++ b/python.d/web_log.chart.py
@@ -208,10 +208,10 @@ class Service(LogService):
job_name = find_job_name(self.override_name, self.name)
self.detailed_chart = 'CHART %s.detailed_response_codes ""' \
- ' "Response Codes" requests/s responses' \
+ ' "Detailed Response Codes" requests/s responses' \
' web_log.detailed_response_codes stacked 1 %s\n' % (job_name, self.update_every)
self.http_method_chart = 'CHART %s.http_method' \
- ' "" "Requests Per HTTP Method" requests/s requests' \
+ ' "" "Requests Per HTTP Method" requests/s "http methods"' \
' web_log.http_method stacked 2 %s\n' % (job_name, self.update_every)
if regex_name == 'access_apache_ext':