summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCosta Tsaousis (ktsaou) <costa@tsaousis.gr>2016-06-09 22:49:44 +0300
committerCosta Tsaousis (ktsaou) <costa@tsaousis.gr>2016-06-09 22:49:44 +0300
commitbb8cdaf9dd25372bbbd09d916e9afe7f907ec1d6 (patch)
treec796ac14b633b3fed34cfa43ff0a44ad19558d2a
parent61360d367bf33f2590740a50863f739c819a7d81 (diff)
properly handle the exit status of plugins to avoid infinite restart attempts; plugins that report failure are now restarted up to 10 times if they have collected values in the past; apache fixed to work with latest apache; example plugin is now disabled by default to prevent starting bash in systems that dont have any other bash plugin used
-rwxr-xr-xcharts.d/apache.chart.sh23
-rwxr-xr-xcharts.d/example.chart.sh7
-rwxr-xr-xplugins.d/charts.d.plugin25
-rw-r--r--src/plugins_d.c68
-rw-r--r--src/plugins_d.h12
-rw-r--r--web/demo2.html12
6 files changed, 103 insertions, 44 deletions
diff --git a/charts.d/apache.chart.sh b/charts.d/apache.chart.sh
index f4f7d0581b..a3e064094c 100755
--- a/charts.d/apache.chart.sh
+++ b/charts.d/apache.chart.sh
@@ -67,19 +67,14 @@ apache_detect() {
# we will not check of the Conns*
# keys, since these are apache 2.4 specific
- if [ -z "${apache_key_accesses}" \
- -o -z "${apache_key_kbytes}" \
- -o -z "${apache_key_reqpersec}" \
- -o -z "${apache_key_bytespersec}" \
- -o -z "${apache_key_bytesperreq}" \
- -o -z "${apache_key_busyworkers}" \
- -o -z "${apache_key_idleworkers}" \
- -o -z "${apache_key_scoreboard}" \
- ]
- then
- echo >&2 "apache: Invalid response or missing keys from apache server: ${*}"
- return 1
- fi
+ [ -z "${apache_key_accesses}" ] && echo >&2 "apache: missing 'Total Accesses' from apache server: ${*}" && return 1
+ [ -z "${apache_key_kbytes}" ] && echo >&2 "apache: missing 'Total kBytes' from apache server: ${*}" && return 1
+ [ -z "${apache_key_reqpersec}" ] && echo >&2 "apache: missing 'ReqPerSec' from apache server: ${*}" && return 1
+ [ -z "${apache_key_bytespersec}" ] && echo >&2 "apache: missing 'BytesPerSec' from apache server: ${*}" && return 1
+ [ -z "${apache_key_bytesperreq}" ] && echo >&2 "apache: missing 'BytesPerReq' from apache server: ${*}" && return 1
+ [ -z "${apache_key_busyworkers}" ] && echo >&2 "apache: missing 'BusyWorkers' from apache server: ${*}" && return 1
+ [ -z "${apache_key_idleworkers}" ] && echo >&2 "apache: missing 'IdleWorkers' from apache server: ${*}" && return 1
+ [ -z "${apache_key_scoreboard}" ] && echo >&2 "apache: missing 'Scoreboard' from apache server: ${*}" && return 1
if [ ! -z "${apache_key_connstotal}" \
-a ! -z "${apache_key_connsasyncwriting}" \
@@ -88,6 +83,8 @@ apache_detect() {
]
then
apache_has_conns=1
+ else
+ apache_has_conns=0
fi
return 0
diff --git a/charts.d/example.chart.sh b/charts.d/example.chart.sh
index ad20504621..b20740e862 100755
--- a/charts.d/example.chart.sh
+++ b/charts.d/example.chart.sh
@@ -7,14 +7,21 @@
# between the calls of the _update() function
example_update_every=
+# the priority is used to sort the charts on the dashboard
+# 1 = the first chart
example_priority=150000
+# to enable this chart, you have to set this to 12345
+# (just a demonstration for something that needs to be checked)
+example_magic_number=
+
# _check is called once, to find out if this chart should be enabled or not
example_check() {
# this should return:
# - 0 to enable the chart
# - 1 to disable the chart
+ [ "${example_magic_number}" != "12345" ] && return 1
return 0
}
diff --git a/plugins.d/charts.d.plugin b/plugins.d/charts.d.plugin
index 2824fa3c6f..543695dd7d 100755
--- a/plugins.d/charts.d.plugin
+++ b/plugins.d/charts.d.plugin
@@ -504,7 +504,7 @@ if [ -z "$run_charts" ]
exit 1
fi
-declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=()
+declare -A charts_last_update=() charts_update_every=() charts_next_update=() charts_run_counter=() charts_serial_failures=()
global_update() {
local exit_at \
c=0 dt ret last_ms exec_start_ms exec_end_ms \
@@ -522,13 +522,14 @@ global_update() {
charts_last_update[$chart]=$((now_ms - (now_ms % (charts_update_every[$chart] * 1000) ) ))
charts_next_update[$chart]=$(( charts_last_update[$chart] + (charts_update_every[$chart] * 1000) ))
charts_run_counter[$chart]=0
+ charts_serial_failures[$chart]=0
echo "CHART netdata.plugin_chartsd_$chart '' 'Execution time for $chart plugin' 'milliseconds / run' charts.d netdata.plugin_charts area 145000 ${charts_update_every[$chart]}"
echo "DIMENSION run_time 'run time' absolute 1 1"
done
# the main loop
- while [ 1 ]
+ while [ "${#next_charts[@]}" -gt 0 ]
do
c=$((c + 1))
now_charts=("${next_charts[@]}")
@@ -570,15 +571,24 @@ global_update() {
current_time_ms; exec_end_ms=$now_ms
echo "BEGIN netdata.plugin_chartsd_$chart $dt"
+ echo "SET run_time = $(( exec_end_ms - exec_start_ms ))"
+ echo "END"
+
if [ $ret -eq 0 ]
then
- echo "SET run_time = $(( exec_end_ms - exec_start_ms ))"
+ charts_serial_failures[$chart]=0
next_charts+=($chart)
else
- echo "SET run_time = $(( (exec_end_ms - exec_start_ms) * -1 ))"
- echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Disabling it."
+ charts_serial_failures[$chart]=$(( charts_serial_failures[$chart] + 1 ))
+
+ if [ charts_serial_failures[$chart] -gt 10 ]
+ then
+ echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reported failure ${charts_serial_failures[$chart]} times. Disabling it."
+ else
+ echo >&2 "$PROGRAM_NAME: chart '$chart' update() function reports failure. Will keep trying for a while."
+ next_charts+=($chart)
+ fi
fi
- echo "END"
else
next_charts+=($chart)
fi
@@ -601,6 +611,9 @@ global_update() {
test ${now_ms} -ge ${exit_at} && exit 0
done
+
+ echo >&2 "$PROGRAM_NAME: Nothing left to do. Disabling charts.d.plugin."
+ echo "DISABLE"
}
global_update
diff --git a/src/plugins_d.c b/src/plugins_d.c
index 0ccbd36e45..d7815433e4 100644
--- a/src/plugins_d.c
+++ b/src/plugins_d.c
@@ -125,6 +125,8 @@ void *pluginsd_worker_thread(void *arg)
uint32_t STOPPING_WAKE_ME_UP_PLEASE_HASH = simple_hash("STOPPING_WAKE_ME_UP_PLEASE");
#endif
+ size_t count = 0;
+
while(likely(1)) {
if(unlikely(netdata_exit)) break;
@@ -137,7 +139,6 @@ void *pluginsd_worker_thread(void *arg)
info("PLUGINSD: '%s' running on pid %d", cd->fullfilename, cd->pid);
RRDSET *st = NULL;
- unsigned long long count = 0;
char *s;
uint32_t hash;
@@ -182,8 +183,6 @@ void *pluginsd_worker_thread(void *arg)
if(unlikely(st->debug)) debug(D_PLUGINSD, "PLUGINSD: '%s' is setting dimension %s/%s to %s", cd->fullfilename, st->id, dimension, value?value:"<nothing>");
if(value) rrddim_set(st, dimension, strtoll(value, NULL, 0));
-
- count++;
}
else if(likely(hash == BEGIN_HASH && !strcmp(s, "BEGIN"))) {
char *id = words[1];
@@ -223,6 +222,8 @@ void *pluginsd_worker_thread(void *arg)
rrdset_done(st);
st = NULL;
+
+ count++;
}
else if(likely(hash == FLUSH_HASH && !strcmp(s, "FLUSH"))) {
debug(D_PLUGINSD, "PLUGINSD: '%s' is requesting a FLUSH", cd->fullfilename);
@@ -386,17 +387,17 @@ void *pluginsd_worker_thread(void *arg)
break;
}
}
+ if(likely(count)) {
+ cd->successful_collections += count;
+ cd->serial_failures = 0;
+ }
+ else
+ cd->serial_failures++;
- info("PLUGINSD: '%s' on pid %d stopped.", cd->fullfilename, cd->pid);
+ info("PLUGINSD: '%s' on pid %d stopped after %zu successful data collections (ENDs).", cd->fullfilename, cd->pid, count);
- // fgets() failed or loop broke
+ // get the return code
int code = mypclose(fp, cd->pid);
- if(code == 1 || code == 127) {
- // 1 = DISABLE
- // 127 = cannot even run it
- error("PLUGINSD: '%s' (pid %d) exited with code %d. Disabling it.", cd->fullfilename, cd->pid, code);
- cd->enabled = 0;
- }
if(netdata_exit) {
cd->pid = 0;
@@ -406,14 +407,49 @@ void *pluginsd_worker_thread(void *arg)
return NULL;
}
- if(unlikely(!count && cd->enabled)) {
- error("PLUGINSD: '%s' (pid %d) does not generate useful output. Waiting a bit before starting it again.", cd->fullfilename, cd->pid);
- sleep((unsigned int) (cd->update_every * 10));
+ if(code != 0) {
+ // the plugin reports failure
+
+ if(likely(!cd->successful_collections)) {
+ // nothing collected - disable it
+ error("PLUGINSD: '%s' exited with error code %d. Disabling it.", cd->fullfilename, code);
+ cd->enabled = 0;
+ }
+ else {
+ // we have collected something
+
+ if(likely(cd->serial_failures <= 10)) {
+ error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). Waiting a bit before starting it again.", cd->fullfilename, code, cd->successful_collections);
+ sleep((unsigned int) (cd->update_every * 10));
+ }
+ else {
+ error("PLUGINSD: '%s' exited with error code %d, but has given useful output in the past (%zu times). We tried %d times to restart it, but it failed to generate data. Disabling it.", cd->fullfilename, code, cd->successful_collections, cd->serial_failures);
+ cd->enabled = 0;
+ }
+ }
}
+ else {
+ // the plugin reports success
+ if(unlikely(!cd->successful_collections)) {
+ // we have collected nothing so far
+
+ if(likely(cd->serial_failures <= 10)) {
+ error("PLUGINSD: '%s' (pid %d) does not generate useful output but it reports success (exits with 0). Waiting a bit before starting it again.", cd->fullfilename, cd->pid);
+ sleep((unsigned int) (cd->update_every * 10));
+ }
+ else {
+ error("PLUGINSD: '%s' (pid %d) does not generate useful output, although it reports success (exits with 0), but we have tried %d times to collect something. Disabling it.", cd->fullfilename, cd->pid, cd->serial_failures);
+ cd->enabled = 0;
+ }
+ }
+ else
+ sleep((unsigned int) cd->update_every);
+ }
cd->pid = 0;
- if(likely(cd->enabled)) sleep((unsigned int) cd->update_every);
- else break;
+
+ if(unlikely(!cd->enabled))
+ break;
}
cd->obsolete = 1;
diff --git a/src/plugins_d.h b/src/plugins_d.h
index 11e89e0ac3..1a10dc0dc5 100644
--- a/src/plugins_d.h
+++ b/src/plugins_d.h
@@ -20,9 +20,15 @@ struct plugind {
pid_t pid;
pthread_t thread;
- int update_every;
- int obsolete;
- int enabled;
+ size_t successful_collections; // the number of times we have seen
+ // values collected from this plugin
+
+ size_t serial_failures; // the number of times the plugin started
+ // without collecting values
+
+ int update_every; // the plugin default data collection frequency
+ int obsolete; // do not touch this structure after setting this to 1
+ int enabled; // if this is enabled or not
time_t started_t;
diff --git a/web/demo2.html b/web/demo2.html
index 8fe28d45ba..294d7bd7c5 100644
--- a/web/demo2.html
+++ b/web/demo2.html
@@ -20,7 +20,7 @@
<meta property="og:description" content="Stunning real-time dashboards, blazingly fast and extremely interactive. Zero configuration, zero dependencies, zero maintenance." />
<script>var netdataTheme = 'slate';</script>
- <script type="text/javascript" src="dashboard.js?v39"></script>
+ <script type="text/javascript" src="http://my-netdata.io/dashboard.js?v39"></script>
</head>
<body>
@@ -63,8 +63,8 @@
data-gauge-max-value="32767"
data-width="45%"
data-after="-600"
- data-points="40"
- data-title="Updates Every 15&nbsp;Sec"
+ data-points="60"
+ data-title="Updates Every 10&nbsp;Sec"
data-units="important metric"
data-colors="#C55"
></div>
@@ -106,8 +106,8 @@
data-easypiechart-max-value="32767"
data-width="45%"
data-after="-600"
- data-points="40"
- data-title="Updates Every 15&nbsp;Sec&nbsp;(<a href='https://github.com/OpenTSDB/opentsdb.net/blob/gh-pages/docs/source/user_guide/utilities/tcollector.rst#collecting-lots-of-metrics-with-tcollector' target='_blank'>OpenTSDB</a>)"
+ data-points="60"
+ data-title="Updates Every 10&nbsp;Sec"
data-units="important metric"
data-colors="#C55"
></div>
@@ -118,7 +118,7 @@
data-width="45%"
data-after="-600"
data-points="2"
- data-title="Updates Every 5&nbsp;Mins&nbsp;(your&nbsp;NMS)"
+ data-title="Updates Every 5&nbsp;Mins"
data-units="important metric"
data-colors="#C55"
></div>