summaryrefslogtreecommitdiffstats
path: root/daemon/main.c
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2023-01-11 23:11:21 +0200
committerGitHub <noreply@github.com>2023-01-11 23:11:21 +0200
commit3a77de1f54b152e677f9261bfdf38532e0151f72 (patch)
tree9dfb2258dfff5071766459e1fc24dcbd8fdf6f79 /daemon/main.c
parentb7a2125da386eb75c997c6d4afbe3db0a5f3abcc (diff)
profile startup and shutdown timings (#14243)
* profile startup and shutdown timings * waste less time waiting for threads to exit
Diffstat (limited to 'daemon/main.c')
-rw-r--r--daemon/main.c172
1 files changed, 136 insertions, 36 deletions
diff --git a/daemon/main.c b/daemon/main.c
index 4ce0360b97..c97735180b 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -226,7 +226,8 @@ static bool service_wait_exit(SERVICE_TYPE service, usec_t timeout_ut) {
// signal them to stop
size_t last_running = 0;
size_t stale_time_ut = 0;
- usec_t sleep_ut = 500 * USEC_PER_MS;
+ usec_t sleep_ut = 50 * USEC_PER_MS;
+ size_t log_countdown_ut = sleep_ut;
do {
if(running != last_running)
stale_time_ut = 0;
@@ -257,11 +258,17 @@ static bool service_wait_exit(SERVICE_TYPE service, usec_t timeout_ut) {
netdata_spinlock_unlock(&service_globals.lock);
if(running) {
- buffer_flush(service_list);
- service_to_buffer(service_list, running_services);
- info("SERVICE CONTROL: waiting for the following %zu services [ %s] to exit: %s",
- running, buffer_tostring(service_list),
- running <= 10 ? buffer_tostring(thread_list) : "");
+ log_countdown_ut -= (log_countdown_ut >= sleep_ut) ? sleep_ut : log_countdown_ut;
+ if(log_countdown_ut == 0 || running != last_running) {
+ log_countdown_ut = 20 * sleep_ut;
+
+ buffer_flush(service_list);
+ service_to_buffer(service_list, running_services);
+ info("SERVICE CONTROL: waiting for the following %zu services [ %s] to exit: %s",
+ running, buffer_tostring(service_list),
+ running <= 10 ? buffer_tostring(thread_list) : "");
+ }
+
sleep_usec(sleep_ut);
stale_time_ut += sleep_ut;
}
@@ -285,18 +292,39 @@ static bool service_wait_exit(SERVICE_TYPE service, usec_t timeout_ut) {
return (running == 0);
}
+#define delta_shutdown_time(msg) \
+ { \
+ usec_t now_ut = now_monotonic_usec(); \
+ if(prev_msg) \
+ info("NETDATA SHUTDOWN: in %7llu ms, %s%s - next: %s", (now_ut - last_ut) / USEC_PER_MS, (timeout)?"(TIMEOUT) ":"", prev_msg, msg); \
+ else \
+ info("NETDATA SHUTDOWN: next: %s", msg); \
+ last_ut = now_ut; \
+ prev_msg = msg; \
+ timeout = false; \
+ }
+
void netdata_cleanup_and_exit(int ret) {
+ usec_t started_ut = now_monotonic_usec();
+ usec_t last_ut = started_ut;
+ const char *prev_msg = NULL;
+ bool timeout = false;
+
error_log_limit_unlimited();
- info("EXIT: netdata prepares to exit with code %d...", ret);
+ info("NETDATA SHUTDOWN: initializing shutdown with code %d...", ret);
send_statistics("EXIT", ret?"ERROR":"OK","-");
+ delta_shutdown_time("create shutdown file");
+
char agent_crash_file[FILENAME_MAX + 1];
char agent_incomplete_shutdown_file[FILENAME_MAX + 1];
snprintfz(agent_crash_file, FILENAME_MAX, "%s/.agent_crash", netdata_configured_varlib_dir);
snprintfz(agent_incomplete_shutdown_file, FILENAME_MAX, "%s/.agent_incomplete_shutdown", netdata_configured_varlib_dir);
(void) rename(agent_crash_file, agent_incomplete_shutdown_file);
+ delta_shutdown_time("disable maintenance, new queries, new web requests, new streaming connections and aclk");
+
service_signal_exit(
SERVICE_MAINTENANCE
| ABILITY_DATA_QUERIES
@@ -305,7 +333,9 @@ void netdata_cleanup_and_exit(int ret) {
| SERVICE_ACLK
);
- service_wait_exit(
+ delta_shutdown_time("stop replication, exporters, ML training, health and web servers threads");
+
+ timeout = !service_wait_exit(
SERVICE_REPLICATION
| SERVICE_EXPORTERS
| SERVICE_ML_TRAINING
@@ -313,38 +343,51 @@ void netdata_cleanup_and_exit(int ret) {
| SERVICE_WEB_SERVER
, 3 * USEC_PER_SEC);
- service_wait_exit(
+ delta_shutdown_time("stop collectors and streaming threads");
+
+ timeout = !service_wait_exit(
SERVICE_COLLECTORS
| SERVICE_STREAMING
, 3 * USEC_PER_SEC);
- service_wait_exit(
+ delta_shutdown_time("stop ML prediction and context threads");
+
+ timeout = !service_wait_exit(
SERVICE_ML_PREDICTION
| SERVICE_CONTEXT
, 3 * USEC_PER_SEC);
- service_wait_exit(
+ delta_shutdown_time("stop maintenance thread");
+
+ timeout = !service_wait_exit(
SERVICE_MAINTENANCE
, 3 * USEC_PER_SEC);
- info("EXIT: cleaning up the database...");
+ delta_shutdown_time("clean rrdhost database");
+
rrdhost_cleanup_all();
- info("EXIT: metasync shutdown prepare...");
+ delta_shutdown_time("prepare metasync shutdown");
+
metadata_sync_shutdown_prepare();
#ifdef ENABLE_ACLK
+ delta_shutdown_time("signal aclk sync to stop");
aclk_sync_exit_all();
#endif
- service_wait_exit(
+ delta_shutdown_time("stop aclk threads");
+
+ timeout = !service_wait_exit(
SERVICE_ACLK
, 3 * USEC_PER_SEC);
- // stop everything else
- service_wait_exit(~0, 10 * USEC_PER_SEC);
+ delta_shutdown_time("stop all remaining worker threads");
+
+ timeout = !service_wait_exit(~0, 10 * USEC_PER_SEC);
+
+ delta_shutdown_time("cancel main threads");
- info("EXIT: stopping static threads...");
cancel_main_threads();
if(!ret) {
@@ -352,42 +395,59 @@ void netdata_cleanup_and_exit(int ret) {
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
- info("EXIT: flushing dbengine...");
+ delta_shutdown_time("flush dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_prepare_exit(multidb_ctx[tier]);
}
#endif
// free the database
- info("EXIT: freeing database memory...");
+ delta_shutdown_time("free rrdhost structures");
+
rrdhost_free_all();
+ delta_shutdown_time("stop metasync threads");
+
metadata_sync_shutdown();
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
- info("EXIT: stopping dbengine...");
+ delta_shutdown_time("stop dbengine tiers");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_exit(multidb_ctx[tier]);
}
#endif
}
+ delta_shutdown_time("close SQL context db");
+
sql_close_context_database();
+
+ delta_shutdown_time("closed SQL main db");
+
sql_close_database();
// unlink the pid
if(pidfile[0]) {
- info("EXIT: removing netdata PID file '%s'...", pidfile);
+ delta_shutdown_time("remove pid file");
+
if(unlink(pidfile) != 0)
error("EXIT: cannot unlink pidfile '%s'.", pidfile);
}
#ifdef ENABLE_HTTPS
+ delta_shutdown_time("free openssl structures");
security_clean_openssl();
#endif
- info("EXIT: all done - netdata is now exiting - bye bye...");
+
+ delta_shutdown_time("remove incomplete shutdown file");
+
(void) unlink(agent_incomplete_shutdown_file);
+
+ delta_shutdown_time("exit");
+
+ usec_t ended_ut = now_monotonic_usec();
+ info("NETDATA SHUTDOWN: completed in %llu ms - netdata is now exiting - bye bye...", (ended_ut - started_ut) / USEC_PER_MS);
exit(ret);
}
@@ -1178,11 +1238,28 @@ void post_conf_load(char **user)
appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", DEFAULT_CLOUD_BASE_URL);
}
+#define delta_startup_time(msg) \
+ { \
+ usec_t now_ut = now_monotonic_usec(); \
+ if(prev_msg) \
+ info("NETDATA STARTUP: in %7llu ms, %s - next: %s", (now_ut - last_ut) / USEC_PER_MS, prev_msg, msg); \
+ else \
+ info("NETDATA STARTUP: next: %s", msg); \
+ last_ut = now_ut; \
+ prev_msg = msg; \
+ }
+
int pgc_unittest(void);
int mrg_unittest(void);
int julytest(void);
int main(int argc, char **argv) {
+ // initialize the system clocks
+ clocks_init();
+ usec_t started_ut = now_monotonic_usec();
+ usec_t last_ut = started_ut;
+ const char *prev_msg = NULL;
+
int i;
int config_loaded = 0;
int dont_fork = 0;
@@ -1629,9 +1706,7 @@ int main(int argc, char **argv) {
}
#endif
-
- if(!config_loaded)
- {
+ if(!config_loaded) {
load_netdata_conf(NULL, 0);
post_conf_load(&user);
load_cloud_conf(0);
@@ -1642,7 +1717,6 @@ int main(int argc, char **argv) {
appconfig_set(&cloud_config, CONFIG_SECTION_GLOBAL, "enabled", "false");
}
-
// ------------------------------------------------------------------------
// initialize netdata
{
@@ -1662,9 +1736,6 @@ int main(int argc, char **argv) {
#endif
#endif
- // initialize the system clocks
- clocks_init();
-
// set libuv worker threads
libuv_worker_threads = get_system_cpus() * 2;
@@ -1749,6 +1820,7 @@ int main(int argc, char **argv) {
// --------------------------------------------------------------------
// Initialize ML configuration
+ delta_startup_time("initialize ML");
ml_init();
// --------------------------------------------------------------------
@@ -1757,18 +1829,18 @@ int main(int argc, char **argv) {
// block signals while initializing threads.
// this causes the threads to block signals.
+ delta_startup_time("initialize signals");
signals_block();
+ signals_init(); // setup the signals we want to use
- // setup the signals we want to use
+ // --------------------------------------------------------------------
+ // check which threads are enabled and initialize them
- signals_init();
+ delta_startup_time("initialize static threads");
// setup threads configs
default_stacksize = netdata_threads_init();
- // --------------------------------------------------------------------
- // check which threads are enabled and initialize them
-
for (i = 0; static_threads[i].name != NULL ; i++) {
struct netdata_static_thread *st = &static_threads[i];
@@ -1788,14 +1860,17 @@ int main(int argc, char **argv) {
// --------------------------------------------------------------------
// create the listening sockets
+ delta_startup_time("initialize web server");
+
web_client_api_v1_init();
web_server_threading_selection();
if(web_server_mode != WEB_SERVER_MODE_NONE)
api_listen_sockets_setup();
-
}
+ delta_startup_time("set resource limits");
+
#ifdef NETDATA_INTERNAL_CHECKS
if(debug_flags != 0) {
struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };
@@ -1813,18 +1888,27 @@ int main(int argc, char **argv) {
else
info("resources control: allowed file descriptors: soft = %zu, max = %zu", (size_t)rlimit_nofile.rlim_cur, (size_t)rlimit_nofile.rlim_max);
+
+ delta_startup_time("become daemon");
+
// fork, switch user, create pid file, set process priority
if(become_daemon(dont_fork, user) == -1)
fatal("Cannot daemonize myself.");
info("netdata started on pid %d.", getpid());
+ delta_startup_time("initialize threads after fork");
+
netdata_threads_init_after_fork((size_t)config_get_number(CONFIG_SECTION_GLOBAL, "pthread stack size", (long)default_stacksize));
// initialize internal registry
+ delta_startup_time("initialize registry");
registry_init();
+
// fork the spawn server
+ delta_startup_time("fork the spawn server");
spawn_init();
+
/*
* Libuv uv_spawn() uses SIGCHLD internally:
* https://github.com/libuv/libuv/blob/cc51217a317e96510fbb284721d5e6bc2af31e33/src/unix/process.c#L485
@@ -1837,15 +1921,21 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// initialize rrd, registry, health, rrdpush, etc.
+ delta_startup_time("collecting system info");
+
netdata_anonymous_statistics_enabled=-1;
struct rrdhost_system_info *system_info = callocz(1, sizeof(struct rrdhost_system_info));
get_system_info(system_info);
system_info->hops = 0;
get_install_type(&system_info->install_type, &system_info->prebuilt_arch, &system_info->prebuilt_dist);
+ delta_startup_time("initialize RRD structures");
+
if(rrd_init(netdata_configured_hostname, system_info, false))
fatal("Cannot initialize localhost instance with name '%s'.", netdata_configured_hostname);
+ delta_startup_time("check for incomplete shutdown");
+
char agent_crash_file[FILENAME_MAX + 1];
char agent_incomplete_shutdown_file[FILENAME_MAX + 1];
snprintfz(agent_incomplete_shutdown_file, FILENAME_MAX, "%s/.agent_incomplete_shutdown", netdata_configured_varlib_dir);
@@ -1860,6 +1950,8 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// Claim netdata agent to a cloud endpoint
+ delta_startup_time("collect claiming info");
+
if (claiming_pending_arguments)
claim_agent(claiming_pending_arguments);
load_claiming_state();
@@ -1870,11 +1962,14 @@ int main(int argc, char **argv) {
error_log_limit_reset();
// Load host labels
+ delta_startup_time("collect host labels");
reload_host_labels();
// ------------------------------------------------------------------------
// spawn the threads
+ delta_startup_time("start the static threads");
+
web_server_config_options();
netdata_zero_metrics_enabled = config_get_boolean_ondemand(CONFIG_SECTION_DB, "enable zero metrics", CONFIG_BOOLEAN_NO);
@@ -1895,9 +1990,14 @@ int main(int argc, char **argv) {
// ------------------------------------------------------------------------
// Initialize netdata agent command serving from cli and signals
+ delta_startup_time("initialize commands API");
+
commands_init();
- info("netdata initialization completed. Enjoy real-time performance monitoring!");
+ delta_startup_time("ready");
+
+ usec_t ready_ut = now_monotonic_usec();
+ info("NETDATA STARTUP: completed in %llu ms. Enjoy real-time performance monitoring!", (ready_ut - started_ut) / USEC_PER_MS);
netdata_ready = 1;
send_statistics("START", "-", "-");