diff options
author | Costa Tsaousis <costa@netdata.cloud> | 2023-06-07 21:10:27 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-07 21:10:27 +0300 |
commit | 66c85460199dbf65aad09cdfcdbae25c6bde265b (patch) | |
tree | a77e1f19d21f429fbc73ff8c71660cfb97c934ed /daemon | |
parent | 892255b23728fde076402b7300f13c80de32e5fc (diff) |
Re-write of SSL support in Netdata; restoration of SIGCHLD; detection of stale plugins; streaming improvements (#15113)
* add information about streaming connections to /api/v2/nodes; reset defer time when sender or receivers connect or disconnect
* make each streaming destination respect its SSL settings
* do not send SSL traffic over a non-SSL connection
* keep track of outgoing streaming connection attempts
* retry SSL reads when SSL_read() returns SSL_ERROR_WANT_READ
* Revert "retry SSL reads when SSL_read() returns SSL_ERROR_WANT_READ"
This reverts commit 14c858677c6f2d3b08c94f298e2f45ecdb74c801.
* cleanup SSL connections properly
* initialize SSL in rpt before takeover
* sender should free SSL when talking to a non-SSL destination
* do not shutdown SSL when receiver exits
* restore operation of SIGCHLD when the reaper is not enabled
* create an fgets function that checks for data and times out
* work on error handling of plugins exiting
* remove newlines from logs
* global call to waitid(), caching the result for netdata_pclose() to process
* receiver tid
* parser times out in 2 minutes instead of 10
* fix crash when UUID is NULL in SQLite
* abstract sqlite3 parsing for uuid and text
* write proper ssl errors on read and write
* fix for SSL_ERROR_WANT_RETRY_VERIFY
* SSL WANT per function
* unified SSL error logging
* fix compilation warning
* additional logging about parser cleanup
* streaming parser should call the pluginsd parser cleanup
* SSL error handling work
* SSL initialization unification
* check for pending data when receiving SSL response with timeout
* macro to check if an SSL connection has been established
* remove SSL_pending()
* check for SSL macros
* use SSL_peek() to find if there is a response
* SSL renames
* more SSL renames & cleanup
* rrdpush ssl connection function
* abstract all SSL functions into security.c
* keep track of SSL connections and always attempt to use SSL read/write when on SSL connection
* signal openssl to skip certificate validation when configured to do so
* better SSL error handling and logging
* SSL code cleanup
* SSL retry on SSL_connect and SSL_accept
* SSL provide default return value for old compilers
* SSL read/write functions emulate system read/write functions
* fix receive/send timeout and switch from SSL_peek() to SSL_pending()
* remove SSL_pending()
* removed sender auto-retry and debug info for initial receiver response
* ssl skip certificate verification config for web server
* ssl errors log ip and port of the peer
* keep ssl with web_client for its whole lifetime
* thread safe socket peers to text
* use error_limit() for common ssl errors
* cleanup
* more cleanup
* coverity fixes
* ssl error logs include both local and remote ip/port info
* remove obsolete code
Diffstat (limited to 'daemon')
-rw-r--r-- | daemon/analytics.c | 8 | ||||
-rw-r--r-- | daemon/main.c | 4 | ||||
-rw-r--r-- | daemon/signals.c | 98 |
3 files changed, 41 insertions, 69 deletions
diff --git a/daemon/analytics.c b/daemon/analytics.c index b3c802b86c..2689886bd7 100644 --- a/daemon/analytics.c +++ b/daemon/analytics.c @@ -375,8 +375,12 @@ void analytics_https(void) BUFFER *b = buffer_create(30, NULL); #ifdef ENABLE_HTTPS analytics_exporting_connectors_ssl(b); - buffer_strcat(b, netdata_ssl_client_ctx && rrdhost_flag_check(localhost, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED) && localhost->sender->ssl.flags == NETDATA_SSL_HANDSHAKE_COMPLETE ? "streaming|" : "|"); - buffer_strcat(b, netdata_ssl_srv_ctx ? "web" : ""); + + buffer_strcat(b, netdata_ssl_streaming_sender_ctx && + rrdhost_flag_check(localhost, RRDHOST_FLAG_RRDPUSH_SENDER_CONNECTED) && + SSL_connection(&localhost->sender->ssl) ? "streaming|" : "|"); + + buffer_strcat(b, netdata_ssl_web_server_ctx ? "web" : ""); #else buffer_strcat(b, "||"); #endif diff --git a/daemon/main.c b/daemon/main.c index 0805b21773..cff6530f3f 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -482,7 +482,7 @@ void netdata_cleanup_and_exit(int ret) { #ifdef ENABLE_HTTPS delta_shutdown_time("free openssl structures"); - security_clean_openssl(); + netdata_ssl_cleanup(); #endif delta_shutdown_time("remove incomplete shutdown file"); @@ -834,7 +834,7 @@ static void security_init(){ tls_version = config_get(CONFIG_SECTION_WEB, "tls version", "1.3"); tls_ciphers = config_get(CONFIG_SECTION_WEB, "tls ciphers", "none"); - security_openssl_library(); + netdata_ssl_initialize_openssl(); } #endif diff --git a/daemon/signals.c b/daemon/signals.c index c857a9b578..3699010ce6 100644 --- a/daemon/signals.c +++ b/daemon/signals.c @@ -2,8 +2,6 @@ #include "common.h" -static int reaper_enabled = 0; - typedef enum signal_action { NETDATA_SIGNAL_END_OF_LIST, NETDATA_SIGNAL_IGNORE, @@ -78,16 +76,6 @@ void signals_init(void) { struct sigaction sa; sa.sa_flags = 0; - // Enable process tracking / reaper if running as init (pid == 1). - // This prevents zombie processes when running in a container. 
- if (getpid() == 1) { - info("SIGNAL: Enabling reaper"); - netdata_popen_tracking_init(); - reaper_enabled = 1; - } else { - info("SIGNAL: Not enabling reaper"); - } - // ignore all signals while we run in a signal handler sigfillset(&sa.sa_mask); @@ -97,10 +85,6 @@ void signals_init(void) { case NETDATA_SIGNAL_IGNORE: sa.sa_handler = SIG_IGN; break; - case NETDATA_SIGNAL_CHILD: - if (reaper_enabled == 0) - continue; - // FALLTHROUGH default: sa.sa_handler = signal_handler; break; @@ -115,9 +99,6 @@ void signals_restore_SIGCHLD(void) { struct sigaction sa; - if (reaper_enabled == 0) - return; - sa.sa_flags = 0; sigfillset(&sa.sa_mask); sa.sa_handler = signal_handler; @@ -137,9 +118,6 @@ void signals_reset(void) { if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1) error("SIGNAL: Failed to reset signal handler for: %s", signals_waiting[i].name); } - - if (reaper_enabled == 1) - netdata_popen_tracking_cleanup(); } // reap_child reaps the child identified by pid. @@ -147,39 +125,42 @@ static void reap_child(pid_t pid) { siginfo_t i; errno = 0; - debug(D_CHILDS, "SIGNAL: Reaping pid: %d...", pid); - if (waitid(P_PID, (id_t)pid, &i, WEXITED|WNOHANG) == -1) { + debug(D_CHILDS, "SIGNAL: reap_child(%d)...", pid); + if (netdata_waitid(P_PID, (id_t)pid, &i, WEXITED|WNOHANG) == -1) { if (errno != ECHILD) - error("SIGNAL: Failed to wait for: %d", pid); + error("SIGNAL: waitid(%d): failed to wait for child", pid); else - debug(D_CHILDS, "SIGNAL: Already reaped: %d", pid); + info("SIGNAL: waitid(%d): failed - it seems the child is already reaped", pid); return; - } else if (i.si_pid == 0) { + } + else if (i.si_pid == 0) { // Process didn't exit, this shouldn't happen. 
+ error("SIGNAL: waitid(%d): reports pid 0 - child has not exited", pid); return; } switch (i.si_code) { - case CLD_EXITED: - debug(D_CHILDS, "SIGNAL: Child %d exited: %d", pid, i.si_status); - break; - case CLD_KILLED: - debug(D_CHILDS, "SIGNAL: Child %d killed by signal: %d", pid, i.si_status); - break; - case CLD_DUMPED: - debug(D_CHILDS, "SIGNAL: Child %d dumped core by signal: %d", pid, i.si_status); - break; - case CLD_STOPPED: - debug(D_CHILDS, "SIGNAL: Child %d stopped by signal: %d", pid, i.si_status); - break; - case CLD_TRAPPED: - debug(D_CHILDS, "SIGNAL: Child %d trapped by signal: %d", pid, i.si_status); - break; - case CLD_CONTINUED: - debug(D_CHILDS, "SIGNAL: Child %d continued by signal: %d", pid, i.si_status); - break; - default: - debug(D_CHILDS, "SIGNAL: Child %d gave us a SIGCHLD with code %d and status %d.", pid, i.si_code, i.si_status); + case CLD_EXITED: + info("SIGNAL: reap_child(%d) exited with code: %d", pid, i.si_status); + break; + case CLD_KILLED: + info("SIGNAL: reap_child(%d) killed by signal: %d", pid, i.si_status); + break; + case CLD_DUMPED: + info("SIGNAL: reap_child(%d) dumped core by signal: %d", pid, i.si_status); + break; + case CLD_STOPPED: + info("SIGNAL: reap_child(%d) stopped by signal: %d", pid, i.si_status); + break; + case CLD_TRAPPED: + info("SIGNAL: reap_child(%d) trapped by signal: %d", pid, i.si_status); + break; + case CLD_CONTINUED: + info("SIGNAL: reap_child(%d) continued by signal: %d", pid, i.si_status); + break; + default: + info("SIGNAL: reap_child(%d) gave us a SIGCHLD with code %d and status %d.", pid, i.si_code, i.si_status); + break; } } @@ -187,25 +168,13 @@ static void reap_child(pid_t pid) { static void reap_children() { siginfo_t i; - while (1 == 1) { - // Identify which process caused the signal so we can determine - // if we need to reap a re-parented process. 
+ while(1) { i.si_pid = 0; - if (waitid(P_ALL, (id_t)0, &i, WEXITED|WNOHANG|WNOWAIT) == -1) { - if (errno != ECHILD) // This shouldn't happen with WNOHANG but does. - error("SIGNAL: Failed to wait"); - return; - } else if (i.si_pid == 0) { - // No child exited. + if (netdata_waitid(P_ALL, (id_t)0, &i, WEXITED|WNOHANG|WNOWAIT) == -1 || i.si_pid == 0) + // nothing to do return; - } else if (netdata_popen_tracking_pid_shoud_be_reaped(i.si_pid) == 0) { - // myp managed, sleep for a short time to avoid busy wait while - // this is handled by myp. - usleep(10000); - } else { - // Unknown process, likely a re-parented child, reap it. - reap_child(i.si_pid); - } + + reap_child(i.si_pid); } } @@ -267,7 +236,6 @@ void signals_handle(void) { break; case NETDATA_SIGNAL_CHILD: - debug(D_CHILDS, "SIGNAL: Received %s. Reaping...", name); reap_children(); break; |