summaryrefslogtreecommitdiffstats
path: root/daemon/main.c
diff options
context:
space:
mode:
authorCosta Tsaousis <costa@netdata.cloud>2023-01-10 19:59:21 +0200
committerGitHub <noreply@github.com>2023-01-10 19:59:21 +0200
commit368a26cfee6887ca0cb2301d93138f63b75e353a (patch)
treeb57e39fdb78dc57f7a2c1fcc3d9b6bf3c2a2a113 /daemon/main.c
parentb513888be389f92b2323d1bb3fdf55c22d4e4bad (diff)
DBENGINE v2 (#14125)
* count open cache pages refering to datafile * eliminate waste flush attempts * remove eliminated variable * journal v2 scanning split functions * avoid locking open cache for a long time while migrating to journal v2 * dont acquire datafile for the loop; disable thread cancelability while a query is running * work on datafile acquiring * work on datafile deletion * work on datafile deletion again * logs of dbengine should start with DBENGINE * thread specific key for queries to check if a query finishes without a finalize * page_uuid is not used anymore * Cleanup judy traversal when building new v2 Remove not needed calls to metric registry * metric is 8 bytes smaller; timestamps are protected with a spinlock; timestamps in metric are now always coherent * disable checks for invalid time-ranges * Remove type from page details * report scanning time * remove infinite loop from datafile acquire for deletion * remove infinite loop from datafile acquire for deletion again * trace query handles * properly allocate array of dimensions in replication * metrics cleanup * metrics registry uses arrayalloc * arrayalloc free should be protected by lock * use array alloc in page cache * journal v2 scanning fix * datafile reference leaking hunding * do not load metrics of future timestamps * initialize reasons * fix datafile reference leak * do not load pages that are entirely overlapped by others * expand metric retention atomically * split replication logic in initialization and execution * replication prepare ahead queries * replication prepare ahead queries fixed * fix replication workers accounting * add router active queries chart * restore accounting of pages metadata sources; cleanup replication * dont count skipped pages as unroutable * notes on services shutdown * do not migrate to journal v2 too early, while it has pending dirty pages in the main cache for the specific journal file * do not add pages we dont need to pdc * time in range re-work to provide info about 
past and future matches * finner control on the pages selected for processing; accounting of page related issues * fix invalid reference to handle->page * eliminate data collection handle of pg_lookup_next * accounting for queries with gaps * query preprocessing the same way the processing is done; cache now supports all operations on Judy * dynamic libuv workers based on number of processors; minimum libuv workers 8; replication query init ahead uses libuv workers - reserved ones (3) * get into pdc all matching pages from main cache and open cache; do not do v2 scan if main cache and open cache can satisfy the query * finner gaps calculation; accounting of overlapping pages in queries * fix gaps accounting * move datafile deletion to worker thread * tune libuv workers and thread stack size * stop netdata threads gradually * run indexing together with cache flush/evict * more work on clean shutdown * limit the number of pages to evict per run * do not lock the clean queue for accesses if it is not possible at that time - the page will be moved to the back of the list during eviction * economies on flags for smaller page footprint; cleanup and renames * eviction moves referenced pages to the end of the queue * use murmur hash for indexing partition * murmur should be static * use more indexing partitions * revert number of partitions to number of cpus * cancel threads first, then stop services * revert default thread stack size * dont execute replication requests of disconnected senders * wait more time for services that are exiting gradually * fixed last commit * finer control on page selection algorithm * default stacksize of 1MB * fix formatting * fix worker utilization going crazy when the number is rotating * avoid buffer full due to replication preprocessing of requests * support query priorities * add count of spins in spinlock when compiled with netdata internal checks * remove prioritization from dbengine queries; cache now uses mutexes for the queues * hot 
pages are now in sections judy arrays, like dirty * align replication queries to optimal page size * during flushing add to clean and evict in batches * Revert "during flushing add to clean and evict in batches" This reverts commit 8fb2b69d068499eacea6de8291c336e5e9f197c7. * dont lock clean while evicting pages during flushing * Revert "dont lock clean while evicting pages during flushing" This reverts commit d6c82b5f40aeba86fc7aead062fab1b819ba58b3. * Revert "Revert "during flushing add to clean and evict in batches"" This reverts commit ca7a187537fb8f743992700427e13042561211ec. * dont cross locks during flushing, for the fastest flushes possible * low-priority queries load pages synchronously * Revert "low-priority queries load pages synchronously" This reverts commit 1ef2662ddcd20fe5842b856c716df134c42d1dc7. * cache uses spinlock again * during flushing, dont lock the clean queue at all; each item is added atomically * do smaller eviction runs * evict one page at a time to minimize lock contention on the clean queue * fix eviction statistics * fix last commit * plain should be main cache * event loop cleanup; evictions and flushes can now happen concurrently * run flush and evictions from tier0 only * remove not needed variables * flushing open cache is not needed; flushing protection is irrelevant since flushing is global for all tiers; added protection to datafiles so that only one flusher can run per datafile at any given time * added worker jobs in timer to find the slow part of it * support fast eviction of pages when all_of_them is set * revert default thread stack size * bypass event loop for dispatching read extent commands to workers - send them directly * Revert "bypass event loop for dispatching read extent commands to workers - send them directly" This reverts commit 2c08bc5bab12881ae33bc73ce5dea03dfc4e1fce. 
* cache work requests * minimize memory operations during flushing; caching of extent_io_descriptors and page_descriptors * publish flushed pages to open cache in the thread pool * prevent eventloop requests from getting stacked in the event loop * single threaded dbengine controller; support priorities for all queries; major cleanup and restructuring of rrdengine.c * more rrdengine.c cleanup * enable db rotation * do not log when there is a filter * do not run multiple migration to journal v2 * load all extents async * fix wrong paste * report opcodes waiting, works dispatched, works executing * cleanup event loop memory every 10 minutes * dont dispatch more work requests than the number of threads available * use the dispatched counter instead of the executing counter to check if the worker thread pool is full * remove UV_RUN_NOWAIT * replication to fill the queues * caching of extent buffers; code cleanup * caching of pdc and pd; rework on journal v2 indexing, datafile creation, database rotation * single transaction wal * synchronous flushing * first cancel the threads, then signal them to exit * caching of rrdeng query handles; added priority to query target; health is now low prio * add priority to the missing points; do not allow critical priority in queries * offload query preparation and routing to libuv thread pool * updated timing charts for the offloaded query preparation * caching of WALs * accounting for struct caches (buffers); do not load extents with invalid sizes * protection against memory booming during replication due to the optimal alignment of pages; sender thread buffer is now also reset when the circular buffer is reset * also check if the expanded before is not the chart later updated time * also check if the expanded before is not after the wall clock time of when the query started * Remove unused variable * replication to queue less queries; cleanup of internal fatals * Mark dimension to be updated async * caching of 
extent_page_details_list (epdl) and datafile_extent_offset_list (deol) * disable pgc stress test, under an ifdef * disable mrg stress test under an ifdef * Mark chart and host labels, host info for async check and store in the database * dictionary items use arrayalloc * cache section pages structure is allocated with arrayalloc * Add function to wakeup the aclk query threads and check for exit Register function to be called during shutdown after signaling the service to exit * parallel preparation of all dimensions of queries * be more sensitive to enable streaming after replication * atomically finish chart replication * fix last commit * fix last commit again * fix last commit again again * fix last commit again again again * unify the normalization of retention calculation for collected charts; do not enable streaming if more than 60 points are to be transferred; eliminate an allocation during replication * do not cancel start streaming; use high priority queries when we have locked chart data collection * prevent starvation on opcodes execution, by allowing 2% of the requests to be re-ordered * opcode now uses 2 spinlocks one for the caching of allocations and one for the waiting queue * Remove check locks and NETDATA_VERIFY_LOCKS as it is not needed anymore * Fix bad memory allocation / cleanup * Cleanup ACLK sync initialization (part 1) * Don't update metric registry during shutdown (part 1) * Prevent crash when dashboard is refreshed and host goes away * Mark ctx that is shutting down. Test not adding flushed pages to open cache as hot if we are shutting down * make ML work * Fix compile without NETDATA_INTERNAL_CHECKS * shutdown each ctx independently * fix completion of quiesce * do not update shared ML charts * Create ML charts on child hosts. When a parent runs a ML for a child, the relevant-ML charts should be created on the child host. These charts should use the parent's hostname to differentiate multiple parents that might run ML for a child. 
The only exception to this rule is the training/prediction resource usage charts. These are created on the localhost of the parent host, because they provide information specific to said host. * check new ml code * first save the database, then free all memory * dbengine prep exit before freeing all memory; fixed deadlock in cache hot to dirty; added missing check to query engine about metrics without any data in the db * Cleanup metadata thread (part 2) * increase refcount before dispatching prep command * Do not try to stop anomaly detection threads twice. A separate function call has been added to stop anomaly detection threads. This commit removes the left over function calls that were made internally when a host was being created/destroyed. * Remove allocations when smoothing samples buffer The number of dims per sample is always 1, ie. we are training and predicting only individual dimensions. * set the orphan flag when loading archived hosts * track worker dispatch callbacks and threadpool worker init * make ML threads joinable; mark ctx having flushing in progress as early as possible * fix allocation counter * Cleanup metadata thread (part 3) * Cleanup metadata thread (part 4) * Skip metadata host scan when running unittest * unittest support during init * dont use all the libuv threads for queries * break an infinite loop when sleep_usec() is interrupted * ml prediction is a collector for several charts * sleep_usec() now makes sure it will never loop if it passes the time expected; sleep_usec() now uses nanosleep() because clock_nanosleep() misses signals on netdata exit * worker_unregister() in netdata threads cleanup * moved pdc/epdl/deol/extent_buffer related code to pdc.c and pdc.h * fixed ML issues * removed engine2 directory * added dbengine2 files in CMakeLists.txt * move query plan data to query target, so that they can be exposed by in jsonwrap * uniform definition of query plan according to the other query target members * event_loop should be 
in daemon, not libnetdata * metric_retention_by_uuid() is now part of the storage engine abstraction * unify time_t variables to have the suffix _s (meaning: seconds) * old dbengine statistics become "dbengine io" * do not enable ML resource usage charts by default * unify ml chart families, plugins and modules * cleanup query plans from query target * cleanup all extent buffers * added debug info for rrddim slot to time * rrddim now does proper gap management * full rewrite of the mem modes * use library functions for madvise * use CHECKSUM_SZ for the checksum size * fix coverity warning about the impossible case of returning a page that is entirely in the past of the query * fix dbengine shutdown * keep the old datafile lock until a new datafile has been created, to avoid creating multiple datafiles concurrently * fine tune cache evictions * dont initialize health if the health service is not running - prevent crash on shutdown while children get connected * rename AS threads to ACLK[hostname] * prevent re-use of uninitialized memory in queries * use JulyL instead of JudyL for PDC operations - to test it first * add also JulyL files * fix July memory accounting * disable July for PDC (use Judy) * use the function to remove datafiles from linked list * fix july and event_loop * add july to libnetdata subdirs * rename time_t variables that end in _t to end in _s * replicate when there is a gap at the beginning of the replication period * reset postponing of sender connections when a receiver is connected * Adjust update every properly * fix replication infinite loop due to last change * packed enums in rrd.h and cleanup of obsolete rrd structure members * prevent deadlock in replication: replication_recalculate_buffer_used_ratio_unsafe() deadlocking with replication_sender_delete_pending_requests() * void unused variable * void unused variables * fix indentation * entries_by_time calculation in VD was wrong; restored internal checks for checking future timestamps * 
macros to calculate page entries by time and size * prevent statsd cleanup crash on exit * cleanup health thread related variables Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: vkalintiris <vasilis@netdata.cloud>
Diffstat (limited to 'daemon/main.c')
-rw-r--r--daemon/main.c387
1 files changed, 363 insertions, 24 deletions
diff --git a/daemon/main.c b/daemon/main.c
index 941412697f..4ce0360b97 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -8,6 +8,8 @@ bool unittest_running = false;
int netdata_zero_metrics_enabled;
int netdata_anonymous_statistics_enabled;
+int libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
+
struct netdata_static_thread *static_threads;
struct config netdata_config = {
@@ -23,11 +25,267 @@ struct config netdata_config = {
}
};
-void netdata_cleanup_and_exit(int ret) {
- // enabling this, is wrong
- // because the threads will be cancelled while cleaning up
- // netdata_exit = 1;
+typedef struct service_thread { // bookkeeping record for one thread registered through service_register()
+ pid_t tid; // value of gettid() at registration; also the key used in service_globals.pid_judy
+ SERVICE_THREAD_TYPE type; // thread flavour given at registration; service_wait_exit() switches on it
+ SERVICE_TYPE services; // bitmask accumulated by service_running() from the services this thread asked about
+ char name[NETDATA_THREAD_NAME_MAX + 1]; // filled by os_thread_get_current_name_np() at registration
+ bool cancelled; // set by service_wait_exit() so that a thread is cancelled at most once
+
+ union {
+ netdata_thread_t netdata_thread; // assigned when registered as SERVICE_THREAD_TYPE_NETDATA
+ uv_thread_t uv_thread; // assigned when registered as SERVICE_THREAD_TYPE_LIBUV
+ };
+
+ force_quit_t force_quit_callback; // optional; invoked with `data` by service_wait_exit() while cancelling
+ request_quit_t request_quit_callback; // optional; invoked with `data` by service_signal_exit()
+ void *data; // opaque argument handed to both callbacks
+} SERVICE_THREAD;
+
+struct service_globals { // file-wide state shared by the service_*() functions below
+ SERVICE_TYPE running; // bitmask of services still allowed to run; read and written with __atomic builtins
+ SPINLOCK lock; // held around every access to pid_judy in this file
+ Pvoid_t pid_judy; // JudyL array indexed by thread id, holding SERVICE_THREAD pointers
+} service_globals = {
+ .running = ~0, // every service bit starts set
+ .pid_judy = NULL,
+};
+
+SERVICE_THREAD *service_register(SERVICE_THREAD_TYPE thread_type, request_quit_t request_quit_callback, force_quit_t force_quit_callback, void *data, bool update __maybe_unused) { // returns the calling thread's SERVICE_THREAD, allocating it on the first call; `update` is unused
+ SERVICE_THREAD *sth = NULL;
+ pid_t tid = gettid(); // registration is always for the calling thread
+
+ netdata_spinlock_lock(&service_globals.lock);
+ Pvoid_t *PValue = JudyLIns(&service_globals.pid_judy, tid, PJE0); // NOTE(review): result is dereferenced below without a NULL/PJERR check - confirm JudyLIns cannot fail here
+ if(!*PValue) { // empty slot: this tid has not been registered yet
+ sth = callocz(1, sizeof(SERVICE_THREAD)); // zeroed, so `services` and `cancelled` start as 0/false
+ sth->tid = tid;
+ sth->type = thread_type;
+ sth->request_quit_callback = request_quit_callback;
+ sth->force_quit_callback = force_quit_callback;
+ sth->data = data;
+ os_thread_get_current_name_np(sth->name);
+ *PValue = sth;
+
+ switch(thread_type) { // store the native handle matching the thread flavour
+ case SERVICE_THREAD_TYPE_NETDATA:
+ sth->netdata_thread = netdata_thread_self();
+ break;
+
+ case SERVICE_THREAD_TYPE_LIBUV:
+ sth->uv_thread = uv_thread_self();
+ break;
+ }
+ }
+ else {
+ sth = *PValue; // already registered: the type, callbacks and data of this call are ignored
+ }
+ netdata_spinlock_unlock(&service_globals.lock);
+
+ return sth;
+}
+
+void service_exits(void) { // frees the calling thread's SERVICE_THREAD and removes it from pid_judy, if it is registered
+ pid_t tid = gettid();
+
+ netdata_spinlock_lock(&service_globals.lock);
+ Pvoid_t *PValue = JudyLGet(service_globals.pid_judy, tid, PJE0);
+ if(PValue) { // NULL means this thread never registered: nothing to do
+ freez(*PValue); // NOTE(review): the thread-local pointer cached inside service_running() is not reset - confirm the thread never calls service_running() after this
+ JudyLDel(&service_globals.pid_judy, tid, PJE0);
+ }
+ netdata_spinlock_unlock(&service_globals.lock);
+}
+
+bool service_running(SERVICE_TYPE service) { // true while every bit of `service` is still set in service_globals.running
+ static __thread SERVICE_THREAD *sth = NULL; // per-thread cache of this thread's registration
+
+ if(unlikely(!sth))
+ sth = service_register(SERVICE_THREAD_TYPE_NETDATA, NULL, NULL, NULL, false); // first call on this thread: register it, without callbacks
+
+ if(netdata_exit)
+ __atomic_store_n(&service_globals.running, 0, __ATOMIC_RELAXED); // the global exit flag turns every service off
+
+ if(service == 0)
+ service = sth->services; // 0 stands for the services this thread has asked about so far
+
+ sth->services |= service; // remembered so service_signal_exit()/service_wait_exit() can match this thread
+
+ return ((__atomic_load_n(&service_globals.running, __ATOMIC_RELAXED) & service) == service);
+}
+
+void service_signal_exit(SERVICE_TYPE service) { // clears the `service` bits from the running mask, then calls request_quit_callback of every matching registered thread
+ __atomic_and_fetch(&service_globals.running, ~(service), __ATOMIC_RELAXED); // from here on service_running() returns false for these bits
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) { // walk all registered threads
+ SERVICE_THREAD *sth = *PValue;
+
+ if((sth->services & service) && sth->request_quit_callback) {
+ netdata_spinlock_unlock(&service_globals.lock); // the callback runs without the lock held
+ sth->request_quit_callback(sth->data); // NOTE(review): sth could be freed by service_exits() while the lock is released - confirm
+ netdata_spinlock_lock(&service_globals.lock);
+ continue;
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+}
+
+static void service_to_buffer(BUFFER *wb, SERVICE_TYPE service) { // appends to wb one name per known bit set in `service`; each name is followed by a space
+ if(service & SERVICE_MAINTENANCE)
+ buffer_strcat(wb, "MAINTENANCE ");
+ if(service & SERVICE_COLLECTORS)
+ buffer_strcat(wb, "COLLECTORS ");
+ if(service & SERVICE_ML_TRAINING)
+ buffer_strcat(wb, "ML_TRAINING ");
+ if(service & SERVICE_ML_PREDICTION)
+ buffer_strcat(wb, "ML_PREDICTION ");
+ if(service & SERVICE_REPLICATION)
+ buffer_strcat(wb, "REPLICATION ");
+ if(service & ABILITY_DATA_QUERIES)
+ buffer_strcat(wb, "DATA_QUERIES ");
+ if(service & ABILITY_WEB_REQUESTS)
+ buffer_strcat(wb, "WEB_REQUESTS ");
+ if(service & SERVICE_WEB_SERVER)
+ buffer_strcat(wb, "WEB_SERVER ");
+ if(service & SERVICE_ACLK)
+ buffer_strcat(wb, "ACLK ");
+ if(service & SERVICE_HEALTH)
+ buffer_strcat(wb, "HEALTH ");
+ if(service & SERVICE_STREAMING)
+ buffer_strcat(wb, "STREAMING ");
+ if(service & ABILITY_STREAMING_CONNECTIONS)
+ buffer_strcat(wb, "STREAMING_CONNECTIONS ");
+ if(service & SERVICE_CONTEXT)
+ buffer_strcat(wb, "CONTEXT ");
+ if(service & SERVICE_ANALYTICS)
+ buffer_strcat(wb, "ANALYTICS ");
+ if(service & SERVICE_EXPORTERS)
+ buffer_strcat(wb, "EXPORTERS ");
+} // bits without a branch above add nothing to wb
+
+static bool service_wait_exit(SERVICE_TYPE service, usec_t timeout_ut) { // cancels threads running any of `service`, signals exit, then polls until none is registered; returns true when none remain
+ BUFFER *service_list = buffer_create(1024); // names of the services still running, for log messages
+ BUFFER *thread_list = buffer_create(1024); // names and ids of the threads still running, for log messages
+ usec_t started_ut = now_monotonic_usec(), ended_ut;
+ size_t running; // number of matching threads found by the latest scan
+ SERVICE_TYPE running_services = 0; // union of the requested service bits found on matching threads
+
+ // cancel the threads
+ running = 0;
+ running_services = 0;
+ {
+ buffer_flush(thread_list);
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) {
+ SERVICE_THREAD *sth = *PValue;
+ if(sth->services & service && sth->tid != gettid() && !sth->cancelled) { // skip the calling thread and threads already cancelled
+ sth->cancelled = true;
+
+ switch(sth->type) {
+ case SERVICE_THREAD_TYPE_NETDATA:
+ netdata_thread_cancel(sth->netdata_thread);
+ break;
+
+ case SERVICE_THREAD_TYPE_LIBUV: // no cancel call is made for libuv threads here
+ break;
+ }
+
+ if(running)
+ buffer_strcat(thread_list, ", ");
+
+ buffer_sprintf(thread_list, "'%s' (%d)", sth->name, sth->tid);
+
+ running++;
+ running_services |= sth->services & service;
+
+ if(sth->force_quit_callback) {
+ netdata_spinlock_unlock(&service_globals.lock); // the callback runs without the lock held
+ sth->force_quit_callback(sth->data); // NOTE(review): sth could be freed by service_exits() while the lock is released - confirm
+ netdata_spinlock_lock(&service_globals.lock);
+ continue;
+ }
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+ }
+
+ service_signal_exit(service);
+ // the call above signals them to stop; the loop below polls until they are no longer registered
+ size_t last_running = 0;
+ size_t stale_time_ut = 0; // time slept since the running count last changed
+ usec_t sleep_ut = 500 * USEC_PER_MS; // pause between two scans
+ do {
+ if(running != last_running)
+ stale_time_ut = 0; // the count changed since the previous scan: restart the stale timer
+
+ last_running = running;
+ running = 0;
+ running_services = 0;
+ buffer_flush(thread_list);
+
+ netdata_spinlock_lock(&service_globals.lock);
+
+ Pvoid_t *PValue;
+ Word_t tid = 0;
+ bool first = true;
+ while((PValue = JudyLFirstThenNext(service_globals.pid_judy, &tid, &first))) {
+ SERVICE_THREAD *sth = *PValue;
+ if(sth->services & service && sth->tid != gettid()) { // still present in pid_judy, so counted as running
+ if(running)
+ buffer_strcat(thread_list, ", ");
+
+ buffer_sprintf(thread_list, "'%s' (%d)", sth->name, sth->tid);
+
+ running_services |= sth->services & service;
+ running++;
+ }
+ }
+
+ netdata_spinlock_unlock(&service_globals.lock);
+
+ if(running) {
+ buffer_flush(service_list);
+ service_to_buffer(service_list, running_services);
+ info("SERVICE CONTROL: waiting for the following %zu services [ %s] to exit: %s",
+ running, buffer_tostring(service_list),
+ running <= 10 ? buffer_tostring(thread_list) : ""); // thread names are logged only when at most 10 are left
+ sleep_usec(sleep_ut);
+ stale_time_ut += sleep_ut;
+ }
+
+ ended_ut = now_monotonic_usec();
+ } while(running && (ended_ut - started_ut < timeout_ut || stale_time_ut < timeout_ut)); // stops when nothing is running, or when both the elapsed time and the stale time have reached timeout_ut
+
+ if(running) { // gave up: report what is still registered
+ buffer_flush(service_list);
+ service_to_buffer(service_list, running_services);
+ info("SERVICE CONTROL: "
+ "the following %zu service(s) [ %s] take too long to exit: %s; "
+ "giving up on them...",
+ running, buffer_tostring(service_list),
+ buffer_tostring(thread_list));
+ }
+
+ buffer_free(thread_list);
+ buffer_free(service_list);
+
+ return (running == 0);
+}
+
+void netdata_cleanup_and_exit(int ret) {
error_log_limit_unlimited();
info("EXIT: netdata prepares to exit with code %d...", ret);
@@ -39,38 +297,82 @@ void netdata_cleanup_and_exit(int ret) {
snprintfz(agent_incomplete_shutdown_file, FILENAME_MAX, "%s/.agent_incomplete_shutdown", netdata_configured_varlib_dir);
(void) rename(agent_crash_file, agent_incomplete_shutdown_file);
- // cleanup/save the database and exit
+ service_signal_exit(
+ SERVICE_MAINTENANCE
+ | ABILITY_DATA_QUERIES
+ | ABILITY_WEB_REQUESTS
+ | ABILITY_STREAMING_CONNECTIONS
+ | SERVICE_ACLK
+ );
+
+ service_wait_exit(
+ SERVICE_REPLICATION
+ | SERVICE_EXPORTERS
+ | SERVICE_ML_TRAINING
+ | SERVICE_HEALTH
+ | SERVICE_WEB_SERVER
+ , 3 * USEC_PER_SEC);
+
+ service_wait_exit(
+ SERVICE_COLLECTORS
+ | SERVICE_STREAMING
+ , 3 * USEC_PER_SEC);
+
+ service_wait_exit(
+ SERVICE_ML_PREDICTION
+ | SERVICE_CONTEXT
+ , 3 * USEC_PER_SEC);
+
+ service_wait_exit(
+ SERVICE_MAINTENANCE
+ , 3 * USEC_PER_SEC);
+
info("EXIT: cleaning up the database...");
rrdhost_cleanup_all();
- if(!ret) {
- // exit cleanly
+ info("EXIT: metasync shutdown prepare...");
+ metadata_sync_shutdown_prepare();
- // stop everything
- info("EXIT: stopping static threads...");
#ifdef ENABLE_ACLK
- aclk_sync_exit_all();
+ aclk_sync_exit_all();
#endif
- cancel_main_threads();
- // free the database
- info("EXIT: freeing database memory...");
+ service_wait_exit(
+ SERVICE_ACLK
+ , 3 * USEC_PER_SEC);
+
+ // stop everything else
+ service_wait_exit(~0, 10 * USEC_PER_SEC);
+
+ info("EXIT: stopping static threads...");
+ cancel_main_threads();
+
+ if(!ret) {
+ // exit cleanly
+
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
+ info("EXIT: flushing dbengine...");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_prepare_exit(multidb_ctx[tier]);
}
#endif
- metadata_sync_shutdown_prepare();
+
+ // free the database
+ info("EXIT: freeing database memory...");
rrdhost_free_all();
+
metadata_sync_shutdown();
+
#ifdef ENABLE_DBENGINE
if(dbengine_enabled) {
+ info("EXIT: stopping dbengine...");
for (size_t tier = 0; tier < storage_tiers; tier++)
rrdeng_exit(multidb_ctx[tier]);
}
#endif
}
+
sql_close_context_database();
sql_close_database();
@@ -679,8 +981,9 @@ static void get_netdata_configured_variables() {
// ------------------------------------------------------------------------
// get default Database Engine page cache size in MiB
- db_engine_use_malloc = config_get_boolean(CONFIG_SECTION_DB, "dbengine page cache with malloc", CONFIG_BOOLEAN_YES);
default_rrdeng_page_cache_mb = (int) config_get_number(CONFIG_SECTION_DB, "dbengine page cache size MB", default_rrdeng_page_cache_mb);
+ db_engine_journal_check = config_get_boolean(CONFIG_SECTION_DB, "dbengine enable journal integrity check", CONFIG_BOOLEAN_NO);
+
if(default_rrdeng_page_cache_mb < RRDENG_MIN_PAGE_CACHE_SIZE_MB) {
error("Invalid page cache size %d given. Defaulting to %d.", default_rrdeng_page_cache_mb, RRDENG_MIN_PAGE_CACHE_SIZE_MB);
default_rrdeng_page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB;
@@ -731,14 +1034,14 @@ static void get_netdata_configured_variables() {
// --------------------------------------------------------------------
- rrdset_free_obsolete_time = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time);
+ rrdset_free_obsolete_time_s = config_get_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s);
// Current chart locking and invalidation scheme doesn't prevent Netdata from segmentation faults if a short
// cleanup delay is set. Extensive stress tests showed that 10 seconds is quite a safe delay. Look at
// https://github.com/netdata/netdata/pull/11222#issuecomment-868367920 for more information.
- if (rrdset_free_obsolete_time < 10) {
- rrdset_free_obsolete_time = 10;
+ if (rrdset_free_obsolete_time_s < 10) {
+ rrdset_free_obsolete_time_s = 10;
info("The \"cleanup obsolete charts after seconds\" option was set to 10 seconds.");
- config_set_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time);
+ config_set_number(CONFIG_SECTION_DB, "cleanup obsolete charts after secs", rrdset_free_obsolete_time_s);
}
gap_when_lost_iterations_above = (int)config_get_number(CONFIG_SECTION_DB, "gap when lost iterations above", gap_when_lost_iterations_above);
@@ -746,6 +1049,7 @@ static void get_netdata_configured_variables() {
gap_when_lost_iterations_above = 1;
config_set_number(CONFIG_SECTION_DB, "gap when lost iterations above", gap_when_lost_iterations_above);
}
+ gap_when_lost_iterations_above += 2;
// --------------------------------------------------------------------
// get various system parameters
@@ -874,6 +1178,10 @@ void post_conf_load(char **user)
appconfig_get(&cloud_config, CONFIG_SECTION_GLOBAL, "cloud base url", DEFAULT_CLOUD_BASE_URL);
}
+int pgc_unittest(void);
+int mrg_unittest(void);
+int julytest(void);
+
int main(int argc, char **argv) {
int i;
int config_loaded = 0;
@@ -1001,7 +1309,7 @@ int main(int argc, char **argv) {
default_health_enabled = 0;
storage_tiers = 1;
registry_init();
- if(rrd_init("unittest", NULL)) {
+ if(rrd_init("unittest", NULL, true)) {
fprintf(stderr, "rrd_init failed for unittest\n");
return 1;
}
@@ -1056,6 +1364,18 @@ int main(int argc, char **argv) {
unittest_running = true;
return metadata_unittest();
}
+ else if(strcmp(optarg, "pgctest") == 0) {
+ unittest_running = true;
+ return pgc_unittest();
+ }
+ else if(strcmp(optarg, "mrgtest") == 0) {
+ unittest_running = true;
+ return mrg_unittest();
+ }
+ else if(strcmp(optarg, "julytest") == 0) {
+ unittest_running = true;
+ return julytest();
+ }
else if(strncmp(optarg, createdataset_string, strlen(createdataset_string)) == 0) {
optarg += strlen(createdataset_string);
unsigned history_seconds = strtoul(optarg, NULL, 0);
@@ -1345,9 +1665,29 @@ int main(int argc, char **argv) {
// initialize the system clocks
clocks_init();
- // prepare configuration environment variables for the plugins
+ // set libuv worker threads
+ libuv_worker_threads = get_system_cpus() * 2;
+
+ if(libuv_worker_threads < MIN_LIBUV_WORKER_THREADS)
+ libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
+
+ if(libuv_worker_threads > MAX_LIBUV_WORKER_THREADS)
+ libuv_worker_threads = MAX_LIBUV_WORKER_THREADS;
+
- setenv("UV_THREADPOOL_SIZE", config_get(CONFIG_SECTION_GLOBAL, "libuv worker threads", "16"), 1);
+ libuv_worker_threads = config_get_number(CONFIG_SECTION_GLOBAL, "libuv worker threads", libuv_worker_threads);
+ if(libuv_worker_threads < MIN_LIBUV_WORKER_THREADS) {
+ libuv_worker_threads = MIN_LIBUV_WORKER_THREADS;
+ config_set_number(CONFIG_SECTION_GLOBAL, "libuv worker threads", libuv_worker_threads);
+ }
+
+ {
+ char buf[20 + 1];
+ snprintfz(buf, 20, "%d", libuv_worker_threads);
+ setenv("UV_THREADPOOL_SIZE", buf, 1);
+ }
+
+ // prepare configuration environment variables for the plugins
get_netdata_configured_variables();
set_global_environment();
@@ -1426,7 +1766,6 @@ int main(int argc, char **argv) {
// setup threads configs
default_stacksize = netdata_threads_init();
-
// --------------------------------------------------------------------
// check which threads are enabled and initialize them
@@ -1504,7 +1843,7 @@ int main(int argc, char **argv) {
system_info->hops = 0;
get_install_type(&system_info->install_type, &system_info->prebuilt_arch, &system_info->prebuilt_dist);
- if(rrd_init(netdata_configured_hostname, system_info))
+ if(rrd_init(netdata_configured_hostname, system_info, false))
fatal("Cannot initialize localhost instance with name '%s'.", netdata_configured_hostname);
char agent_crash_file[FILENAME_MAX + 1];