From fc9b5170c6301366bd66d9b0766850a0ed8dc807 Mon Sep 17 00:00:00 2001 From: Costa Tsaousis Date: Sun, 3 Dec 2023 17:00:51 +0200 Subject: log2journal improvements 5 (#16519) * added ${LINE} variable; added default config * prefer single quotes in yaml to avoid interference from yaml escaping * simple_hashtable now supports deletions * simple hashtable now supports setting entries with NULL values * hashtable implementation now has sorting option to maintain a sorted list of the items * multiple hashtables with type checking * added comments * still incomplete yaml parser * fixes and cleanup --- collectors/log2journal/Makefile.am | 1 + collectors/log2journal/README.md | 7 +- collectors/log2journal/log2journal-help.c | 2 +- collectors/log2journal/log2journal-json.c | 75 ++++- collectors/log2journal/log2journal-params.c | 27 +- collectors/log2journal/log2journal-yaml.c | 2 +- collectors/log2journal/log2journal.c | 107 +++--- collectors/log2journal/log2journal.d/default.yaml | 15 + .../log2journal/log2journal.d/nginx-combined.yaml | 38 +-- .../log2journal/log2journal.d/nginx-json.yaml | 78 ++--- collectors/log2journal/log2journal.h | 66 ++-- collectors/log2journal/tests.d/default.output | 20 ++ collectors/log2journal/tests.d/full.output | 24 +- collectors/log2journal/tests.d/full.yaml | 12 +- collectors/log2journal/tests.sh | 22 +- libnetdata/facets/facets.c | 69 ++-- libnetdata/simple_hashtable.h | 372 +++++++++++++++++++-- 17 files changed, 684 insertions(+), 253 deletions(-) create mode 100644 collectors/log2journal/log2journal.d/default.yaml create mode 100644 collectors/log2journal/tests.d/default.output diff --git a/collectors/log2journal/Makefile.am b/collectors/log2journal/Makefile.am index 578757fc38..b13d2160b3 100644 --- a/collectors/log2journal/Makefile.am +++ b/collectors/log2journal/Makefile.am @@ -13,4 +13,5 @@ log2journalconfigdir=$(libconfigdir)/log2journal.d dist_log2journalconfig_DATA = \ log2journal.d/nginx-combined.yaml \ log2journal.d/nginx-json.yaml \ + log2journal.d/default.yaml \ $(NULL) diff --git a/collectors/log2journal/README.md b/collectors/log2journal/README.md index 2747142d68..bb48378803 100644 --- a/collectors/log2journal/README.md +++ b/collectors/log2journal/README.md @@ -2,7 +2,7 @@ `log2journal` and `systemd-cat-native` can be used to convert a structured log file, such as the ones generated by web servers, into `systemd-journal` entries. -By combining these tools, together with the usual UNIX shell tools you can create advanced log processing pipelines sending any kind of structured text logs to systemd-journald. This is a simple, but powerful and efficient way to handle log processing. +By combining these tools you can create advanced log processing pipelines sending any kind of structured text logs to systemd-journald. This is a simple, but powerful and efficient way to handle log processing. The process involves the usual piping of shell commands, to get and process the log files in realtime. @@ -27,6 +27,11 @@ Let's see the steps: ``` 3. `systemd-cat-native` is a Netdata program. I can send the logs to a local `systemd-journald` (journal namespaces supported), or to a remote `systemd-journal-remote`. + +## YAML configuration + + + ## Real-life example We have an nginx server logging in this format: diff --git a/collectors/log2journal/log2journal-help.c b/collectors/log2journal/log2journal-help.c index 67af516df3..a20615c3c2 100644 --- a/collectors/log2journal/log2journal-help.c +++ b/collectors/log2journal/log2journal-help.c @@ -60,7 +60,7 @@ void log_job_command_line_help(const char *name) { printf(" --file /path/to/file.yaml or -f /path/to/file.yaml\n"); printf(" Read yaml configuration file for instructions.\n"); printf("\n"); - printf(" --config CONFIG_NAME\n"); + printf(" --config CONFIG_NAME or -c CONFIG_NAME\n"); printf(" Run with the internal configuration named CONFIG_NAME.\n"); printf(" Available internal configs:\n"); printf("\n"); diff --git a/collectors/log2journal/log2journal-json.c b/collectors/log2journal/log2journal-json.c index 41f893abc5..2ca294e4db 100644 --- a/collectors/log2journal/log2journal-json.c +++ b/collectors/log2journal/log2journal-json.c @@ -167,7 +167,7 @@ static inline bool json_parse_number(LOG_JSON_STATE *js) { } } -static bool encode_utf8(unsigned codepoint, char **d, size_t *remaining) { +static inline bool encode_utf8(unsigned codepoint, char **d, size_t *remaining) { if (codepoint <= 0x7F) { // 1-byte sequence if (*remaining < 2) return false; // +1 for the null @@ -205,6 +205,56 @@ static bool encode_utf8(unsigned codepoint, char **d, size_t *remaining) { return true; } +size_t parse_surrogate(const char *s, char *d, size_t *remaining) { + if (s[0] != '\\' || (s[1] != 'u' && s[1] != 'U')) { + return 0; // Not a valid Unicode escape sequence + } + + char hex[9] = {0}; // Buffer for the hexadecimal value + unsigned codepoint; + + if (s[1] == 'u') { + // Handle \uXXXX + if (!isxdigit(s[2]) || !isxdigit(s[3]) || !isxdigit(s[4]) || !isxdigit(s[5])) { + return 0; // Not a valid \uXXXX sequence + } + + hex[0] = s[2]; + hex[1] = s[3]; + hex[2] = s[4]; + hex[3] = s[5]; + codepoint = (unsigned)strtoul(hex, NULL, 16); + + if (codepoint >= 0xD800 && codepoint <= 0xDBFF) { + // Possible start of surrogate pair + if (s[6] == '\\' && s[7] == 'u' && isxdigit(s[8]) && isxdigit(s[9]) && + isxdigit(s[10]) && isxdigit(s[11])) { + // Valid low surrogate + unsigned low_surrogate = strtoul(&s[8], NULL, 16); + if (low_surrogate < 0xDC00 || low_surrogate > 0xDFFF) { + return 0; // Invalid low surrogate + } + codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low_surrogate - 0xDC00); + return encode_utf8(codepoint, &d, remaining) ? 12 : 0; // \uXXXX\uXXXX + } + } + + // Single \uXXXX + return encode_utf8(codepoint, &d, remaining) ? 6 : 0; + } + else { + // Handle \UXXXXXXXX + for (int i = 2; i < 10; i++) { + if (!isxdigit(s[i])) { + return 0; // Not a valid \UXXXXXXXX sequence + } + hex[i - 2] = s[i]; + } + codepoint = (unsigned)strtoul(hex, NULL, 16); + return encode_utf8(codepoint, &d, remaining) ? 10 : 0; // \UXXXXXXXX + } +} + static inline void copy_newline(LOG_JSON_STATE *js __maybe_unused, char **d, size_t *remaining) { if(*remaining > 3) { *(*d)++ = '\\'; @@ -258,18 +308,12 @@ static inline bool json_parse_string(LOG_JSON_STATE *js) { s++; break; - case 'u': - if(isxdigit(s[1]) && isxdigit(s[2]) && isxdigit(s[3]) && isxdigit(s[4])) { - char b[5] = { - [0] = s[1], - [1] = s[2], - [2] = s[3], - [3] = s[4], - [4] = '\0', - }; - unsigned codepoint = strtoul(b, NULL, 16); - if(encode_utf8(codepoint, &d, &remaining)) { - s += 5; + case 'u': { + size_t old_remaining = remaining; + size_t consumed = parse_surrogate(s - 1, d, &remaining); + if (consumed > 0) { + s += consumed - 1; // -1 because we already incremented s after '\\' + d += old_remaining - remaining; continue; } else { @@ -278,11 +322,6 @@ static inline bool json_parse_string(LOG_JSON_STATE *js) { c = *s++; } } - else { - *d++ = '\\'; - remaining--; - c = *s++; - } break; default: diff --git a/collectors/log2journal/log2journal-params.c b/collectors/log2journal/log2journal-params.c index ca4e2f5860..a7bb3e263c 100644 --- a/collectors/log2journal/log2journal-params.c +++ b/collectors/log2journal/log2journal-params.c @@ -6,22 +6,25 @@ void log_job_init(LOG_JOB *jb) { memset(jb, 0, sizeof(*jb)); - simple_hashtable_init(&jb->hashtable, 32); + simple_hashtable_init_KEY(&jb->hashtable, 32); + hashed_key_set(&jb->line.key, "LINE"); } -static void simple_hashtable_cleanup_allocated(SIMPLE_HASHTABLE *ht) { - for(size_t i = 0; i < ht->size ;i++) { - HASHED_KEY *k = ht->hashtable[i].data; +static void simple_hashtable_cleanup_allocated_keys(SIMPLE_HASHTABLE_KEY *ht) { + SIMPLE_HASHTABLE_FOREACH_READ_ONLY(ht, sl, _KEY) { + HASHED_KEY *k = SIMPLE_HASHTABLE_FOREACH_READ_ONLY_VALUE(sl); if(k && k->flags & HK_HASHTABLE_ALLOCATED) { - hashed_key_cleanup(k); - freez(k); - ht->hashtable[i].data = NULL; - ht->hashtable[i].hash = 0; + // the order of these statements is important! + simple_hashtable_del_slot_KEY(ht, sl); // remove any references to n + hashed_key_cleanup(k); // cleanup the internals of n + freez(k); // free n } } } void log_job_cleanup(LOG_JOB *jb) { + hashed_key_cleanup(&jb->line.key); + if(jb->prefix) { freez((void *) jb->prefix); jb->prefix = NULL; @@ -47,8 +50,8 @@ void log_job_cleanup(LOG_JOB *jb) { txt_cleanup(&jb->rewrites.tmp); txt_cleanup(&jb->filename.current); - simple_hashtable_cleanup_allocated(&jb->hashtable); - simple_hashtable_free(&jb->hashtable); + simple_hashtable_cleanup_allocated_keys(&jb->hashtable); + simple_hashtable_destroy_KEY(&jb->hashtable); // remove references to everything else, to reveal them in valgrind memset(jb, 0, sizeof(*jb)); @@ -346,7 +349,7 @@ bool log_job_command_line_parse_parameters(LOG_JOB *jb, int argc, char **argv) { if (!yaml_parse_file(value, jb)) return false; } - else if (strcmp(param, "--config") == 0) { + else if (strcmp(param, "-c") == 0 || strcmp(param, "--config") == 0) { if (!yaml_parse_config(value, jb)) return false; } @@ -392,7 +395,7 @@ bool log_job_command_line_parse_parameters(LOG_JOB *jb, int argc, char **argv) { // Check if a pattern is set and exactly one pattern is specified if (!jb->pattern) { - log2stderr("Error: Pattern not specified."); + log2stderr("Warning: pattern not specified. Try the default config with: -c default"); log_job_command_line_help(argv[0]); return false; } diff --git a/collectors/log2journal/log2journal-yaml.c b/collectors/log2journal/log2journal-yaml.c index 1b9e823cb7..862e7bf4b7 100644 --- a/collectors/log2journal/log2journal-yaml.c +++ b/collectors/log2journal/log2journal-yaml.c @@ -852,7 +852,7 @@ static bool needs_quotes_in_yaml(const char *str) { static void yaml_print_node(const char *key, const char *value, size_t depth, bool dash) { if(depth > 10) depth = 10; - const char *quote = "\""; + const char *quote = "'"; const char *second_line = NULL; if(value && strchr(value, '\n')) { diff --git a/collectors/log2journal/log2journal.c b/collectors/log2journal/log2journal.c index 5dd98d6837..c3204939cd 100644 --- a/collectors/log2journal/log2journal.c +++ b/collectors/log2journal/log2journal.c @@ -61,37 +61,14 @@ const char journal_key_characters_map[256] = { // ---------------------------------------------------------------------------- -// Function to insert a key into the sorted.keys array while keeping it sorted -void log_job_add_key_sorted(LOG_JOB *jb, HASHED_KEY *newKey) { - size_t i, j; - - // Find the position to insert the new key based on lexicographic order - for (i = 0; i < jb->sorted.used; i++) { - if (strcmp(newKey->key, jb->sorted.keys[i]->key) < 0) { - break; - } - } - - // Shift elements to the right to make space for the new key - for (j = jb->sorted.used; j > i; j--) { - jb->sorted.keys[j] = jb->sorted.keys[j - 1]; - } - - // Insert the new key at the correct position - jb->sorted.keys[i] = newKey; - jb->sorted.used++; -} - static inline HASHED_KEY *get_key_from_hashtable(LOG_JOB *jb, HASHED_KEY *k) { if(k->flags & HK_HASHTABLE_ALLOCATED) return k; if(!k->hashtable_ptr) { HASHED_KEY *ht_key; - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(&jb->hashtable, k->hash, true); - if(slot->data) { - ht_key = slot->data; - + SIMPLE_HASHTABLE_SLOT_KEY *slot = simple_hashtable_get_slot_KEY(&jb->hashtable, k->hash, true); + if((ht_key = SIMPLE_HASHTABLE_SLOT_DATA(slot))) { if(!(ht_key->flags & HK_COLLISION_CHECKED)) { ht_key->flags |= HK_COLLISION_CHECKED; @@ -109,11 +86,7 @@ static inline HASHED_KEY *get_key_from_hashtable(LOG_JOB *jb, HASHED_KEY *k) { ht_key->hash = k->hash; ht_key->flags = HK_HASHTABLE_ALLOCATED; - slot->hash = ht_key->hash; - slot->data = ht_key; - jb->hashtable.used++; - - log_job_add_key_sorted(jb, ht_key); + simple_hashtable_set_slot_KEY(&jb->hashtable, slot, ht_key->hash, ht_key); } k->hashtable_ptr = ht_key; @@ -158,18 +131,25 @@ static inline void validate_key(LOG_JOB *jb __maybe_unused, HASHED_KEY *k) { // ---------------------------------------------------------------------------- -static inline size_t replace_evaluate_to_buffer(LOG_JOB *jb, HASHED_KEY *k, REPLACE_PATTERN *rp, char *dst, size_t dst_size) { +static inline size_t replace_evaluate_to_buffer(LOG_JOB *jb, HASHED_KEY *k __maybe_unused, REPLACE_PATTERN *rp, char *dst, size_t dst_size) { size_t remaining = dst_size; char *copy_to = dst; for(REPLACE_NODE *node = rp->nodes; node != NULL && remaining > 1; node = node->next) { if(node->is_variable) { - HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); - if(ktmp->value.len) { - size_t copied = copy_to_buffer(copy_to, remaining, ktmp->value.txt, ktmp->value.len); + if(hashed_keys_match(&node->name, &jb->line.key)) { + size_t copied = copy_to_buffer(copy_to, remaining, jb->line.trimmed, jb->line.trimmed_len); copy_to += copied; remaining -= copied; } + else { + HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); + if(ktmp->value.len) { + size_t copied = copy_to_buffer(copy_to, remaining, ktmp->value.txt, ktmp->value.len); + copy_to += copied; + remaining -= copied; + } + } } else { size_t copied = copy_to_buffer(copy_to, remaining, node->name.key, node->name.len); @@ -189,9 +169,14 @@ static inline void replace_evaluate(LOG_JOB *jb, HASHED_KEY *k, REPLACE_PATTERN for(REPLACE_NODE *node = rp->nodes; node != NULL; node = node->next) { if(node->is_variable) { - HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); - if(ktmp->value.len) - txt_expand_and_append(&ht_key->value, ktmp->value.txt, ktmp->value.len); + if(hashed_keys_match(&node->name, &jb->line.key)) + txt_expand_and_append(&ht_key->value, jb->line.trimmed, jb->line.trimmed_len); + + else { + HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); + if(ktmp->value.len) + txt_expand_and_append(&ht_key->value, ktmp->value.txt, ktmp->value.len); + } } else txt_expand_and_append(&ht_key->value, node->name.key, node->name.len); @@ -220,9 +205,14 @@ static inline void replace_evaluate_from_pcre2(LOG_JOB *jb, HASHED_KEY *k, REPLA txt_expand_and_append(&jb->rewrites.tmp, k->value.txt + start_offset, length); } else { - HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); - if(ktmp->value.len) - txt_expand_and_append(&jb->rewrites.tmp, ktmp->value.txt, ktmp->value.len); + if(hashed_keys_match(&node->name, &jb->line.key)) + txt_expand_and_append(&jb->rewrites.tmp, jb->line.trimmed, jb->line.trimmed_len); + + else { + HASHED_KEY *ktmp = get_key_from_hashtable_with_char_ptr(jb, node->name.key); + if(ktmp->value.len) + txt_expand_and_append(&jb->rewrites.tmp, ktmp->value.txt, ktmp->value.len); + } } } else { @@ -299,15 +289,6 @@ static inline void send_key_value_error(LOG_JOB *jb, HASHED_KEY *key, const char printf("\n"); } -static inline void send_key_value_and_rewrite(LOG_JOB *jb, HASHED_KEY *key, const char *value, size_t len) { - HASHED_KEY *ht_key = get_key_from_hashtable(jb, key); - - txt_replace(&ht_key->value, value, len); - ht_key->flags |= HK_VALUE_FROM_LOG; - -// fprintf(stderr, "SET %s=%.*s\n", ht_key->key, (int)ht_key->value.len, ht_key->value.txt); -} - inline void log_job_send_extracted_key_value(LOG_JOB *jb, const char *key, const char *value, size_t len) { HASHED_KEY *ht_key = get_key_from_hashtable_with_char_ptr(jb, key); HASHED_KEY *nk = rename_key(jb, ht_key); @@ -341,8 +322,8 @@ static inline void log_job_process_rewrites(LOG_JOB *jb) { } static inline void send_all_fields(LOG_JOB *jb) { - for(size_t i = 0; i < jb->sorted.used ;i++) { - HASHED_KEY *k = jb->sorted.keys[i]; + SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY(&jb->hashtable, kptr, HASHED_KEY, _KEY) { + HASHED_KEY *k = SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY_VALUE(kptr); if(k->value.len) { // the key exists and has some value @@ -496,11 +477,13 @@ int log_job_run(LOG_JOB *jb) { if(strcmp(jb->pattern, "json") == 0) { json = json_parser_create(jb); + // never fails } else if(strcmp(jb->pattern, "logfmt") == 0) { logfmt = logfmt_parser_create(jb); + // never fails } - else { + else if(strcmp(jb->pattern, "none") != 0) { pcre2 = pcre2_parser_create(jb); if(pcre2_has_error(pcre2)) { log2stderr("%s", pcre2_parser_error(pcre2)); @@ -509,21 +492,25 @@ int log_job_run(LOG_JOB *jb) { } } - char buffer[MAX_LINE_LENGTH]; - char *line; - size_t len; + jb->line.buffer = mallocz(MAX_LINE_LENGTH + 1); + jb->line.size = MAX_LINE_LENGTH + 1; + jb->line.trimmed_len = 0; + jb->line.trimmed = jb->line.buffer; + + while ((jb->line.trimmed = get_next_line(jb, (char *)jb->line.buffer, jb->line.size, &jb->line.trimmed_len))) { + const char *line = jb->line.trimmed; + size_t len = jb->line.trimmed_len; - while ((line = get_next_line(jb, buffer, sizeof(buffer), &len))) { if(jb_switched_filename(jb, line, len)) continue; - bool line_is_matched; + bool line_is_matched = true; if(json) line_is_matched = json_parse_document(json, line); else if(logfmt) line_is_matched = logfmt_parse_document(logfmt, line); - else + else if(pcre2) line_is_matched = pcre2_parse_document(pcre2, line, len); if(!line_is_matched) { @@ -531,7 +518,7 @@ int log_job_run(LOG_JOB *jb) { log2stderr("%s", json_parser_error(json)); else if(logfmt) log2stderr("%s", logfmt_parser_error(logfmt)); - else + else if(pcre2) log2stderr("%s", pcre2_parser_error(pcre2)); if(!jb_send_unmatched_line(jb, line)) @@ -557,6 +544,8 @@ int log_job_run(LOG_JOB *jb) { else if(pcre2) pcre2_parser_destroy(pcre2); + freez((void *)jb->line.buffer); + return 0; } diff --git a/collectors/log2journal/log2journal.d/default.yaml b/collectors/log2journal/log2journal.d/default.yaml new file mode 100644 index 0000000000..d41efc4abb --- /dev/null +++ b/collectors/log2journal/log2journal.d/default.yaml @@ -0,0 +1,15 @@ +pattern: none + +filename: + key: LOG_FILENAME + +inject: + - key: MESSAGE + value: '${LINE}' # a special variable that resolves to the whole line read from the log + + - key: PRIORITY + value: 6 # Valid PRIORITIES: 0=emerg, 1=alert, 2=crit, 3=error, 4=warn, 5=notice, 6=info, 7=debug + + - key: SYSLOG_IDENTIFIER + value: log2journal # the name of the application sending the logs + diff --git a/collectors/log2journal/log2journal.d/nginx-combined.yaml b/collectors/log2journal/log2journal.d/nginx-combined.yaml index 00610f4b7c..003c774d7b 100644 --- a/collectors/log2journal/log2journal.d/nginx-combined.yaml +++ b/collectors/log2journal/log2journal.d/nginx-combined.yaml @@ -37,15 +37,15 @@ rename: # Inject constant fields into the journal logs. inject: - key: SYSLOG_IDENTIFIER - value: "nginx-log" + value: nginx-log # inject PRIORITY is a duplicate of NGINX_STATUS - - key: "PRIORITY" - value: "${NGINX_STATUS}" + - key: PRIORITY + value: '${NGINX_STATUS}' # Inject NGINX_STATUS_FAMILY is a duplicate of NGINX_STATUS - - key: "NGINX_STATUS_FAMILY" - value: "${NGINX_STATUS}" + - key: NGINX_STATUS_FAMILY + value: '${NGINX_STATUS}' # Rewrite the value of fields (including the duplicated ones). # The search pattern can have named groups, and the replace pattern can use @@ -53,30 +53,30 @@ inject: rewrite: # PRIORITY is a duplicate of NGINX_STATUS # Valid PRIORITIES: 0=emerg, 1=alert, 2=crit, 3=error, 4=warn, 5=notice, 6=info, 7=debug - - key: "PRIORITY" - match: "^[123]" + - key: PRIORITY + match: '^[123]' value: 6 - - key: "PRIORITY" - match: "^4" + - key: PRIORITY + match: '^4' value: 5 - - key: "PRIORITY" - match: "^5" + - key: PRIORITY + match: '^5' value: 3 - - key: "PRIORITY" - match: ".*" + - key: PRIORITY + match: '.*' value: 4 # NGINX_STATUS_FAMILY is a duplicate of NGINX_STATUS - - key: "NGINX_STATUS_FAMILY" - match: "^(?[1-5])" - value: "${first_digit}xx" + - key: NGINX_STATUS_FAMILY + match: '^(?[1-5])' + value: '${first_digit}xx' - - key: "NGINX_STATUS_FAMILY" - match: ".*" - value: "UNKNOWN" + - key: NGINX_STATUS_FAMILY + match: '.*' + value: 'UNKNOWN' # Control what to do when input logs do not match the main PCRE2 pattern. unmatched: diff --git a/collectors/log2journal/log2journal.d/nginx-json.yaml b/collectors/log2journal/log2journal.d/nginx-json.yaml index 1ad702da7c..7fdc4be584 100644 --- a/collectors/log2journal/log2journal.d/nginx-json.yaml +++ b/collectors/log2journal/log2journal.d/nginx-json.yaml @@ -12,7 +12,7 @@ filename: key: NGINX_LOG_FILENAME filter: - exclude: "NGINX_BINARY_REMOTE_ADDR" + exclude: '^(NGINX_BINARY_REMOTE_ADDR)$' rename: - new_key: MESSAGE @@ -69,15 +69,15 @@ rename: # Inject constant fields into the journal logs. inject: - key: SYSLOG_IDENTIFIER - value: "nginx-log" + value: nginx-log # inject PRIORITY is a duplicate of NGINX_STATUS - - key: "PRIORITY" - value: "${NGINX_STATUS}" + - key: PRIORITY + value: '${NGINX_STATUS}' # Inject NGINX_STATUS_FAMILY is a duplicate of NGINX_STATUS - - key: "NGINX_STATUS_FAMILY" - value: "${NGINX_STATUS}" + - key: NGINX_STATUS_FAMILY + value: '${NGINX_STATUS}' # Rewrite the value of fields (including the duplicated ones). @@ -87,69 +87,69 @@ rewrite: # a ? means it has query string, everything else means it does not - key: NGINX_HAS_QUERY_STRING match: '^\?$' - value: "yes" + value: yes - key: NGINX_HAS_QUERY_STRING - match: ".*" - value: "no" + match: '.*' + value: no # 'on' means it was HTTPS, everything else means it was not - key: NGINX_HTTPS - match: "^on$" - value: "yes" + match: '^on$' + value: yes - key: NGINX_HTTPS - match: ".*" - value: "no" + match: '.*' + value: no # 'p' means it was pipelined, everything else means it was not - key: NGINX_PIPELINED - match: "^p$" - value: "yes" + match: '^p$' + value: yes - key: NGINX_PIPELINED - match: ".*" - value: "no" + match: '.*' + value: no # zero means client sent a certificate and it was verified, non-zero means otherwise - key: NGINX_PROXY_PROTOCOL_TLV_SSL_VERIFY - match: "^0$" - value: "yes" + match: '^0$' + value: yes - key: NGINX_PROXY_PROTOCOL_TLV_SSL_VERIFY - match: ".*" - value: "no" + match: '.*' + value: no # 'OK' means request completed, everything else means it didn't - key: NGINX_REQUEST_COMPLETION - match: "^OK$" - value: "completed" + match: '^OK$' + value: 'completed' - key: NGINX_REQUEST_COMPLETION - match: ".*" - value: "not completed" + match: '.*' + value: 'not completed' # PRIORTY is a duplicate of NGINX_STATUS # Valid PRIORITIES: 0=emerg, 1=alert, 2=crit, 3=error, 4=warn, 5=notice, 6=info, 7=debug - - key: "PRIORITY" - match: "^[123]" + - key: PRIORITY + match: '^[123]' value: 6 - - key: "PRIORITY" - match: "^4" + - key: PRIORITY + match: '^4' value: 5 - - key: "PRIORITY" - match: "^5" + - key: PRIORITY + match: '^5' value: 3 - - key: "PRIORITY" - match: ".*" + - key: PRIORITY + match: '.*' value: 4 # NGINX_STATUS_FAMILY is a duplicate of NGINX_STATUS - - key: "NGINX_STATUS_FAMILY" - match: "^(?[1-5])" - value: "${first_digit}xx" + - key: NGINX_STATUS_FAMILY + match: '^(?[1-5])' + value: '${first_digit}xx' - - key: "NGINX_STATUS_FAMILY" - match: ".*" - value: "UNKNOWN" + - key: NGINX_STATUS_FAMILY + match: '.*' + value: 'UNKNOWN' # Control what to do when input logs do not match the main PCRE2 pattern. unmatched: diff --git a/collectors/log2journal/log2journal.h b/collectors/log2journal/log2journal.h index f34d3db177..834a5b135d 100644 --- a/collectors/log2journal/log2journal.h +++ b/collectors/log2journal/log2journal.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -42,11 +43,23 @@ static inline void *mallocz(size_t size) { } static inline void *callocz(size_t elements, size_t size) { - void *ptr = mallocz(elements * size); - memset(ptr, 0, elements * size); + void *ptr = calloc(elements, size); + if (!ptr) { + log2stderr("Fatal Error: Memory allocation failed. Requested size: %zu bytes.", elements * size); + exit(EXIT_FAILURE); + } return ptr; } +static inline void *reallocz(void *ptr, size_t size) { + void *new_ptr = realloc(ptr, size); + if (!new_ptr) { + log2stderr("Fatal Error: Memory reallocation failed. Requested size: %zu bytes.", size); + exit(EXIT_FAILURE); + } + return new_ptr; +} + static inline char *strdupz(const char *s) { char *ptr = strdup(s); if (!ptr) { @@ -74,7 +87,6 @@ static inline void freez(void *ptr) { #define XXH_INLINE_ALL #include "../../libnetdata/xxhash.h" -#include "../../libnetdata/simple_hashtable.h" #define PCRE2_CODE_UNIT_WIDTH 8 #include @@ -83,13 +95,29 @@ static inline void freez(void *ptr) { #include #endif +// ---------------------------------------------------------------------------- +// hashtable for HASHED_KEY + +// cleanup hashtable defines +#undef SIMPLE_HASHTABLE_SORT_FUNCTION +#undef SIMPLE_HASHTABLE_VALUE_TYPE +#undef SIMPLE_HASHTABLE_NAME +#undef NETDATA_SIMPLE_HASHTABLE_H + +struct hashed_key; +static inline int compare_keys(struct hashed_key *k1, struct hashed_key *k2); +#define SIMPLE_HASHTABLE_SORT_FUNCTION compare_keys +#define SIMPLE_HASHTABLE_VALUE_TYPE struct hashed_key +#define SIMPLE_HASHTABLE_NAME _KEY +#include "../../libnetdata/simple_hashtable.h" + +// ---------------------------------------------------------------------------- + #define MAX_OUTPUT_KEYS 1024 #define MAX_LINE_LENGTH (1024 * 1024) -#define MAX_KEY_DUPS (MAX_OUTPUT_KEYS / 2) #define MAX_INJECTIONS (MAX_OUTPUT_KEYS / 2) #define MAX_REWRITES (MAX_OUTPUT_KEYS / 2) #define MAX_RENAMES (MAX_OUTPUT_KEYS / 2) -#define MAX_KEY_DUPS_KEYS 20 #define JOURNAL_MAX_KEY_LEN 64 // according to systemd-journald #define JOURNAL_MAX_VALUE_LEN (48 * 1024) // according to systemd-journald @@ -178,13 +206,7 @@ static inline void txt_expand_and_append(TEXT *t, const char *s, size_t len) { if(new_size < t->size * 2) new_size = t->size * 2; - char *b = mallocz(new_size); - if(t->txt) { - memcpy(b, t->txt, t->len); - freez(t->txt); - } - - t->txt = b; + t->txt = reallocz(t->txt, new_size); t->size = new_size; } @@ -213,9 +235,6 @@ typedef enum __attribute__((__packed__)) { HK_RENAMES_CHECKED = (1 << 4), // we checked once if there are renames on this key HK_HAS_RENAMES = (1 << 5), // and we found there is a rename rule related to it - HK_DUPS_CHECKED = (1 << 6), // we checked once if there are duplications for this key - HK_HAS_DUPS = (1 << 7), // and we found there are duplication related to it - // ephemeral flags - they are unset at the end of each log line HK_VALUE_FROM_LOG = (1 << 14), // the value of this key has been read from the log (or from injection, duplication) @@ -268,6 +287,10 @@ static inline bool hashed_keys_match(HASHED_KEY *k1, HASHED_KEY *k2) { return ((k1 == k2) || (k1->hash == k2->hash && strcmp(k1->key, k2->key) == 0)); } +static inline int compare_keys(struct hashed_key *k1, struct hashed_key *k2) { + return strcmp(k1->key, k2->key); +} + // ---------------------------------------------------------------------------- typedef struct search_pattern { @@ -355,12 +378,15 @@ typedef struct log_job { const char *pattern; const char *prefix; - SIMPLE_HASHTABLE hashtable; + SIMPLE_HASHTABLE_KEY hashtable; struct { - HASHED_KEY *keys[MAX_OUTPUT_KEYS]; - size_t used; - } sorted; + const char *buffer; + const char *trimmed; + size_t trimmed_len; + size_t size; + HASHED_KEY key; + } line; struct { SEARCH_PATTERN include; @@ -447,6 +473,8 @@ const char *json_parser_error(LOG_JSON_STATE *js); bool json_parse_document(LOG_JSON_STATE *js, const char *txt); void json_test(void); +size_t parse_surrogate(const char *s, char *d, size_t *remaining); + // ---------------------------------------------------------------------------- // logfmt parser diff --git a/collectors/log2journal/tests.d/default.output b/collectors/log2journal/tests.d/default.output new file mode 100644 index 0000000000..ef17cb2c7c --- /dev/null +++ b/collectors/log2journal/tests.d/default.output @@ -0,0 +1,20 @@ +MESSAGE=key1=value01 key2=value02 key3=value03 key4=value04 +PRIORITY=6 +SYSLOG_IDENTIFIER=log2journal + +MESSAGE=key1=value11 key2=value12 key3=value13 key4= +PRIORITY=6 +SYSLOG_IDENTIFIER=log2journal + +MESSAGE=key1=value21 key2=value22 key3=value23 key4=value24 +PRIORITY=6 +SYSLOG_IDENTIFIER=log2journal + +MESSAGE=key1=value31 key2=value32 key3=value33 key4= +PRIORITY=6 +SYSLOG_IDENTIFIER=log2journal + +MESSAGE=key1=value41 key2=value42 key3=value43 key4=value44 +PRIORITY=6 +SYSLOG_IDENTIFIER=log2journal + diff --git a/collectors/log2journal/tests.d/full.output b/collectors/log2journal/tests.d/full.output index 42132ba0e6..074092d4ed 100644 --- a/collectors/log2journal/tests.d/full.output +++ b/collectors/log2journal/tests.d/full.output @@ -24,8 +24,8 @@ filename: key: NGINX_LOG_FILENAME filter: - include: ".*" - exclude: ".*HELLO.*WORLD.*" + include: '.*' + exclude: '.*HELLO.*WORLD.*' rename: - new_key: TEST1 @@ -39,32 +39,32 @@ inject: - key: SYSLOG_IDENTIFIER2 value: nginx-log2 - key: PRIORITY - value: "${NGINX_STATUS}" + value: '${NGINX_STATUS}' - key: NGINX_STATUS_FAMILY - value: "${NGINX_STATUS}${NGINX_METHOD}" + value: '${NGINX_STATUS}${NGINX_METHOD}' rewrite: - key: PRIORITY - value: "${NGINX_STATUS}" + value: '${NGINX_STATUS}' inject: yes stop: no - key: PRIORITY - match: "^[123]" + match: '^[123]' value: 6 - key: PRIORITY - match: "^4" + match: '^4' value: 5 - key: PRIORITY - match: "^5" + match: '^5' value: 3 - key: PRIORITY - match: ".*" + match: '.*' value: 4 - key: NGINX_STATUS_FAMILY - match: "^(?[1-5])" - value: "${first_digit}xx" + match: '^(?[1-5])' + value: '${first_digit}xx' - key: NGINX_STATUS_FAMILY - match: ".*" + match: '.*' value: UNKNOWN unmatched: diff --git a/collectors/log2journal/tests.d/full.yaml b/collectors/log2journal/tests.d/full.yaml index e2e1995bdb..86cafb5a2a 100644 --- a/collectors/log2journal/tests.d/full.yaml +++ b/collectors/log2journal/tests.d/full.yaml @@ -24,8 +24,8 @@ filename: key: NGINX_LOG_FILENAME filter: - include: ".*" - exclude: ".*HELLO.*WORLD.*" + include: '.*' + exclude: '.*HELLO.*WORLD.*' rename: - new_key: TEST1 @@ -35,13 +35,13 @@ rename: inject: - key: SYSLOG_IDENTIFIER - value: "nginx-log" + value: 'nginx-log' - key: SYSLOG_IDENTIFIER2 - value: "nginx-log2" + value: 'nginx-log2' - key: PRIORITY - value: "${NGINX_STATUS}" + value: '${NGINX_STATUS}' - key: NGINX_STATUS_FAMILY - value: "${NGINX_STATUS}${NGINX_METHOD}" + value: '${NGINX_STATUS}${NGINX_METHOD}' rewrite: - key: "PRIORITY" diff --git a/collectors/log2journal/tests.sh b/collectors/log2journal/tests.sh index aa4309956b..4024388669 100755 --- a/collectors/log2journal/tests.sh +++ b/collectors/log2journal/tests.sh @@ -109,11 +109,12 @@ test_log2journal_config /dev/null "${tests}/full.output" --show-config \ # ----------------------------------------------------------------------------- test_log2journal() { - local in="${1}" - local out="${2}" - shift 2 + local n="${1}" + local in="${2}" + local out="${3}" + shift 3 - printf >&2 "running: " + printf >&2 "running test No ${n}: " printf >&2 "%q " "${log2journal_bin}" "${@}" printf >&2 "\n" echo >&2 "using as input : ${in}" @@ -138,9 +139,10 @@ test_log2journal() { echo >&2 echo >&2 "Testing parsing and output..." -test_log2journal "${tests}/json.log" "${tests}/json.output" json -test_log2journal "${tests}/json.log" "${tests}/json-include.output" json --include "OBJECT" -test_log2journal "${tests}/json.log" "${tests}/json-exclude.output" json --exclude "ARRAY[^2]" -test_log2journal "${tests}/nginx-json.log" "${tests}/nginx-json.output" -f "${script_dir}/log2journal.d/nginx-json.yaml" -test_log2journal "${tests}/nginx-combined.log" "${tests}/nginx-combined.output" -f "${script_dir}/log2journal.d/nginx-combined.yaml" -test_log2journal "${tests}/logfmt.log" "${tests}/logfmt.output" -f "${tests}/logfmt.yaml" +test_log2journal 1 "${tests}/json.log" "${tests}/json.output" json +test_log2journal 2 "${tests}/json.log" "${tests}/json-include.output" json --include "OBJECT" +test_log2journal 3 "${tests}/json.log" "${tests}/json-exclude.output" json --exclude "ARRAY[^2]" +test_log2journal 4 "${tests}/nginx-json.log" "${tests}/nginx-json.output" -f "${script_dir}/log2journal.d/nginx-json.yaml" +test_log2journal 5 "${tests}/nginx-combined.log" "${tests}/nginx-combined.output" -f "${script_dir}/log2journal.d/nginx-combined.yaml" +test_log2journal 6 "${tests}/logfmt.log" "${tests}/logfmt.output" -f "${tests}/logfmt.yaml" +test_log2journal 7 "${tests}/logfmt.log" "${tests}/default.output" -f "${script_dir}/log2journal.d/default.yaml" diff --git a/libnetdata/facets/facets.c b/libnetdata/facets/facets.c index 68695ef205..4a5f5442bf 100644 --- a/libnetdata/facets/facets.c +++ b/libnetdata/facets/facets.c @@ -99,8 +99,37 @@ static inline bool is_valid_string_hash(const char *s) { } // ---------------------------------------------------------------------------- +// hashtable for FACET_VALUE + +// cleanup hashtable defines +#undef SIMPLE_HASHTABLE_SORT_FUNCTION +#undef SIMPLE_HASHTABLE_VALUE_TYPE +#undef SIMPLE_HASHTABLE_NAME +#undef NETDATA_SIMPLE_HASHTABLE_H + +struct facet_value; +// #define SIMPLE_HASHTABLE_SORT_FUNCTION compare_facet_value +#define SIMPLE_HASHTABLE_VALUE_TYPE struct facet_value +#define SIMPLE_HASHTABLE_NAME _VALUE #include "../simple_hashtable.h" +// ---------------------------------------------------------------------------- +// hashtable for FACET_KEY + +// cleanup hashtable defines +#undef SIMPLE_HASHTABLE_SORT_FUNCTION +#undef SIMPLE_HASHTABLE_VALUE_TYPE +#undef SIMPLE_HASHTABLE_NAME +#undef NETDATA_SIMPLE_HASHTABLE_H + +struct facet_key; +// #define SIMPLE_HASHTABLE_SORT_FUNCTION compare_facet_key +#define SIMPLE_HASHTABLE_VALUE_TYPE struct facet_key +#define SIMPLE_HASHTABLE_NAME _KEY +#include "../simple_hashtable.h" + +// ---------------------------------------------------------------------------- + typedef struct facet_value { FACETS_HASH hash; const char *name; @@ -157,7 +186,7 @@ struct facet_key { bool enabled; uint32_t used; FACET_VALUE *ll; - SIMPLE_HASHTABLE ht; + SIMPLE_HASHTABLE_VALUE ht; } values; struct { @@ -216,7 +245,7 @@ struct facets { struct { size_t count; FACET_KEY *ll; - SIMPLE_HASHTABLE ht; + SIMPLE_HASHTABLE_KEY ht; } keys; struct { @@ -347,7 +376,7 @@ static inline bool facets_key_is_facet(FACETS *facets, FACET_KEY *k); static inline void FACETS_VALUES_INDEX_CREATE(FACET_KEY *k) { k->values.ll = NULL; k->values.used = 0; - simple_hashtable_init(&k->values.ht, FACETS_VALUES_HASHTABLE_ENTRIES); + simple_hashtable_init_VALUE(&k->values.ht, FACETS_VALUES_HASHTABLE_ENTRIES); } static inline void FACETS_VALUES_INDEX_DESTROY(FACET_KEY *k) { @@ -363,7 +392,7 @@ static inline void FACETS_VALUES_INDEX_DESTROY(FACET_KEY *k) { k->values.used = 0; k->values.enabled = false; - simple_hashtable_free(&k->values.ht); + simple_hashtable_destroy_VALUE(&k->values.ht); } static inline const char *facets_key_get_value(FACET_KEY *k) { @@ -410,17 +439,17 @@ static inline void FACET_VALUE_ADD_CONFLICT(FACET_KEY *k, FACET_VALUE *v, const } static inline FACET_VALUE *FACET_VALUE_GET_FROM_INDEX(FACET_KEY *k, FACETS_HASH hash) { - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(&k->values.ht, hash, true); - return slot->data; + SIMPLE_HASHTABLE_SLOT_VALUE *slot = simple_hashtable_get_slot_VALUE(&k->values.ht, hash, true); + return SIMPLE_HASHTABLE_SLOT_DATA(slot); } static inline FACET_VALUE *FACET_VALUE_ADD_TO_INDEX(FACET_KEY *k, const FACET_VALUE * const tv) { - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(&k->values.ht, tv->hash, true); + SIMPLE_HASHTABLE_SLOT_VALUE *slot = simple_hashtable_get_slot_VALUE(&k->values.ht, tv->hash, true); - if(slot->data) { + if(SIMPLE_HASHTABLE_SLOT_DATA(slot)) { // already exists - FACET_VALUE *v = slot->data; + FACET_VALUE *v = SIMPLE_HASHTABLE_SLOT_DATA(slot); FACET_VALUE_ADD_CONFLICT(k, v, tv); return v; } @@ -428,9 +457,7 @@ static inline FACET_VALUE *FACET_VALUE_ADD_TO_INDEX(FACET_KEY *k, const FACET_VA // we have to add it FACET_VALUE *v = mallocz(sizeof(*v)); - slot->hash = tv->hash; - slot->data = v; - k->values.ht.used++; + simple_hashtable_set_slot_VALUE(&k->values.ht, slot, tv->hash, v); memcpy(v, tv, sizeof(*v)); @@ -584,7 +611,7 @@ static inline void FACETS_KEYS_INDEX_CREATE(FACETS *facets) { facets->keys.count = 0; facets->keys_with_values.used = 0; - simple_hashtable_init(&facets->keys.ht, FACETS_KEYS_HASHTABLE_ENTRIES); + simple_hashtable_init_KEY(&facets->keys.ht, FACETS_KEYS_HASHTABLE_ENTRIES); } static inline void FACETS_KEYS_INDEX_DESTROY(FACETS *facets) { @@ -603,12 +630,12 @@ static inline void FACETS_KEYS_INDEX_DESTROY(FACETS *facets) { facets->keys.count = 0; facets->keys_with_values.used = 0; - simple_hashtable_free(&facets->keys.ht); + simple_hashtable_destroy_KEY(&facets->keys.ht); } static inline FACET_KEY *FACETS_KEY_GET_FROM_INDEX(FACETS *facets, FACETS_HASH hash) { - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(&facets->keys.ht, hash, true); - return slot->data; + SIMPLE_HASHTABLE_SLOT_KEY *slot = simple_hashtable_get_slot_KEY(&facets->keys.ht, hash, true); + return SIMPLE_HASHTABLE_SLOT_DATA(slot); } bool facets_key_name_value_length_is_selected(FACETS *facets, const char *key, size_t key_length, const char *value, size_t value_length) { @@ -687,22 +714,20 @@ static inline FACET_KEY *FACETS_KEY_CREATE(FACETS *facets, FACETS_HASH hash, con static inline FACET_KEY *FACETS_KEY_ADD_TO_INDEX(FACETS *facets, FACETS_HASH hash, const char *name, size_t name_length, FACET_KEY_OPTIONS options) { facets->operations.keys.registered++; - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(&facets->keys.ht, hash, true); + SIMPLE_HASHTABLE_SLOT_KEY *slot = simple_hashtable_get_slot_KEY(&facets->keys.ht, hash, true); - if(unlikely(!slot->data)) { + if(unlikely(!SIMPLE_HASHTABLE_SLOT_DATA(slot))) { // we have to add it FACET_KEY *k = FACETS_KEY_CREATE(facets, hash, name, name_length, options); - slot->hash = hash; - slot->data = k; - facets->keys.ht.used++; + simple_hashtable_set_slot_KEY(&facets->keys.ht, slot, hash, k); return k; } // already in the index - FACET_KEY *k = slot->data; + FACET_KEY *k = SIMPLE_HASHTABLE_SLOT_DATA(slot); facet_key_set_name(k, name, name_length); diff --git a/libnetdata/simple_hashtable.h b/libnetdata/simple_hashtable.h index 3fdf012382..f6b6db9068 100644 --- a/libnetdata/simple_hashtable.h +++ b/libnetdata/simple_hashtable.h @@ -3,90 +3,394 @@ #ifndef NETDATA_SIMPLE_HASHTABLE_H #define NETDATA_SIMPLE_HASHTABLE_H +#ifndef XXH_INLINE_ALL +#define XXH_INLINE_ALL +#endif +#include "xxhash.h" + typedef uint64_t SIMPLE_HASHTABLE_HASH; #define SIMPLE_HASHTABLE_HASH_SECOND_HASH_SHIFTS 32 -typedef struct simple_hashtable_slot { +#ifndef SIMPLE_HASHTABLE_NAME +#define SIMPLE_HASHTABLE_NAME +#endif + +#ifndef SIMPLE_HASHTABLE_VALUE_TYPE +#define SIMPLE_HASHTABLE_VALUE_TYPE void +#endif + +// First layer of macro for token concatenation +#define CONCAT_INTERNAL(a, b) a ## b +// Second layer of macro, which ensures proper expansion +#define CONCAT(a, b) CONCAT_INTERNAL(a, b) + +// define names for all structures and structures +#define simple_hashtable_init_named CONCAT(simple_hashtable_init, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_destroy_named CONCAT(simple_hashtable_destroy, SIMPLE_HASHTABLE_NAME) + +#define simple_hashtable_slot_named CONCAT(simple_hashtable_slot, SIMPLE_HASHTABLE_NAME) +#define SIMPLE_HASHTABLE_SLOT_NAMED CONCAT(SIMPLE_HASHTABLE_SLOT, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_named CONCAT(simple_hashtable, SIMPLE_HASHTABLE_NAME) +#define SIMPLE_HASHTABLE_NAMED CONCAT(SIMPLE_HASHTABLE, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_resize_named CONCAT(simple_hashtable_resize, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_get_slot_named CONCAT(simple_hashtable_get_slot, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_del_slot_named CONCAT(simple_hashtable_del_slot, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_set_slot_named CONCAT(simple_hashtable_set_slot, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_first_read_only_named CONCAT(simple_hashtable_first_read_only, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_next_read_only_named CONCAT(simple_hashtable_next_read_only, SIMPLE_HASHTABLE_NAME) + +#define simple_hashtable_sorted_binary_search_named CONCAT(simple_hashtable_sorted_binary_search, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_add_value_sorted_named CONCAT(simple_hashtable_add_value_sorted, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_del_value_sorted_named CONCAT(simple_hashtable_del_value_sorted, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_replace_value_sorted_named CONCAT(simple_hashtable_replace_value_sorted, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_sorted_array_first_read_only_named CONCAT(simple_hashtable_sorted_array_first_read_only, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_sorted_array_next_read_only_named CONCAT(simple_hashtable_sorted_array_next_read_only, SIMPLE_HASHTABLE_NAME) + +typedef struct simple_hashtable_slot_named { SIMPLE_HASHTABLE_HASH hash; - void *data; -} SIMPLE_HASHTABLE_SLOT; + SIMPLE_HASHTABLE_VALUE_TYPE *data; +} SIMPLE_HASHTABLE_SLOT_NAMED; -typedef struct simple_hashtable { +typedef struct simple_hashtable_named { size_t resizes; size_t searches; size_t collisions; + size_t deletions; + size_t deleted; size_t used; size_t size; - SIMPLE_HASHTABLE_SLOT *hashtable; -} SIMPLE_HASHTABLE; + SIMPLE_HASHTABLE_SLOT_NAMED *hashtable; + +#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION + struct { + size_t used; + size_t size; + SIMPLE_HASHTABLE_VALUE_TYPE **array; + } sorted; +#endif +} SIMPLE_HASHTABLE_NAMED; + +#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION +static inline size_t simple_hashtable_sorted_binary_search_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t left = 0, right = ht->sorted.used; + + while (left < right) { + size_t mid = left + (right - left) / 2; + if (SIMPLE_HASHTABLE_SORT_FUNCTION(ht->sorted.array[mid], value) < 0) + left = mid + 1; + else + right = mid; + } + + return left; +} + +static inline void simple_hashtable_add_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t index = simple_hashtable_sorted_binary_search_named(ht, value); + + // Ensure there's enough space in the sorted array + if (ht->sorted.used >= ht->sorted.size) { + size_t size = ht->sorted.size ? ht->sorted.size * 2 : 64; + SIMPLE_HASHTABLE_VALUE_TYPE **array = mallocz(size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + if(ht->sorted.array) { + memcpy(array, ht->sorted.array, ht->sorted.size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + freez(ht->sorted.array); + } + ht->sorted.array = array; + ht->sorted.size = size; + } + + // Use memmove to shift elements and create space for the new element + memmove(&ht->sorted.array[index + 1], &ht->sorted.array[index], (ht->sorted.used - index) * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + + ht->sorted.array[index] = value; + ht->sorted.used++; +} + +static inline void simple_hashtable_del_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t index = simple_hashtable_sorted_binary_search_named(ht, value); + + // Check if the value exists at the found index + assert(index < ht->sorted.used && ht->sorted.array[index] == value); + + // Use memmove to shift elements and close the gap + memmove(&ht->sorted.array[index], &ht->sorted.array[index + 1], (ht->sorted.used - index - 1) * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.used--; +} + +static inline void simple_hashtable_replace_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *old_value, SIMPLE_HASHTABLE_VALUE_TYPE *new_value) { + if(new_value == old_value) + return; + + size_t old_value_index = simple_hashtable_sorted_binary_search_named(ht, old_value); + assert(old_value_index < ht->sorted.used && ht->sorted.array[old_value_index] == old_value); + + int r = SIMPLE_HASHTABLE_SORT_FUNCTION(old_value, new_value); + if(r == 0) { + // Same value, so use the same index + ht->sorted.array[old_value_index] = new_value; + return; + } + + size_t new_value_index = simple_hashtable_sorted_binary_search_named(ht, new_value); + if(old_value_index == new_value_index) { + // Not the same value, but still at the same index + ht->sorted.array[old_value_index] = new_value; + return; + } + else if (old_value_index < new_value_index) { + // The old value is before the new value + size_t shift_start = old_value_index + 1; + size_t shift_end = new_value_index - 1; + size_t shift_size = shift_end - old_value_index; + + memmove(&ht->sorted.array[old_value_index], &ht->sorted.array[shift_start], shift_size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.array[shift_end] = new_value; + } + else { + // The old value is after the new value + size_t shift_start = new_value_index; + size_t shift_end = old_value_index; + size_t shift_size = shift_end - new_value_index; + + memmove(&ht->sorted.array[new_value_index + 1], &ht->sorted.array[shift_start], shift_size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.array[new_value_index] = new_value; + } +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE **simple_hashtable_sorted_array_first_read_only_named(SIMPLE_HASHTABLE_NAMED *ht) { + if (ht->sorted.used > 0) { + return &ht->sorted.array[0]; + } + return NULL; +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE **simple_hashtable_sorted_array_next_read_only_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE **last) { + if (!last) return NULL; + + // Calculate the current position in the sorted array + size_t currentIndex = last - ht->sorted.array; + + // Proceed to the next element if it exists + if (currentIndex + 1 < ht->sorted.used) { + return &ht->sorted.array[currentIndex + 1]; + } + + // If no more elements, return NULL + return NULL; +} + +#define SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY(ht, var, type, name) \ + for (type **(var) = simple_hashtable_sorted_array_first_read_only ## name(ht); \ + var; \ + (var) = simple_hashtable_sorted_array_next_read_only ## name(ht, var)) -static void simple_hashtable_init(SIMPLE_HASHTABLE *ht, size_t size) { - ht->resizes = 0; - ht->used = 0; +#define SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY_VALUE(var) (*(var)) + +#else +static inline void simple_hashtable_add_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *value __maybe_unused) { ; } +static inline void simple_hashtable_del_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *value __maybe_unused) { ; } +static inline void simple_hashtable_replace_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *old_value __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *new_value __maybe_unused) { ; } +#endif + +static void simple_hashtable_init_named(SIMPLE_HASHTABLE_NAMED *ht, size_t size) { + memset(ht, 0, sizeof(*ht)); ht->size = size; ht->hashtable = callocz(ht->size, sizeof(*ht->hashtable)); } -static void simple_hashtable_free(SIMPLE_HASHTABLE *ht) { +static void simple_hashtable_destroy_named(SIMPLE_HASHTABLE_NAMED *ht) { +#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION + freez(ht->sorted.array); +#endif + freez(ht->hashtable); - ht->hashtable = NULL; - ht->size = 0; - ht->used = 0; - ht->resizes = 0; + memset(ht, 0, sizeof(*ht)); } -static inline void simple_hashtable_resize(SIMPLE_HASHTABLE *ht); +static inline void simple_hashtable_resize_named(SIMPLE_HASHTABLE_NAMED *ht); -static inline SIMPLE_HASHTABLE_SLOT *simple_hashtable_get_slot(SIMPLE_HASHTABLE *ht, SIMPLE_HASHTABLE_HASH hash, bool resize) { - // IMPORTANT: - // If the hashtable supported deletions, we would need to have a special slot.data value - // to mark deleted values and assume they are occupied during lookup, but empty during insert. - // But for our case, we don't need it, since we never delete items from the hashtable. +#define SHTS_DATA_UNSET ((void *)NULL) +#define SHTS_DATA_DELETED ((void *)0x01) +#define SHTS_DATA_USERNULL ((void *)0x02) +#define SHTS_IS_UNSET(sl) ((sl)->data == SHTS_DATA_UNSET) +#define SHTS_IS_DELETED(sl) ((sl)->data == SHTS_DATA_DELETED) +#define SHTS_IS_USERNULL(sl) ((sl)->data == SHTS_DATA_USERNULL) +#define SIMPLE_HASHTABLE_SLOT_DATA(sl) ((SHTS_IS_UNSET(sl) || SHTS_IS_DELETED(sl) || SHTS_IS_USERNULL(sl)) ? NULL : (sl)->data) +#define SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl) ((SHTS_IS_UNSET(sl) || SHTS_IS_DELETED(sl)) ? NULL : (sl)->data) +// IMPORTANT +// The pointer returned by this call is valid up to the next call of this function (or the resize one) +// If you need to cache something, cache the hash, not the slot pointer. +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_get_slot_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_HASH hash, bool resize) { ht->searches++; - size_t slot = hash % ht->size; - if(likely(!ht->hashtable[slot].data || ht->hashtable[slot].hash == hash)) - return &ht->hashtable[slot]; + size_t slot; + SIMPLE_HASHTABLE_SLOT_NAMED *sl; + SIMPLE_HASHTABLE_SLOT_NAMED *deleted; + + slot = hash % ht->size; + sl = &ht->hashtable[slot]; + deleted = SHTS_IS_DELETED(sl) ? sl : NULL; + if(likely(!SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl) || sl->hash == hash)) + return (SHTS_IS_UNSET(sl) && deleted) ? deleted : sl; ht->collisions++; - if(unlikely(resize && ht->size <= (ht->used << 4))) { - simple_hashtable_resize(ht); + if(unlikely(resize && (ht->size <= (ht->used << 1) || ht->used >= ht->size))) { + simple_hashtable_resize_named(ht); slot = hash % ht->size; - if(likely(!ht->hashtable[slot].data || ht->hashtable[slot].hash == hash)) - return &ht->hashtable[slot]; + sl = &ht->hashtable[slot]; + deleted = (!deleted && SHTS_IS_DELETED(sl)) ? sl : deleted; + if(likely(!SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl) || sl->hash == hash)) + return (SHTS_IS_UNSET(sl) && deleted) ? deleted : sl; ht->collisions++; } slot = ((hash >> SIMPLE_HASHTABLE_HASH_SECOND_HASH_SHIFTS) + 1) % ht->size; + sl = &ht->hashtable[slot]; + deleted = (!deleted && SHTS_IS_DELETED(sl)) ? sl : deleted; + // Linear probing until we find it - while (ht->hashtable[slot].data && ht->hashtable[slot].hash != hash) { + while (SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl) && sl->hash != hash) { slot = (slot + 1) % ht->size; // Wrap around if necessary + sl = &ht->hashtable[slot]; + deleted = (!deleted && SHTS_IS_DELETED(sl)) ? sl : deleted; ht->collisions++; } - return &ht->hashtable[slot]; + return (SHTS_IS_UNSET(sl) && deleted) ? deleted : sl; +} + +static inline bool simple_hashtable_del_slot_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *sl) { + if(SHTS_IS_UNSET(sl) || SHTS_IS_DELETED(sl)) + return false; + + ht->deletions++; + ht->deleted++; + + simple_hashtable_del_value_sorted_named(ht, SIMPLE_HASHTABLE_SLOT_DATA(sl)); + + sl->data = SHTS_DATA_DELETED; + return true; +} + +static inline void simple_hashtable_set_slot_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *sl, SIMPLE_HASHTABLE_HASH hash, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + if(data == NULL) + data = SHTS_DATA_USERNULL; + + if(unlikely(data == SHTS_DATA_UNSET || data == SHTS_DATA_DELETED)) { + simple_hashtable_del_slot_named(ht, sl); + return; + } + + if(likely(SHTS_IS_UNSET(sl))) { + simple_hashtable_add_value_sorted_named(ht, data); + ht->used++; + } + + else if(unlikely(SHTS_IS_DELETED(sl))) { + ht->deleted--; + } + + else + simple_hashtable_replace_value_sorted_named(ht, SIMPLE_HASHTABLE_SLOT_DATA(sl), data); + + sl->hash = hash; + sl->data = data; } -static inline void simple_hashtable_resize(SIMPLE_HASHTABLE *ht) { - SIMPLE_HASHTABLE_SLOT *old = ht->hashtable; +// IMPORTANT +// this call invalidates all SIMPLE_HASHTABLE_SLOT_NAMED pointers +static inline void simple_hashtable_resize_named(SIMPLE_HASHTABLE_NAMED *ht) { + SIMPLE_HASHTABLE_SLOT_NAMED *old = ht->hashtable; size_t old_size = ht->size; ht->resizes++; - ht->size = (ht->size << 3) - 1; + ht->size = (ht->size << 1) - ((ht->size > 16) ? 1 : 0); ht->hashtable = callocz(ht->size, sizeof(*ht->hashtable)); + ht->used = ht->deleted = 0; for(size_t i = 0 ; i < old_size ; i++) { - if(!old[i].data) + if(!SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(&old[i])) continue; - SIMPLE_HASHTABLE_SLOT *slot = simple_hashtable_get_slot(ht, old[i].hash, false); + SIMPLE_HASHTABLE_SLOT_NAMED *slot = simple_hashtable_get_slot_named(ht, old[i].hash, false); *slot = old[i]; + ht->used++; } freez(old); } +// ---------------------------------------------------------------------------- +// hashtable traversal, in read-only mode +// the hashtable should not be modified while the traversal is taking place + +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_first_read_only_named(SIMPLE_HASHTABLE_NAMED *ht) { + for(size_t i = 0; i < ht->used ;i++) { + SIMPLE_HASHTABLE_SLOT_NAMED *sl = &ht->hashtable[i]; + if(!SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl)) + return sl; + } + + return NULL; +} + +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_next_read_only_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *last) { + if (!last) return NULL; + + // Calculate the current position in the array + size_t currentIndex = last - ht->hashtable; + + // Iterate over the hashtable starting from the next element + for (size_t i = currentIndex + 1; i < ht->size; i++) { + SIMPLE_HASHTABLE_SLOT_NAMED *sl = &ht->hashtable[i]; + if (!SIMPLE_HASHTABLE_SLOT_UNSET_OR_DELETED(sl)) { + return sl; + } + } + + // If no more data slots are found, return NULL + return NULL; +} + +#define SIMPLE_HASHTABLE_FOREACH_READ_ONLY(ht, var, name) \ + for(struct simple_hashtable_slot ## name *(var) = simple_hashtable_first_read_only ## name(ht); \ + var; \ + (var) = simple_hashtable_next_read_only ## name(ht, var)) + +#define SIMPLE_HASHTABLE_FOREACH_READ_ONLY_VALUE(var) SIMPLE_HASHTABLE_SLOT_DATA(var) + +// ---------------------------------------------------------------------------- +// high level implementation + +#ifdef SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION + +#define simple_hashtable_set_named CONCAT(simple_hashtable_set, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_get_named CONCAT(simple_hashtable_get, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_del_named CONCAT(simple_hashtable_del, SIMPLE_HASHTABLE_NAME) + +static inline SIMPLE_HASHTABLE_VALUE_TYPE *simple_hashtable_set_named(SIMPLE_HASHTABLE_NAMED *ht, void *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits(key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, true); + simple_hashtable_set_slot_named(ht, sl, hash, data); + return SIMPLE_HASHTABLE_SLOT_DATA(sl); +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE *simple_hashtable_get_named(SIMPLE_HASHTABLE_NAMED *ht, void *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits(key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, true); + return SIMPLE_HASHTABLE_SLOT_DATA(sl); +} + +static inline bool simple_hashtable_del_named(SIMPLE_HASHTABLE_NAMED *ht, void *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits(key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, true); + return simple_hashtable_del_slot_named(ht, sl); +} + +#endif // SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION + #endif //NETDATA_SIMPLE_HASHTABLE_H -- cgit v1.2.3