diff options
author | Andrew Moss <1043609+amoss@users.noreply.github.com> | 2020-06-03 08:38:25 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-03 08:38:25 +0200 |
commit | 49719a961d6c079004b65458ea8c5e08ada1c44c (patch) | |
tree | 258b25ac60c403696a72b1589d5fa8634dfc6764 /build_external | |
parent | 1aa2cd7c43f6dd68b4bb43a87eb8b2995687ca9c (diff) |
Fix bugs in streaming and enable support for gap filling (#9214)
This PR adds (inactive) support that we will use to fill the gaps on charts when a receiving agent goes offline and the sender reconnects. The streaming component has been reworked to make the connection bi-directional and fix several outstanding bugs in the area.
* Fixed an incorrect case of version negotiation. Removed fatal() on exhaustion of file descriptors.
* Fixed cases that fell through to polling the socket after closing.
* Fixed locking of data related to sender and receiver in the host structure.
* Added fine-grained locks to reduce contention.
* Added circular buffer to sender to prevent starvation in high-latency conditions.
* Fixed case where agent is a proxy and negotiated different streaming versions with sender and receiver.
* Changed interface to new parser to put the buffering code in streaming.
* Fixed the bug that stopped senders from reconnecting after their socket times out - this was part of the scaling fixes that provide an early shortcut path for rejecting connections without lock contention.
* That shortcut path is replaced by fine-grained locking and a different approach to thread shutdown.
* Added liveness detection to connections to allow selection of the best connection.
Diffstat (limited to 'build_external')
36 files changed, 1219 insertions, 4 deletions
diff --git a/build_external/clean-install-arch-debug.Dockerfile b/build_external/clean-install-arch-debug.Dockerfile new file mode 100644 index 0000000000..44d5b5ee99 --- /dev/null +++ b/build_external/clean-install-arch-debug.Dockerfile @@ -0,0 +1,62 @@ +FROM archlinux/base:latest + +# There is some redundancy between this file and the archlinux Dockerfile in the helper images +# repo and also with the clean-install.Dockefile. Once the help image is availabled on Docker +# Hub this file can be deleted. +RUN echo sdlsjdkls +RUN pacman -Syyu --noconfirm +RUN pacman --noconfirm --needed -S autoconf \ + autoconf-archive \ + autogen \ + automake \ + gcc \ + make \ + git \ + libuv \ + lz4 \ + netcat \ + openssl \ + pkgconfig \ + python \ + libvirt \ + cmake \ + valgrind \ + gdb + +ARG EXTRA_CFLAGS +COPY . /opt/netdata/source +WORKDIR /opt/netdata/source + +RUN git config --global user.email "root@container" +RUN git config --global user.name "Fake root" + +# RUN make distclean -> not safe if tree state changed on host since last config +# Kill everything that is not in .gitignore preserving any fresh changes, i.e. untracked changes will be +# deleted but local changes to tracked files will be preserved. +RUN if git status --porcelain | grep '^[MADRC]'; then \ + git stash && git clean -dxf && (git stash apply || true) \ + else \ + git clean -dxf ; \ + fi + +# Not everybody is updating distclean properly - fix. +RUN find . -name '*.Po' -exec rm \{\} \; +RUN rm -rf autom4te.cache +RUN rm -rf .git/ +RUN find . 
-type f >/opt/netdata/manifest + +RUN CFLAGS="-Og -g -ggdb -Wall -Wextra -Wformat-signedness -fstack-protector-all -DNETDATA_INTERNAL_CHECKS=1\ + -D_FORTIFY_SOURCE=2 -DNETDATA_VERIFY_LOCKS=1 ${EXTRA_CFLAGS}" ./netdata-installer.sh --require-cloud --disable-lto + +RUN ln -sf /dev/stdout /var/log/netdata/access.log +RUN ln -sf /dev/stdout /var/log/netdata/debug.log +RUN ln -sf /dev/stderr /var/log/netdata/error.log + +RUN printf >/opt/netdata/source/gdb_batch '\ +set args -D \n\ +handle SIG32 nostop \n\ +run \n\ +bt' + +#CMD ["/usr/sbin/valgrind", "--leak-check=full", "/usr/sbin/netdata", "-D"] +CMD ["/usr/bin/gdb", "-x", "/opt/netdata/source/gdb_batch", "/usr/sbin/netdata"] diff --git a/build_external/clean-install-arch-extras.Dockerfile b/build_external/clean-install-arch-extras.Dockerfile index 4425283347..b155c1304f 100644 --- a/build_external/clean-install-arch-extras.Dockerfile +++ b/build_external/clean-install-arch-extras.Dockerfile @@ -3,7 +3,7 @@ FROM archlinux/base:latest # There is some redundancy between this file and the archlinux Dockerfile in the helper images # repo and also with the clean-install.Dockefile. Once the help image is availabled on Docker # Hub this file can be deleted. - +RUN echo sdlsjdkls RUN pacman -Syyu --noconfirm RUN pacman --noconfirm --needed -S autoconf \ autoconf-archive \ @@ -20,7 +20,8 @@ RUN pacman --noconfirm --needed -S autoconf \ python \ libvirt \ cmake \ - valgrind + valgrind \ + gdb ARG EXTRA_CFLAGS COPY . /opt/netdata/source @@ -44,12 +45,14 @@ RUN rm -rf autom4te.cache RUN rm -rf .git/ RUN find . 
-type f >/opt/netdata/manifest -RUN CFLAGS="-O1 -ggdb -Wall -Wextra -Wformat-signedness -fstack-protector-all -DNETDATA_INTERNAL_CHECKS=1\ +RUN CFLAGS="-Og -g -ggdb -Wall -Wextra -Wformat-signedness -fstack-protector-all -DNETDATA_INTERNAL_CHECKS=1\ -D_FORTIFY_SOURCE=2 -DNETDATA_VERIFY_LOCKS=1 ${EXTRA_CFLAGS}" ./netdata-installer.sh --require-cloud --disable-lto RUN ln -sf /dev/stdout /var/log/netdata/access.log RUN ln -sf /dev/stdout /var/log/netdata/debug.log RUN ln -sf /dev/stderr /var/log/netdata/error.log +RUN rm /var/lib/netdata/registry/netdata.public.unique.id + CMD ["/usr/sbin/valgrind", "--leak-check=full", "/usr/sbin/netdata", "-D"] diff --git a/build_external/clean-install.Dockerfile b/build_external/clean-install.Dockerfile index bc1bae1f45..18586e8974 100644 --- a/build_external/clean-install.Dockerfile +++ b/build_external/clean-install.Dockerfile @@ -33,4 +33,7 @@ RUN ln -sf /dev/stdout /var/log/netdata/access.log RUN ln -sf /dev/stdout /var/log/netdata/debug.log RUN ln -sf /dev/stderr /var/log/netdata/error.log -CMD ["/usr/sbin/netdata","-D"]
\ No newline at end of file +RUN rm /var/lib/netdata/registry/netdata.public.unique.id + +CMD ["/usr/sbin/netdata","-D"] +ENTRYPOINT [] diff --git a/build_external/projects/aclk-testing/agent-compose.yml b/build_external/scenarios/aclk-testing/agent-compose.yml index 04c357c433..04c357c433 100644 --- a/build_external/projects/aclk-testing/agent-compose.yml +++ b/build_external/scenarios/aclk-testing/agent-compose.yml diff --git a/build_external/projects/aclk-testing/agent-valgrind-compose.yml b/build_external/scenarios/aclk-testing/agent-valgrind-compose.yml index cf38893b30..cf38893b30 100644 --- a/build_external/projects/aclk-testing/agent-valgrind-compose.yml +++ b/build_external/scenarios/aclk-testing/agent-valgrind-compose.yml diff --git a/build_external/projects/aclk-testing/agent_netdata.conf b/build_external/scenarios/aclk-testing/agent_netdata.conf index 5938591a32..5938591a32 100644 --- a/build_external/projects/aclk-testing/agent_netdata.conf +++ b/build_external/scenarios/aclk-testing/agent_netdata.conf diff --git a/build_external/projects/aclk-testing/configureVerneMQ.Dockerfile b/build_external/scenarios/aclk-testing/configureVerneMQ.Dockerfile index 228548c094..228548c094 100644 --- a/build_external/projects/aclk-testing/configureVerneMQ.Dockerfile +++ b/build_external/scenarios/aclk-testing/configureVerneMQ.Dockerfile diff --git a/build_external/projects/aclk-testing/paho-compose.yml b/build_external/scenarios/aclk-testing/paho-compose.yml index 4fc6ce2c49..4fc6ce2c49 100644 --- a/build_external/projects/aclk-testing/paho-compose.yml +++ b/build_external/scenarios/aclk-testing/paho-compose.yml diff --git a/build_external/projects/aclk-testing/paho-inspection.py b/build_external/scenarios/aclk-testing/paho-inspection.py index ec1e1675b3..ec1e1675b3 100644 --- a/build_external/projects/aclk-testing/paho-inspection.py +++ b/build_external/scenarios/aclk-testing/paho-inspection.py diff --git a/build_external/projects/aclk-testing/paho.Dockerfile 
b/build_external/scenarios/aclk-testing/paho.Dockerfile index 77a49e7683..77a49e7683 100644 --- a/build_external/projects/aclk-testing/paho.Dockerfile +++ b/build_external/scenarios/aclk-testing/paho.Dockerfile diff --git a/build_external/projects/aclk-testing/vernemq-compose.yml b/build_external/scenarios/aclk-testing/vernemq-compose.yml index a9f07a546a..a9f07a546a 100644 --- a/build_external/projects/aclk-testing/vernemq-compose.yml +++ b/build_external/scenarios/aclk-testing/vernemq-compose.yml diff --git a/build_external/projects/aclk-testing/vernemq.conf b/build_external/scenarios/aclk-testing/vernemq.conf index 18e8432998..18e8432998 100644 --- a/build_external/projects/aclk-testing/vernemq.conf +++ b/build_external/scenarios/aclk-testing/vernemq.conf diff --git a/build_external/scenarios/gaps_hi/master-compose.yml b/build_external/scenarios/gaps_hi/master-compose.yml new file mode 100644 index 0000000000..05d70c9244 --- /dev/null +++ b/build_external/scenarios/gaps_hi/master-compose.yml @@ -0,0 +1,13 @@ +version: '3.3' +services: + agent_master: + image: debian_10_dev + command: /usr/sbin/netdata -D + ports: + - 21000:19999 + volumes: + - ./master_stream.conf:/etc/netdata/stream.conf:ro + - ./master_guid:/var/lib/netdata/registry/netdata.public.unique.id:ro + - ./min.conf:/etc/netdata/netdata.conf:ro + cap_add: + - SYS_PTRACE diff --git a/build_external/scenarios/gaps_hi/master_guid b/build_external/scenarios/gaps_hi/master_guid new file mode 100644 index 0000000000..fee6f3234d --- /dev/null +++ b/build_external/scenarios/gaps_hi/master_guid @@ -0,0 +1 @@ +00000000-0000-0000-0000-000000000000
\ No newline at end of file diff --git a/build_external/scenarios/gaps_hi/master_stream.conf b/build_external/scenarios/gaps_hi/master_stream.conf new file mode 100644 index 0000000000..600a9fac3b --- /dev/null +++ b/build_external/scenarios/gaps_hi/master_stream.conf @@ -0,0 +1,11 @@ +[00000000-0000-0000-0000-000000000000] + enabled = yes + allow from = * + default history = 3600 + default memory mode = dbengine + health enabled by default = no + + # postpone alarms for a short period after the sender is connected + default postpone alarms on connect seconds = 60 + multiple connections = allow + diff --git a/build_external/scenarios/gaps_hi/middle-compose.yml b/build_external/scenarios/gaps_hi/middle-compose.yml new file mode 100644 index 0000000000..6377a6518f --- /dev/null +++ b/build_external/scenarios/gaps_hi/middle-compose.yml @@ -0,0 +1,13 @@ +version: '3.3' +services: + agent_middle: + image: debian_10_dev + command: /usr/sbin/netdata -D + ports: + - 21001:19999 + volumes: + - ./middle_stream.conf:/etc/netdata/stream.conf:ro + - ./middle_guid:/var/lib/netdata/registry/netdata.public.unique.id:ro + - ./min.conf:/etc/netdata/netdata.conf:ro + cap_add: + - SYS_PTRACE diff --git a/build_external/scenarios/gaps_hi/middle_guid b/build_external/scenarios/gaps_hi/middle_guid new file mode 100644 index 0000000000..f8a43c2286 --- /dev/null +++ b/build_external/scenarios/gaps_hi/middle_guid @@ -0,0 +1 @@ +11111111-1111-1111-1111-111111111111
\ No newline at end of file diff --git a/build_external/scenarios/gaps_hi/middle_stream.conf b/build_external/scenarios/gaps_hi/middle_stream.conf new file mode 100644 index 0000000000..f4546e30d2 --- /dev/null +++ b/build_external/scenarios/gaps_hi/middle_stream.conf @@ -0,0 +1,23 @@ +[stream] + enabled = yes + destination = tcp:agent_master + api key = 00000000-0000-0000-0000-000000000000 + timeout seconds = 60 + default port = 19999 + + send charts matching = * + buffer size bytes = 1048576 + reconnect delay seconds = 5 + initial clock resync iterations = 60 + +[00000000-0000-0000-0000-000000000000] + enabled = yes + allow from = * + default history = 3600 + # default memory mode = ram + + health enabled by default = auto + + # postpone alarms for a short period after the sender is connected + default postpone alarms on connect seconds = 60 + multiple connections = allow diff --git a/build_external/scenarios/gaps_hi/min.conf b/build_external/scenarios/gaps_hi/min.conf new file mode 100644 index 0000000000..83fa23e51b --- /dev/null +++ b/build_external/scenarios/gaps_hi/min.conf @@ -0,0 +1,6 @@ +[global] + debug flags = 0x0000000040000000 + errors flood protection period = 0 +[web] + ssl key = /etc/netdata/ssl/key.pem + ssl certificate = /etc/netdata/ssl/cert.pem diff --git a/build_external/scenarios/gaps_hi/slave-compose.yml b/build_external/scenarios/gaps_hi/slave-compose.yml new file mode 100644 index 0000000000..072d67c141 --- /dev/null +++ b/build_external/scenarios/gaps_hi/slave-compose.yml @@ -0,0 +1,14 @@ +version: '3.3' +services: + agent_slave: + image: debian_10_dev + command: /usr/sbin/netdata -D + #ports: + #- 21002+:19999 + volumes: + - ./slave_stream.conf:/etc/netdata/stream.conf:ro + #- ./slave_guid:/var/lib/netdata/registry/netdata.public.unique.id:ro + - ./min.conf:/etc/netdata/netdata.conf:ro + cap_add: + - SYS_PTRACE + diff --git a/build_external/scenarios/gaps_hi/slave_guid b/build_external/scenarios/gaps_hi/slave_guid new file mode 100644 
index 0000000000..670f7c2376 --- /dev/null +++ b/build_external/scenarios/gaps_hi/slave_guid @@ -0,0 +1 @@ +22222222-2222-2222-2222-222222222222
\ No newline at end of file diff --git a/build_external/scenarios/gaps_hi/slave_stream.conf b/build_external/scenarios/gaps_hi/slave_stream.conf new file mode 100644 index 0000000000..2218c681ea --- /dev/null +++ b/build_external/scenarios/gaps_hi/slave_stream.conf @@ -0,0 +1,11 @@ +[stream] + enabled = yes +# destination = tcp:agent_middle + destination = tcp:192.168.1.2 + api key = 00000000-0000-0000-0000-000000000000 + timeout seconds = 60 + default port = 19999 + send charts matching = * + buffer size bytes = 10485760 + reconnect delay seconds = 5 + initial clock resync iterations = 60 diff --git a/build_external/scenarios/gaps_lo/master-compose.yml b/build_external/scenarios/gaps_lo/master-compose.yml new file mode 100644 index 0000000000..f86d56f636 --- /dev/null +++ b/build_external/scenarios/gaps_lo/master-compose.yml @@ -0,0 +1,13 @@ +version: '3.3' +services: + agent_master: + image: debian_10_dev + command: /usr/sbin/netdata -D + ports: + - 21000:19999 + volumes: + - ./master_stream.conf:/etc/netdata/stream.conf:ro + - ./master_guid:/var/lib/netdata/registry/netdata.public.unique.id:ro + - ./mostly_off.conf:/etc/netdata/netdata.conf:ro + cap_add: + - SYS_PTRACE diff --git a/build_external/scenarios/gaps_lo/master_guid b/build_external/scenarios/gaps_lo/master_guid new file mode 100644 index 0000000000..fee6f3234d --- /dev/null +++ b/build_external/scenarios/gaps_lo/master_guid @@ -0,0 +1 @@ +00000000-0000-0000-0000-000000000000
\ No newline at end of file diff --git a/build_external/scenarios/gaps_lo/master_stream.conf b/build_external/scenarios/gaps_lo/master_stream.conf new file mode 100644 index 0000000000..99611cc5c2 --- /dev/null +++ b/build_external/scenarios/gaps_lo/master_stream.conf @@ -0,0 +1,12 @@ +[00000000-0000-0000-0000-000000000000] + enabled = yes + allow from = * + default history = 3600 + # default memory mode = ram + + health enabled by default = auto + + # postpone alarms for a short period after the sender is connected + default postpone alarms on connect seconds = 60 + multiple connections = allow + diff --git a/build_external/scenarios/gaps_lo/middle-compose.yml b/build_external/scenarios/gaps_lo/middle-compose.yml new file mode 100644 index 0000000000..26c22de062 --- /dev/null +++ b/build_external/scenarios/gaps_lo/middle-compose.yml @@ -0,0 +1,14 @@ +version: '3.3' +services: + agent_middle: + image: arch_extras_dev + #command: /usr/sbin/valgrind --leak-check=full /usr/sbin/netdata -D + command: /usr/sbin/netdata -D + ports: + - 21001:19999 + volumes: + - ./middle_stream.conf:/etc/netdata/stream.conf:ro + - ./middle_guid:/var/lib/netdata/registry/netdata.public.unique.id:ro + - ./mostly_off.conf:/etc/netdata/netdata.conf:ro + cap_add: + - SYS_PTRACE diff --git a/build_external/scenarios/gaps_lo/middle_guid b/build_external/scenarios/gaps_lo/middle_guid new file mode 100644 index 0000000000..f8a43c2286 --- /dev/null +++ b/build_external/scenarios/gaps_lo/middle_guid @@ -0,0 +1 @@ +11111111-1111-1111-1111-111111111111
\ No newline at end of file diff --git a/build_external/scenarios/gaps_lo/middle_stream.conf b/build_external/scenarios/gaps_lo/middle_stream.conf new file mode 100644 index 0000000000..8bbf31e5a4 --- /dev/null +++ b/build_external/scenarios/gaps_lo/middle_stream.conf @@ -0,0 +1,20 @@ +[stream] + enabled = yes + destination = tcp:agent_master + api key = 00000000-0000-0000-0000-000000000000 + timeout seconds = 60 + default port = 19999 + send charts matching = * + buffer size bytes = 1048576 + reconnect delay seconds = 5 + initial clock resync iterations = 60 + +[00000000-0000-0000-0000-000000000000] + enabled = yes + allow from = * + default history = 3600 + # default memory mode = ram + health enabled by default = auto + # postpone alarms for a short period after the sender is connected + default postpone alarms on connect seconds = 60 + multiple connections = allow diff --git a/build_external/scenarios/gaps_lo/mostly_off.conf b/build_external/scenarios/gaps_lo/mostly_off.conf new file mode 100644 index 0000000000..2ac395a9be --- /dev/null +++ b/build_external/scenarios/gaps_lo/mostly_off.conf @@ -0,0 +1,965 @@ +# netdata configuration +# +# You can download the latest version of this file, using: +# +# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# or +# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# +# You can uncomment and change any of the options below. +# The value shown in the commented settings, is the default value. 
+# + +# global netdata configuration + +[global] +debug flags = 0x0000000040000000 +errors flood protection period = 0 + +[plugins] + diskspace = no + cgroups = no + tc = no + idlejitter = no + ioping = no + apps = no + go.d = no + perf = no + fping = no + python.d = no + charts.d = no + node.d = no + nfacct = no + cups = no + freeipmi = no + +[health] +enabled = no + +[statsd] +enabled = no + +[plugin:proc] + /proc/uptime = yes + /proc/loadavg = no + /proc/sys/kernel/random/entropy_avail = no + /proc/pressure = no + /proc/interrupts = no + /proc/softirqs = no + /proc/vmstat = no + /proc/meminfo = no + /sys/kernel/mm/ksm = no + /sys/block/zram = no + /sys/devices/system/edac/mc = no + /sys/devices/system/node = no + /proc/net/dev = no + /proc/net/sockstat = no + /proc/net/sockstat6 = no + /proc/net/netstat = no + /proc/net/snmp = no + /proc/net/snmp6 = no + /proc/net/sctp/snmp = no + /proc/net/softnet_stat = no + /proc/net/ip_vs/stats = no + /proc/net/stat/conntrack = no + /proc/net/stat/synproxy = no + /proc/diskstats = no + /proc/mdstat = no + /proc/net/rpc/nfsd = no + /proc/net/rpc/nfs = no + /proc/spl/kstat/zfs/arcstats = no + /sys/fs/btrfs = no + ipc = no + /sys/class/power_supply = no + + +[plugin:proc:/proc/net/dev:docker0] +enabled = no + +[plugin:proc:/proc/net/dev:br-b87e56f878f1] +enabled = no + +[plugin:proc:/proc/net/dev:enp4s0] +enabled = no + +[plugin:proc:/proc/net/stat/nf_conntrack] +filename to monitor = /proc/net/stat/nf_conntrack +netfilter new connections = no +netfilter connection changes = no +netfilter connection expectations = no +netfilter connection searches = no +netfilter errors = no +netfilter connections = no + +[system.idlejitter] +enabled = no + +[netdata.statsd_metrics] +enabled = no + +[netdata.statsd_useful_metrics] +enabled = no + +[netdata.statsd_events] +enabled = no + +[netdata.statsd_reads] +enabled = no + +[netdata.statsd_bytes] +enabled = no + +[netdata.statsd_packets] +enabled = no + +[netdata.tcp_connects] +enabled = no 
+ +[netdata.tcp_connected] +enabled = no + +[netdata.private_charts] +enabled = no + +[netdata.plugin_statsd_charting_cpu] +enabled = no + +[netdata.plugin_statsd_collector1_cpu] +enabled = no + +[netdata.plugin_tc_cpu] +enabled = no + +[netdata.plugin_tc_time] +enabled = no + +[netdata.runtime_sensors] +enabled = no + +[sensors.coretemp-isa-0000_temperature] +enabled = no + +[sensors.acpitz-acpi-0_temperature] +enabled = no + +[system.cpu] +enabled = yes + +[disk_space._dev] +enabled = no + +[netdata.plugin_cgroups_cpu] +enabled = no + +[netdata.apps_cpu] +enabled = no + +[netdata.apps_sizes] +enabled = no + +[netdata.apps_fix] +enabled = no + +[netdata.apps_children_fix] +enabled = no + +[apps.cpu] +enabled = no + +[apps.mem] +enabled = no + +[apps.vmem] +enabled = no + +[apps.threads] +enabled = no + +[apps.processes] +enabled = no + +[apps.uptime] +enabled = no + +[apps.uptime_min] +enabled = no + +[apps.uptime_avg] +enabled = no + +[apps.uptime_max] +enabled = no + +[apps.cpu_user] +enabled = no + +[apps.cpu_system] +enabled = no + +[apps.swap] +enabled = no |