author Costa Tsaousis <costa@tsaousis.gr> 2018-10-18 17:31:52 +0300
committer GitHub <noreply@github.com> 2018-10-18 17:31:52 +0300
commit e76aac74e69c7dd03060e800e206eee777661a0c (patch)
tree fa8e082b99d4d85c950f3d5d25385f9d14980c6f
parent 15408ace0c2a81986026dd66446bb216c9b96eb6 (diff)
moved related wiki pages into the repo (#4428)
* moved related wiki pages into the repo
* updated web server docs
* fixed typos
-rw-r--r--  .gitignore  2
-rwxr-xr-x  CMakeLists.txt  4
-rw-r--r--  Makefile.am  4
-rw-r--r--  collectors/apps.plugin/README.md  182
-rw-r--r--  collectors/cgroups.plugin/Makefile.am  1
-rw-r--r--  collectors/cgroups.plugin/README.md  187
-rw-r--r--  collectors/diskspace.plugin/README.md  2
-rw-r--r--  collectors/plugins.d/README.md  12
-rw-r--r--  collectors/proc.plugin/README.md  4
-rw-r--r--  collectors/tc.plugin/README.md  180
-rw-r--r--  configure.ac  2
-rw-r--r--  daemon/README.md  445
-rw-r--r--  database/README.md  206
-rw-r--r--  health/Makefile.am  18
-rw-r--r--  health/README.md  657
-rw-r--r--  health/notifications/Makefile.am  45
-rw-r--r--  health/notifications/README.md  60
-rwxr-xr-x  health/notifications/alarm-email.sh (renamed from health/alarm-email.sh)  0
-rwxr-xr-x  health/notifications/alarm-notify.sh.in (renamed from health/alarm-notify.sh.in)  0
-rwxr-xr-x  health/notifications/alarm-test.sh (renamed from health/alarm-test.sh)  0
-rw-r--r--  health/notifications/alerta/Makefile.inc  12
-rw-r--r--  health/notifications/alerta/README.md  236
-rw-r--r--  health/notifications/awssns/Makefile.inc  12
-rw-r--r--  health/notifications/awssns/README.md  31
-rw-r--r--  health/notifications/discord/Makefile.inc  12
-rw-r--r--  health/notifications/discord/README.md  44
-rw-r--r--  health/notifications/email/Makefile.inc  12
-rw-r--r--  health/notifications/email/README.md  31
-rw-r--r--  health/notifications/flock/Makefile.inc  12
-rw-r--r--  health/notifications/flock/README.md  31
-rwxr-xr-x  health/notifications/health_alarm_notify.conf (renamed from health/health_alarm_notify.conf)  0
-rw-r--r--  health/notifications/health_email_recipients.conf (renamed from health/health_email_recipients.conf)  0
-rw-r--r--  health/notifications/irc/Makefile.inc  12
-rw-r--r--  health/notifications/irc/README.md  73
-rw-r--r--  health/notifications/kavenegar/Makefile.inc  12
-rw-r--r--  health/notifications/kavenegar/README.md  39
-rw-r--r--  health/notifications/messagebird/Makefile.inc  12
-rw-r--r--  health/notifications/messagebird/README.md  38
-rw-r--r--  health/notifications/pagerduty/Makefile.inc  12
-rw-r--r--  health/notifications/pagerduty/README.md  34
-rw-r--r--  health/notifications/pushbullet/Makefile.inc  12
-rw-r--r--  health/notifications/pushbullet/README.md  42
-rw-r--r--  health/notifications/pushover/Makefile.inc  12
-rw-r--r--  health/notifications/pushover/README.md  17
-rw-r--r--  health/notifications/rocketchat/Makefile.inc  12
-rw-r--r--  health/notifications/rocketchat/README.md  46
-rw-r--r--  health/notifications/slack/Makefile.inc  12
-rw-r--r--  health/notifications/slack/README.md  45
-rw-r--r--  health/notifications/syslog/Makefile.inc  12
-rw-r--r--  health/notifications/syslog/README.md  23
-rw-r--r--  health/notifications/telegram/Makefile.inc  12
-rw-r--r--  health/notifications/telegram/README.md  19
-rw-r--r--  health/notifications/twilio/Makefile.inc  12
-rw-r--r--  health/notifications/twilio/README.md  40
-rw-r--r--  health/notifications/web/Makefile.inc  12
-rw-r--r--  health/notifications/web/README.md  6
-rw-r--r--  libnetdata/storage_number/README.md  10
-rw-r--r--  registry/README.md  152
-rw-r--r--  streaming/README.md  413
-rw-r--r--  web/api/Makefile.am  4
-rw-r--r--  web/api/README.md  83
-rw-r--r--  web/api/badges/Makefile.am  8
-rw-r--r--  web/api/badges/README.md  324
-rw-r--r--  web/api/badges/web_buffer_svg.c (renamed from web/api/web_buffer_svg.c)  0
-rw-r--r--  web/api/badges/web_buffer_svg.h (renamed from web/api/web_buffer_svg.h)  2
-rw-r--r--  web/api/web_api_v1.h  2
-rw-r--r--  web/server/README.md  107
-rw-r--r--  web/server/multi/README.md  8
-rw-r--r--  web/server/single/README.md  6
-rw-r--r--  web/server/static/README.md  9
70 files changed, 4082 insertions, 56 deletions
diff --git a/.gitignore b/.gitignore
index 2011a7ad5d..04024e6490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,7 +87,7 @@ system/netdata.plist
system/netdata-freebsd
system/edit-config
-health/alarm-notify.sh
+health/notifications/alarm-notify.sh
collectors/cgroups.plugin/cgroup-name.sh
collectors/tc.plugin/tc-qos-helper.sh
collectors/charts.d.plugin/charts.d.plugin
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e16e6262f..e6e2f5fb42 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -368,8 +368,8 @@ set(API_PLUGIN_FILES
web/api/rrd2json.h
web/api/web_api_v1.c
web/api/web_api_v1.h
- web/api/web_buffer_svg.c
- web/api/web_buffer_svg.h
+ web/api/badges/web_buffer_svg.c
+ web/api/badges/web_buffer_svg.h
)
set(STREAMING_PLUGIN_FILES
diff --git a/Makefile.am b/Makefile.am
index a4865d38c0..5c8e6f444e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -287,8 +287,8 @@ API_PLUGIN_FILES = \
web/api/rrd2json.h \
web/api/web_api_v1.c \
web/api/web_api_v1.h \
- web/api/web_buffer_svg.c \
- web/api/web_buffer_svg.h \
+ web/api/badges/web_buffer_svg.c \
+ web/api/badges/web_buffer_svg.h \
$(NULL)
STREAMING_PLUGIN_FILES = \
diff --git a/collectors/apps.plugin/README.md b/collectors/apps.plugin/README.md
index a2cb9a70d6..05680efe8c 100644
--- a/collectors/apps.plugin/README.md
+++ b/collectors/apps.plugin/README.md
@@ -22,22 +22,6 @@ utilization of exit processes. Their utilization is accounted at their currently
So, `apps.plugin` is perfectly able to measure the resources used by shell scripts and other processes
that fork/spawn other short lived processes hundreds of times per second.
-For example, ssh to a server running netdata and execute this:
-
-```sh
-while true; do ls -l /var/run >/dev/null; done
-```
-
-All the console tools will report that a a CPU core is 100% used, but they will fail to identify which
-process is using all that CPU (because there is no single process using it - thousands of `ls` per second
-are using it). Netdata however, will be able to identify that `ssh` is using it
-(`ssh` is the parent process group defined in its [default config](apps_groups.conf)):
-
-![](https://cloud.githubusercontent.com/assets/2662304/21076220/c9687848-bf2e-11e6-8d81-348592c5aca2.png)
-
-This feature makes `apps.plugin` unique in narrowing down the list of offending processes that may be
-responsible for slow downs, or abusing system resources.
-
## Charts
`apps.plugin` provides charts for 3 sections:
@@ -221,4 +205,168 @@ Examples below for process group `sql`:
- Open Sockets ![image](http://registry.my-netdata.io/api/v1/badge.svg?chart=apps.sockets&dimensions=sql&value_color=green%3E=3%7Cred)
-For more information about badges check [Generating Badges](https://github.com/netdata/netdata/wiki/Generating-Badges)
\ No newline at end of file
+For more information about badges, check [Generating Badges](../../web/api/badges)
+
+## Comparison with console tools
+
+SSH into a server running netdata and execute this:
+
+```sh
+while true; do ls -l /var/run >/dev/null; done
+```
+
+On most systems `/var/run` is a `tmpfs` device, so there is nothing to stop this command
+from entirely consuming one of the machine's CPU cores.
+
+As we will see below, **none** of the console performance monitoring tools can report that this
+command is using 100% CPU. They do, of course, report that the CPU is busy, but **they fail to
+identify the process that consumes so much CPU**.
+
+Here is what common Linux console monitoring tools report:
+
+#### top
+
+`top` reports that `bash` is using just 14%.
+
+If you check the total system CPU utilization, it says there is no idle CPU at all, but `top`
+fails to provide a breakdown of the CPU consumption in the system. The sum of the CPU utilization
+of all processes reported by `top` is just 15.6%.
+
+```
+top - 18:46:28 up 3 days, 20:14, 2 users, load average: 0.22, 0.05, 0.02
+Tasks: 76 total, 2 running, 74 sleeping, 0 stopped, 0 zombie
+%Cpu(s): 32.8 us, 65.6 sy, 0.0 ni, 0.0 id, 0.0 wa, 1.3 hi, 0.3 si, 0.0 st
+KiB Mem : 1016576 total, 244112 free, 52012 used, 720452 buff/cache
+KiB Swap: 0 total, 0 free, 0 used. 753712 avail Mem
+
+ PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
+12789 root 20 0 14980 4180 3020 S 14.0 0.4 0:02.82 bash
+ 9 root 20 0 0 0 0 S 1.0 0.0 0:22.36 rcuos/0
+ 642 netdata 20 0 132024 20112 2660 S 0.3 2.0 14:26.29 netdata
+12522 netdata 20 0 9508 2476 1828 S 0.3 0.2 0:02.26 apps.plugin
+ 1 root 20 0 67196 10216 7500 S 0.0 1.0 0:04.83 systemd
+ 2 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kthreadd
+```
+
+#### htop
+
+Exactly like `top`, `htop` provides an incomplete breakdown of the system CPU utilization.
+
+```
+ CPU[||||||||||||||||||||||||100.0%] Tasks: 27, 11 thr; 2 running
+ Mem[||||||||||||||||||||85.4M/993M] Load average: 1.16 0.88 0.90
+ Swp[ 0K/0K] Uptime: 3 days, 21:37:03
+
+ PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
+12789 root 20 0 15104 4484 3208 S 14.0 0.4 10:57.15 -bash
+ 7024 netdata 20 0 9544 2480 1744 S 0.7 0.2 0:00.88 /usr/libexec/netd
+ 7009 netdata 20 0 138M 21016 2712 S 0.7 2.1 0:00.89 /usr/sbin/netdata
+ 7012 netdata 20 0 138M 21016 2712 S 0.0 2.1 0:00.31 /usr/sbin/netdata
+ 563 root 20 0 308M 202M 202M S 0.0 20.4 1:00.81 /usr/lib/systemd/
+ 7019 netdata 20 0 138M 21016 2712 S 0.0 2.1 0:00.14 /usr/sbin/netdata
+```
+
+#### atop
+
+`atop` also fails to break down CPU usage.
+
+```
+ATOP - localhost 2016/12/10 20:11:27 ----------- 10s elapsed
+PRC | sys 1.13s | user 0.43s | #proc 75 | #zombie 0 | #exit 5383 |
+CPU | sys 67% | user 31% | irq 2% | idle 0% | wait 0% |
+CPL | avg1 1.34 | avg5 1.05 | avg15 0.96 | csw 51346 | intr 10508 |
+MEM | tot 992.8M | free 211.5M | cache 470.0M | buff 87.2M | slab 164.7M |
+SWP | tot 0.0M | free 0.0M | | vmcom 207.6M | vmlim 496.4M |
+DSK | vda | busy 0% | read 0 | write 4 | avio 1.50 ms |
+NET | transport | tcpi 16 | tcpo 15 | udpi 0 | udpo 0 |
+NET | network | ipi 16 | ipo 15 | ipfrw 0 | deliv 16 |
+NET | eth0 ---- | pcki 16 | pcko 15 | si 1 Kbps | so 4 Kbps |
+
+ PID SYSCPU USRCPU VGROW RGROW RDDSK WRDSK ST EXC S CPU CMD 1/600
+12789 0.98s 0.40s 0K 0K 0K 336K -- - S 14% bash
+ 9 0.08s 0.00s 0K 0K 0K 0K -- - S 1% rcuos/0
+ 7024 0.03s 0.00s 0K 0K 0K 0K -- - S 0% apps.plugin
+ 7009 0.01s 0.01s 0K 0K 0K 4K -- - S 0% netdata
+```
+
+#### glances
+
+And the same is true for `glances`. The system runs at 100%, but the per-process utilization
+`glances` reports sums to only 17%.
+
+Note also that, being a `python` program, `glances` itself uses 1.6% CPU while it runs.
+
+
+```
+localhost Uptime: 3 days, 21:42:00
+
+CPU [100.0%] CPU 100.0% MEM 23.7% SWAP 0.0% LOAD 1-core
+MEM [ 23.7%] user: 30.9% total: 993M total: 0 1 min: 1.18
+SWAP [ 0.0%] system: 67.8% used: 236M used: 0 5 min: 1.08
+ idle: 0.0% free: 757M free: 0 15 min: 1.00
+
+NETWORK Rx/s Tx/s TASKS 75 (90 thr), 1 run, 74 slp, 0 oth
+eth0 168b 2Kb
+eth1 0b 0b CPU% MEM% PID USER NI S Command
+lo 0b 0b 13.5 0.4 12789 root 0 S -bash
+ 1.6 2.2 7025 root 0 R /usr/bin/python /u
+DISK I/O R/s W/s 1.0 0.0 9 root 0 S rcuos/0
+vda1 0 4K 0.3 0.2 7024 netdata 0 S /usr/libexec/netda
+ 0.3 0.0 7 root 0 S rcu_sched
+FILE SYS Used Total 0.3 2.1 7009 netdata 0 S /usr/sbin/netdata
+/ (vda1) 1.56G 29.5G 0.0 0.0 17 root 0 S oom_reaper
+```
+
+#### why does this happen?
+
+All the console tools report usage based on the processes found running *at the moment they
+examine the process tree*. So, they see just one `ls` command, which is actually very quick and
+uses very little CPU. But the shell is spawning hundreds of them, one after another
+(much like shell scripts do).
+
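+You can verify this yourself: while the loop above is running, sample the process table a few
+times from another terminal. Each sample catches at most a couple of short-lived `ls` processes,
+even though thousands of them are spawned every second (an illustrative snippet; the counts you
+get will vary):
+
+```sh
+# count how many `ls` processes exist at each sampling instant
+for i in 1 2 3; do pgrep -cx ls; sleep 1; done
+```
+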
+#### what does netdata report?
+
+The total CPU utilization of the system:
+
+![image](https://cloud.githubusercontent.com/assets/2662304/21076212/9198e5a6-bf2e-11e6-9bc0-6bdea25befb2.png)
+<br/>_**Figure 1**: The system overview section at netdata, just a few seconds after the command was run_
+
+And in the Applications section, `apps.plugin` breaks down CPU usage per application:
+
+![image](https://cloud.githubusercontent.com/assets/2662304/21076220/c9687848-bf2e-11e6-8d81-348592c5aca2.png)
+<br/>_**Figure 2**: The Applications section at netdata, just a few seconds after the command was run_
+
+So, the `ssh` session is using 95% CPU time.
+
+Why `ssh`?
+
+`apps.plugin` groups all processes based on its configuration file
+[`/etc/netdata/apps_groups.conf`](apps_groups.conf)
+(to edit it on your system run `/etc/netdata/edit-config apps_groups.conf`).
+The default configuration has nothing for `bash`, but it does have an entry for `sshd`, so netdata
+accumulates all ssh sessions into a chart dimension called `ssh`. This includes all the processes in
+the process tree of `sshd`, **including the exited children**.
+
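+For illustration, entries in `apps_groups.conf` map a group name to a list of process name
+patterns. A hypothetical excerpt (check the file shipped with your netdata for the actual
+defaults) could look like this:
+
+```
+# hypothetical excerpt - the shipped apps_groups.conf is the authoritative reference
+ssh: ssh*
+sql: mysqld* mariad* postgres*
+```
+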
+> Distributions based on `systemd` provide another way to get cpu utilization per user session
+> or running service: control groups (cgroups), commonly used as part of containers.
+> `apps.plugin` does not use these mechanisms. The process grouping made by `apps.plugin` works
+> on any Linux, `systemd` based or not.
+
+#### a more technical description of how netdata works
+
+netdata reads `/proc/<pid>/stat` for all processes once per second and extracts `utime` and
+`stime` (user and system cpu time), much like all the console tools do.
+
+But it [also extracts `cutime` and `cstime`](https://github.com/netdata/netdata/blob/62596cc6b906b1564657510ca9135c08f6d4cdda/src/apps_plugin.c#L636-L642)
+that account for the user and system time of the exited children of each process. By keeping a map
+of the whole process tree in memory, it is capable of assigning the right time to every process,
+taking into account all its exited children.
+
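+As a rough sketch (netdata's actual implementation is in C), the same four fields can be read
+from `/proc/<pid>/stat` like this:
+
+```sh
+# utime, stime, cutime and cstime are fields 14-17 of /proc/<pid>/stat (in clock ticks);
+# this naive awk breaks if the comm field (field 2) contains spaces
+awk '{print "utime="$14, "stime="$15, "cutime="$16, "cstime="$17}' /proc/self/stat
+```
+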
+This accounting is tricky: a process may have been running for 1 hour, but once it exits its parent
+should not receive that whole hour of cpu time in just 1 second - the cpu time already reported for
+it in previous iterations has to be subtracted.
+
+It is even trickier, because walking through the entire process tree takes some time itself. So,
+if you sum the CPU utilization of all processes, you might end up with more CPU time than the
+reported total cpu time of the system. netdata solves this by scaling the per-process cpu utilization
+to match the total of the system. [Netdata adds charts that document this normalization](https://london.my-netdata.io/default.html#menu_netdata_submenu_apps_plugin).
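+
+As a toy illustration of this normalization (not netdata's code), scaling a per-process sum of
+105% down to a system total of 100% looks like this:
+
+```sh
+# three processes measured at 40%, 35% and 30% (sum 105%) are scaled to a 100% system total
+echo "40 35 30" | awk -v total=100 '{ sum=$1+$2+$3; for (i=1; i<=3; i++) printf "%.1f%% ", $i*total/sum; print "" }'
+```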
diff --git a/collectors/cgroups.plugin/Makefile.am b/collectors/cgroups.plugin/Makefile.am
index fd878049d0..eb3214ab27 100644
--- a/collectors/cgroups.plugin/Makefile.am
+++ b/collectors/cgroups.plugin/Makefile.am
@@ -17,4 +17,5 @@ dist_plugins_SCRIPTS = \
dist_noinst_DATA = \
cgroup-name.sh.in \
+ README.md \
$(NULL)
diff --git a/collectors/cgroups.plugin/README.md b/collectors/cgroups.plugin/README.md
new file mode 100644
index 0000000000..e78aa04406
--- /dev/null
+++ b/collectors/cgroups.plugin/README.md
@@ -0,0 +1,187 @@
+# cgroups.plugin
+
+You can monitor containers and virtual machines using **cgroups**.
+
+cgroups (or control groups) are a Linux kernel feature that provides accounting and resource usage limiting for processes. When cgroups are bundled with namespaces (i.e. isolation), they form what we usually call **containers**.
+
+cgroups are hierarchical, meaning that cgroups can contain child cgroups, which can contain more cgroups, etc. All accounting is reported (and resource usage limits are applied) also in a hierarchical way.
+
+To visualize cgroup metrics, netdata provides configuration for cherry-picking the cgroups of interest. By default (without any configuration) netdata should pick **systemd services**, all kinds of **containers** (lxc, docker, etc) and **virtual machines** spawned by managers that register them with cgroups (qemu, libvirt, etc).
+
+## configuring netdata for cgroups
+
+For each cgroup available in the system, netdata provides this configuration:
+
+```
+[plugin:cgroups]
+ enable cgroup XXX = yes | no
+```
+
+It also uses a few pattern lists to provide a sane default (`yes` or `no`) for each of them.
+
+Below we see how this works.
+
+### how netdata finds the available cgroups
+
+Linux exposes resource usage reporting and provides dynamic configuration for cgroups, using virtual files, usually under `/sys/fs/cgroup`. netdata reads `/proc/self/mountinfo` to detect the exact mount point of cgroups. netdata also allows manual configuration of this mount point, using these settings:
+
+```
+[plugin:cgroups]
+ check for new cgroups every = 10
+ path to /sys/fs/cgroup/cpuacct = /sys/fs/cgroup/cpuacct
+ path to /sys/fs/cgroup/blkio = /sys/fs/cgroup/blkio
+ path to /sys/fs/cgroup/memory = /sys/fs/cgroup/memory
+ path to /sys/fs/cgroup/devices = /sys/fs/cgroup/devices
+```
+
+netdata rescans these directories for added or removed cgroups every `check for new cgroups every` seconds.
+
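+To quickly see what netdata will detect on your system, you can inspect the cgroup mount points
+yourself (an independent check, not something netdata requires):
+
+```sh
+# list the cgroup mount points the kernel currently exposes
+grep cgroup /proc/self/mountinfo
+```
+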
+### hierarchical search for cgroups
+
+Since cgroups are hierarchical, for each of the directories shown above, netdata walks through the subdirectories recursively searching for cgroups (each subdirectory is another cgroup).
+
+For each of the directories found, netdata provides a configuration variable:
+
+```
+[plugin:cgroups]
+ search for cgroups under PATH = yes | no
+```
+
+To provide a sane default for this setting, netdata uses the following pattern list (patterns starting with `!` give a negative match and their order is important: the first matching a path will be used):
+
+```
+[plugin:cgroups]
+ search for cgroups in subpaths matching = !*/init.scope !*-qemu !/init.scope !/system !/systemd !/user !/user.slice *
+```
+
+So, we disable checking for **child cgroups** in systemd internal cgroups ([systemd services are monitored by netdata](https://github.com/netdata/netdata/wiki/monitoring-systemd-services)), user cgroups (normally used for desktop and remote user sessions), qemu virtual machines (child cgroups of virtual machines) and `init.scope`. All others are enabled.
+
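+To get a feel for how such a pattern list behaves (`!` negates, first match wins), here is a small
+shell sketch that mimics the logic - an illustration only, not netdata's implementation:
+
+```sh
+# evaluate a path against a simple pattern list; the first matching pattern decides
+decide() {
+  path="$1"; shift
+  for pattern in "$@"; do
+    case "$pattern" in
+      !*) case "$path" in ${pattern#!}) echo "$path -> no (matched $pattern)"; return;; esac ;;
+      *)  case "$path" in $pattern)     echo "$path -> yes (matched $pattern)"; return;; esac ;;
+    esac
+  done
+  echo "$path -> no (nothing matched)"
+}
+
+decide /user.slice      '!*/init.scope' '!*-qemu' '!/init.scope' '!/system' '!/systemd' '!/user' '!/user.slice' '*'
+decide /lxc/mycontainer '!*/init.scope' '!*-qemu' '!/init.scope' '!/system' '!/systemd' '!/user' '!/user.slice' '*'
+```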
+
+### enabled cgroups
+
+To check if the cgroup is enabled, netdata uses this setting:
+
+```
+[plugin:cgroups]
+ enable cgroup NAME = yes | no
+```
+
+To provide a sane default, netdata uses the following pattern list (it checks the pattern against the path of the cgroup):
+
+```
+[plugin:cgroups]
+ enable by default cgroups matching = !*/init.scope *.scope !*/vcpu* !*/emulator !*.mount !*.partition !*.service !*.slice !*.swap !*.user !/ !/docker !/libvirt !/lxc !/lxc/*/ns !/lxc/*/ns/* !/machine !/qemu !/system !/systemd !/user *
+```
+
+The above provides the default `yes` or `no` setting for the cgroup. However, there is an additional step. In many cases the cgroups found in the `/sys/fs/cgroup` hierarchy are just random numbers, and these numbers are often ephemeral: they change across reboots or sessions.
+
+So, we need to somehow map the paths of the cgroups to names, to provide consistent netdata configuration (i.e. there is no point in saying `enable cgroup 1234 = yes | no` if `1234` is a random number that changes over time - we need a name for the cgroup first, so that `enable cgroup NAME = yes | no` remains consistent).
+
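+For example, docker containers appear in the cgroup hierarchy under their (ephemeral) container
+ids, while on the dashboard we want the container names. You can see the kind of mapping needed
+with a command like this (illustrative; netdata's `cgroup-name.sh` handles this and more):
+
+```sh
+# list running container ids next to their human readable names
+docker ps --format '{{.ID}}  {{.Names}}'
+```
+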
+For this mapping netdata provides 2 configuration options:
+
+```
+[plugin:cgroups]
+ run script to rename cgroups matching = *.scope *docker* *lxc* *qemu* !/ !*.mount !*.partition !*.service !*.slice !*.swap !*.user *
+ script to get cgroup names = /usr/libexec/netdata/plugins.d/cgroup-name.sh
+```
+
+The whole point of the additional pattern list is to limit the number of time