diff options
author | Vladimir Kobal <vlad@prokk.net> | 2020-03-12 13:28:43 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-12 13:28:43 +0200 |
commit | bc0ca9b1b3cbdd6d1a76d2150080dff79e116e24 (patch) | |
tree | 2d2c59e6f32e831612a257c73e81bfdf5f65c361 /exporting | |
parent | bec7b83e19944a7e844c6c7777f92ccba6d504e3 (diff) |
Add a Prometheus Remote Write connector to the exporting engine (#8292)
* Copy files from the Prometheus remote write backend
* Update the documentation
* Rename backend -> exporting
* Add the connector to the Netdata build
* Separate files for the remote write connector
* Add an initializer and formatters
* Read a connector specific configuration option
* Add a separate function for header sending
* Use labels instead of tags
* Separate write request for every instance
* Add unit tests
Diffstat (limited to 'exporting')
25 files changed, 2254 insertions, 60 deletions
diff --git a/exporting/Makefile.am b/exporting/Makefile.am index 82ae0f77e4..ea34ce516b 100644 --- a/exporting/Makefile.am +++ b/exporting/Makefile.am @@ -8,6 +8,7 @@ SUBDIRS = \ graphite \ json \ opentsdb \ + prometheus \ aws_kinesis \ $(NULL) diff --git a/exporting/aws_kinesis/aws_kinesis.c b/exporting/aws_kinesis/aws_kinesis.c index 2e5da3fad9..4b0d5f74a4 100644 --- a/exporting/aws_kinesis/aws_kinesis.c +++ b/exporting/aws_kinesis/aws_kinesis.c @@ -25,6 +25,9 @@ int init_aws_kinesis_instance(struct instance *instance) instance->end_host_formatting = flush_host_labels; instance->end_batch_formatting = NULL; + instance->send_header = NULL; + instance->check_response = NULL; + instance->buffer = (void *)buffer_create(0); if (!instance->buffer) { error("EXPORTING: cannot create buffer for AWS Kinesis exporting connector instance %s", instance->config.name); diff --git a/exporting/exporting_engine.h b/exporting/exporting_engine.h index c1e63a42eb..6a8296a19b 100644 --- a/exporting/exporting_engine.h +++ b/exporting/exporting_engine.h @@ -13,41 +13,8 @@ extern struct config exporting_config; -#define EXPORTER_DATA_SOURCE "data source" -#define EXPORTER_DATA_SOURCE_DEFAULT "average" - -#define EXPORTER_DESTINATION "destination" -#define EXPORTER_DESTINATION_DEFAULT "localhost" - -#define EXPORTER_UPDATE_EVERY "update every" -#define EXPORTER_UPDATE_EVERY_DEFAULT 10 - -#define EXPORTER_BUF_ONFAIL "buffer on failures" -#define EXPORTER_BUF_ONFAIL_DEFAULT 10 - -#define EXPORTER_TIMEOUT_MS "timeout ms" -#define EXPORTER_TIMEOUT_MS_DEFAULT 10000 - -#define EXPORTER_SEND_CHART_MATCH "send charts matching" -#define EXPORTER_SEND_CHART_MATCH_DEFAULT "*" - -#define EXPORTER_SEND_HOST_MATCH "send hosts matching" -#define EXPORTER_SEND_HOST_MATCH_DEFAULT "localhost *" - -#define EXPORTER_SEND_CONFIGURED_LABELS "send configured labels" -#define EXPORTER_SEND_CONFIGURED_LABELS_DEFAULT CONFIG_BOOLEAN_YES - -#define EXPORTER_SEND_AUTOMATIC_LABELS "send automatic labels" -#define 
EXPORTER_SEND_AUTOMATIC_LABELS_DEFAULT CONFIG_BOOLEAN_NO - -#define EXPORTER_SEND_NAMES "send names instead of ids" -#define EXPORTER_SEND_NAMES_DEFAULT CONFIG_BOOLEAN_YES - -#define EXPORTER_KINESIS_STREAM_NAME "stream name" -#define EXPORTER_KINESIS_STREAM_NAME_DEFAULT "netdata" - -#define EXPORTER_AWS_ACCESS_KEY_ID "aws_access_key_id" -#define EXPORTER_AWS_SECRET_ACCESS_KEY "aws_secret_access_key" +#define EXPORTING_UPDATE_EVERY_OPTION_NAME "update every" +#define EXPORTING_UPDATE_EVERY_DEFAULT 10 typedef enum exporting_options { EXPORTING_OPTION_NONE = 0, @@ -98,6 +65,10 @@ struct simple_connector_config { int default_port; }; +struct prometheus_remote_write_specific_config { + char *remote_write_path; +}; + struct aws_kinesis_specific_config { char *stream_name; char *auth_key_id; @@ -152,6 +123,9 @@ struct instance { int (*end_host_formatting)(struct instance *instance, RRDHOST *host); int (*end_batch_formatting)(struct instance *instance); + int (*send_header)(int *sock, struct instance *instance); + int (*check_response)(BUFFER *buffer, struct instance *instance); + void *connector_specific_data; size_t index; diff --git a/exporting/graphite/graphite.c b/exporting/graphite/graphite.c index fe748cc561..f815bff89d 100644 --- a/exporting/graphite/graphite.c +++ b/exporting/graphite/graphite.c @@ -29,6 +29,9 @@ int init_graphite_instance(struct instance *instance) instance->end_host_formatting = flush_host_labels; instance->end_batch_formatting = NULL; + instance->send_header = NULL; + instance->check_response = exporting_discard_response; + instance->buffer = (void *)buffer_create(0); if (!instance->buffer) { error("EXPORTING: cannot create buffer for graphite exporting connector instance %s", instance->config.name); diff --git a/exporting/init_connectors.c b/exporting/init_connectors.c index dae5576d3b..0e0cb27a78 100644 --- a/exporting/init_connectors.c +++ b/exporting/init_connectors.c @@ -5,6 +5,7 @@ #include "json/json.h" #include "opentsdb/opentsdb.h" 
#include "aws_kinesis/aws_kinesis.h" +#include "prometheus/remote_write/remote_write.h" /** * Initialize connectors @@ -37,6 +38,12 @@ int init_connectors(struct engine *engine) if (init_opentsdb_http_instance(instance) != 0) return 1; break; + case BACKEND_TYPE_PROMETHEUS_REMOTE_WRITE: +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if (init_prometheus_remote_write_instance(instance) != 0) + return 1; +#endif + break; case BACKEND_TYPE_KINESIS: #if HAVE_KINESIS if (init_aws_kinesis_instance(instance) != 0) diff --git a/exporting/json/json.c b/exporting/json/json.c index 0e09e7e99d..9886b55567 100644 --- a/exporting/json/json.c +++ b/exporting/json/json.c @@ -29,6 +29,9 @@ int init_json_instance(struct instance *instance) instance->end_host_formatting = flush_host_labels; instance->end_batch_formatting = NULL; + instance->send_header = NULL; + instance->check_response = exporting_discard_response; + instance->buffer = (void *)buffer_create(0); if (!instance->buffer) { error("EXPORTING: cannot create buffer for json exporting connector instance %s", instance->config.name); diff --git a/exporting/opentsdb/opentsdb.c b/exporting/opentsdb/opentsdb.c index adae23076b..54f3c3c04d 100644 --- a/exporting/opentsdb/opentsdb.c +++ b/exporting/opentsdb/opentsdb.c @@ -29,6 +29,9 @@ int init_opentsdb_telnet_instance(struct instance *instance) instance->end_host_formatting = flush_host_labels; instance->end_batch_formatting = NULL; + instance->send_header = NULL; + instance->check_response = exporting_discard_response; + instance->buffer = (void *)buffer_create(0); if (!instance->buffer) { error("EXPORTING: cannot create buffer for opentsdb telnet exporting connector instance %s", instance->config.name); @@ -67,6 +70,9 @@ int init_opentsdb_http_instance(struct instance *instance) instance->end_host_formatting = flush_host_labels; instance->end_batch_formatting = NULL; + instance->send_header = NULL; + instance->check_response = exporting_discard_response; + instance->buffer = (void 
*)buffer_create(0); if (!instance->buffer) { error("EXPORTING: cannot create buffer for opentsdb HTTP exporting connector instance %s", instance->config.name); diff --git a/exporting/prometheus/Makefile.am b/exporting/prometheus/Makefile.am new file mode 100644 index 0000000000..334fca81ce --- /dev/null +++ b/exporting/prometheus/Makefile.am @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +SUBDIRS = \ + remote_write \ + $(NULL) + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/exporting/prometheus/README.md b/exporting/prometheus/README.md new file mode 100644 index 0000000000..678bfe6e5a --- /dev/null +++ b/exporting/prometheus/README.md @@ -0,0 +1,452 @@ +# Using Netdata with Prometheus + +> IMPORTANT: the format Netdata sends metrics to Prometheus has changed since Netdata v1.7. The new Prometheus exporting +> connector for Netdata supports a lot more features and is aligned to the development of the rest of the Netdata +> exporting connectors. + +Prometheus is a distributed monitoring system which offers a very simple setup along with a robust data model. Recently +Netdata added support for Prometheus. I'm going to quickly show you how to install both Netdata and Prometheus on the +same server. We can then use Grafana pointed at Prometheus to obtain long term metrics Netdata offers. I'm assuming we +are starting at a fresh Ubuntu shell (whether you'd like to follow along in a VM or a cloud instance is up to you). + +## Installing Netdata and Prometheus + +### Installing Netdata + +There are a number of ways to install Netdata, as described in [Installation](../../packaging/installer/). The suggested way +installs the latest Netdata and keeps it upgraded automatically, using the one-line installation: + +```sh +bash <(curl -Ss https://my-netdata.io/kickstart.sh) +``` + +At this point we should have Netdata listening on port 19999. 
Attempt to take your browser here: + +```sh +http://your.netdata.ip:19999 +``` + +_(replace `your.netdata.ip` with the IP or hostname of the server running Netdata)_ + +### Installing Prometheus + +In order to install Prometheus we are going to introduce our own systemd startup script along with an example of +prometheus.yaml configuration. Prometheus needs to be pointed to your server at a specific target url for it to scrape +Netdata's api. Prometheus is always a pull model meaning Netdata is the passive client within this architecture. +Prometheus always initiates the connection with Netdata. + +#### Download Prometheus + +```sh +cd /tmp && curl -s https://api.github.com/repos/prometheus/prometheus/releases/latest \ +| grep "browser_download_url.*linux-amd64.tar.gz" \ +| cut -d '"' -f 4 \ +| wget -qi - +``` + +#### Create prometheus system user + +```sh +sudo useradd -r prometheus +``` + +#### Create prometheus directory + +```sh +sudo mkdir /opt/prometheus +sudo chown prometheus:prometheus /opt/prometheus +``` + +#### Untar prometheus directory + +```sh +sudo tar -xvf /tmp/prometheus-*linux-amd64.tar.gz -C /opt/prometheus --strip=1 +``` + +#### Install prometheus.yml + +We will use the following `prometheus.yml` file. Save it at `/opt/prometheus/prometheus.yml`. + +Make sure to replace `your.netdata.ip` with the IP or hostname of the host running Netdata. + +```yaml +# my global config +global: + scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute. + evaluation_interval: 5s # Evaluate rules every 5 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'codelab-monitor' + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
+rule_files: + # - "first.rules" + # - "second.rules" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. + - job_name: 'prometheus' + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + static_configs: + - targets: ['0.0.0.0:9090'] + + - job_name: 'netdata-scrape' + + metrics_path: '/api/v1/allmetrics' + params: + # format: prometheus | prometheus_all_hosts + # You can use `prometheus_all_hosts` if you want Prometheus to set the `instance` to your hostname instead of IP + format: [prometheus] + # + # sources: as-collected | raw | average | sum | volume + # default is: average + #source: [as-collected] + # + # server name for this prometheus - the default is the client IP + # for Netdata to uniquely identify it + #server: ['prometheus1'] + honor_labels: true + + static_configs: + - targets: ['{your.netdata.ip}:19999'] +``` + +#### Install nodes.yml + +The following is completely optional, it will enable Prometheus to generate alerts from some NetData sources. Tweak the +values to your own needs. We will use the following `nodes.yml` file below. Save it at `/opt/prometheus/nodes.yml`, and +add a _- "nodes.yml"_ entry under the _rule_files:_ section in the example prometheus.yml file above. + +```yaml +groups: +- name: nodes + + rules: + - alert: node_high_cpu_usage_70 + expr: avg(rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m])) by (job) > 70 + for: 1m + annotations: + description: '{{ $labels.job }} on ''{{ $labels.job }}'' CPU usage is at {{ humanize $value }}%.' 
+ summary: CPU alert for container node '{{ $labels.job }}' + + - alert: node_high_memory_usage_70 + expr: 100 / sum(netdata_system_ram_MB_average) by (job) + * sum(netdata_system_ram_MB_average{dimension=~"free|cached"}) by (job) < 30 + for: 1m + annotations: + description: '{{ $labels.job }} memory usage is {{ humanize $value}}%.' + summary: Memory alert for container node '{{ $labels.job }}' + + - alert: node_low_root_filesystem_space_20 + expr: 100 / sum(netdata_disk_space_GB_average{family="/"}) by (job) + * sum(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}) by (job) < 20 + for: 1m + annotations: + description: '{{ $labels.job }} root filesystem space is {{ humanize $value}}%.' + summary: Root filesystem alert for container node '{{ $labels.job }}' + + - alert: node_root_filesystem_fill_rate_6h + expr: predict_linear(netdata_disk_space_GB_average{family="/",dimension=~"avail|cached"}[1h], 6 * 3600) < 0 + for: 1h + labels: + severity: critical + annotations: + description: Container node {{ $labels.job }} root filesystem is going to fill up in 6h. + summary: Disk fill alert for Swarm node '{{ $labels.job }}' +``` + +#### Install prometheus.service + +Save this service file as `/etc/systemd/system/prometheus.service`: + +```sh +[Unit] +Description=Prometheus Server +AssertPathExists=/opt/prometheus + +[Service] +Type=simple +WorkingDirectory=/opt/prometheus +User=prometheus +Group=prometheus +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --log.level=info +ExecReload=/bin/kill -SIGHUP $MAINPID +ExecStop=/bin/kill -SIGINT $MAINPID + +[Install] +WantedBy=multi-user.target +``` + +##### Start Prometheus + +```sh +sudo systemctl start prometheus +sudo systemctl enable prometheus +``` + +Prometheus should now start and listen on port 9090. Attempt to head there with your browser. + +If everything is working correctly when you fetch `http://your.prometheus.ip:9090` you will see a 'Status' tab. 
Click +this and click on 'targets'. We should see the Netdata host as a scraped target. + +--- + +## Netdata support for Prometheus + +> IMPORTANT: the format Netdata sends metrics to Prometheus has changed since Netdata v1.6. The new format allows easier +> queries for metrics and supports both `as collected` and normalized metrics. + +Before explaining the changes, we have to understand the key differences between Netdata and Prometheus. + +### understanding Netdata metrics + +#### charts + +Each chart in Netdata has several properties (common to all its metrics): + +- `chart_id` - uniquely identifies a chart. + +- `chart_name` - a more human friendly name for `chart_id`, also unique. + +- `context` - this is the template of the chart. All disk I/O charts have the same context, all mysql requests charts + have the same context, etc. This is used for alarm templates to match all the charts they should be attached to. + +- `family` groups a set of charts together. It is used as the submenu of the dashboard. + +- `units` is the units for all the metrics attached to the chart. + +#### dimensions + +Then each Netdata chart contains metrics called `dimensions`. All the dimensions of a chart have the same units of +measurement, and are contextually in the same category (i.e. the metrics for disk bandwidth are `read` and `write` and +they are both in the same chart). + +### Netdata data source + +Netdata can send metrics to Prometheus from 3 data sources: + +- `as collected` or `raw` - this data source sends the metrics to Prometheus as they are collected. No conversion is + done by Netdata. The latest value for each metric is just given to Prometheus. This is the most preferred method by + Prometheus, but it is also the hardest to work with. To work with this data source, you will need to understand how + to get meaningful values out of them. + + The format of the metrics is: `CONTEXT{chart="CHART",family="FAMILY",dimension="DIMENSION"}`. 
+ + If the metric is a counter (`incremental` in Netdata lingo), `_total` is appended to the context. + + Unlike Prometheus, Netdata allows each dimension of a chart to have a different algorithm and conversion constants + (`multiplier` and `divisor`). In this case, where the dimensions of a chart are heterogeneous, Netdata will use this + format: `CONTEXT_DIMENSION{chart="CHART",family="FAMILY"}` + +- `average` - this data source uses the Netdata database to send the metrics to Prometheus as they are presented on + the Netdata dashboard. So, all the metrics are sent as gauges, at the units they are presented in the Netdata + dashboard charts. This is the easiest to work with. + + The format of the metrics is: `CONTEXT_UNITS_average{chart="CHART",family="FAMILY",dimension="DIMENSION"}`. + + When this source is used, Netdata keeps track of the last access time for each Prometheus server fetching the + metrics. This last access time is used at the subsequent queries of the same Prometheus server to identify the + time-frame the `average` will be calculated. + + So, no matter how frequently Prometheus scrapes Netdata, it will get all the database data. + To identify each Prometheus server, Netdata uses by default the IP of the client fetching the metrics. + + If there are multiple Prometheus servers fetching data from the same Netdata, using the same IP, each Prometheus + server can append `server=NAME` to the URL. Netdata will use this `NAME` to uniquely identify the Prometheus server. + +- `sum` or `volume`, is like `average` but instead of averaging the values, it sums them. + + The format of the metrics is: `CONTEXT_UNITS_sum{chart="CHART",family="FAMILY",dimension="DIMENSION"}`. All the + other operations are the same with `average`. + + To change the data source to `sum` or `as-collected` you need to provide the `source` parameter in the request URL. 
+ e.g.: `http://your.netdata.ip:19999/api/v1/allmetrics?format=prometheus&help=yes&source=as-collected` + + Keep in mind that early versions of Netdata were sending the metrics as: `CHART_DIMENSION{}`. + +### Querying Metrics + +Fetch with your web browser this URL: + +`http://your.netdata.ip:19999/api/v1/allmetrics?format=prometheus&help=yes` + +_(replace `your.netdata.ip` with the ip or hostname of your Netdata server)_ + +Netdata will respond with all the metrics it sends to Prometheus. + +If you search that page for `"system.cpu"` you will find all the metrics Netdata is exporting to Prometheus for this +chart. `system.cpu` is the chart name on the Netdata dashboard (on the Netdata dashboard all charts have a text heading +such as : `Total CPU utilization (system.cpu)`. What we are interested here in the chart name: `system.cpu`). + +Searching for `"system.cpu"` reveals: + +```sh +# COMMENT homogeneous chart "system.cpu", context "system.cpu", family "cpu", units "percentage" +# COMMENT netdata_system_cpu_percentage_average: dimension "guest_nice", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="guest_nice"} 0.0000000 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "guest", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="guest"} 1.7837326 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "steal", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="steal"} 0.0000000 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "softirq", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="softirq"} 0.5275442 
1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "irq", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="irq"} 0.2260836 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "user", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="user"} 2.3362762 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "system", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="system"} 1.7961062 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "nice", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="nice"} 0.0000000 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "iowait", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="iowait"} 0.9671802 1500066662000 +# COMMENT netdata_system_cpu_percentage_average: dimension "idle", value is percentage, gauge, dt 1500066653 to 1500066662 inclusive +netdata_system_cpu_percentage_average{chart="system.cpu",family="cpu",dimension="idle"} 92.3630770 1500066662000 +``` + +_(Netdata response for `system.cpu` with source=`average`)_ + +In `average` or `sum` data sources, all values are normalized and are reported to Prometheus as gauges. Now, use the +'expression' text form in Prometheus. Begin to type the metrics we are looking for: `netdata_system_cpu`. You should see +that the text form begins to auto-fill as Prometheus knows about this metric. 
+ +If the data source was `as collected`, the response would be: + +```sh +# COMMENT homogeneous chart "system.cpu", context "system.cpu", family "cpu", units "percentage" +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "guest_nice", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="guest_nice"} 0 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "guest", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="guest"} 63945 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "steal", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="steal"} 0 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "softirq", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="softirq"} 8295 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "irq", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="irq"} 4079 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "user", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="user"} 116488 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "system", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="system"} 35084 1500066716438 +# COMMENT 
netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "nice", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="nice"} 505 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "iowait", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="iowait"} 23314 1500066716438 +# COMMENT netdata_system_cpu_total: chart "system.cpu", context "system.cpu", family "cpu", dimension "idle", value * 1 / 1 delta gives percentage (counter) +netdata_system_cpu_total{chart="system.cpu",family="cpu",dimension="idle"} 918470 1500066716438 +``` + +_(Netdata response for `system.cpu` with source=`as-collected`)_ + +For more information check Prometheus documentation. + +### Streaming data from upstream hosts + +The `format=prometheus` parameter only exports the host's Netdata metrics. If you are using the master/slave +functionality of Netdata this ignores any upstream hosts - so you should consider using the below in your +**prometheus.yml**: + +```yaml + metrics_path: '/api/v1/allmetrics' + params: + format: [prometheus_all_hosts] + honor_labels: true +``` + +This will report all upstream host data, and `honor_labels` will make Prometheus take note of the instance names +provided. + +### Timestamps + +To pass the metrics through Prometheus pushgateway, Netdata supports the option `&timestamps=no` to send the metrics +without timestamps. + +## Netdata host variables + +Netdata collects various system configuration metrics, like the max number of TCP sockets supported, the max number of +files allowed system-wide, various IPC sizes, etc. These metrics are not exposed to Prometheus by default. + +To expose them, append `variables=yes` to the Netdata URL. 
+ +### TYPE and HELP + +To save bandwidth, and because Prometheus does not use them anyway, `# TYPE` and `# HELP` lines are suppressed. If +wanted they can be re-enabled via `types=yes` and `help=yes`, e.g. +`/api/v1/allmetrics?format=prometheus&types=yes&help=yes` + +### Names and IDs + +Netdata supports names and IDs for charts and dimensions. Usually IDs are unique identifiers as read by the system and +names are human friendly labels (also unique). + +Most charts and metrics have the same ID and name, but in several cases they are different: disks with device-mapper, +interrupts, QoS classes, statsd synthetic charts, etc. + +The default is controlled in `netdata.conf`: + +```conf +[backend] + send names instead of ids = yes | no +``` + +You can overwrite it from Prometheus, by appending to the URL: + +- `&names=no` to get IDs (the old behaviour) +- `&names=yes` to get names + +### Filtering metrics sent to Prometheus + +Netdata can filter the metrics it sends to Prometheus with this setting: + +```conf +[backend] + send charts matching = * +``` + +This setting accepts a space separated list of [simple patterns](../../libnetdata/simple_pattern/README.md) to match +the **charts** to be sent to Prometheus. Each pattern can use `*` as wildcard, any number of times (e.g. `*a*b*c*` is +valid). Patterns starting with `!` give a negative match (e.g. `!*.bad users.* groups.*` will send all the users and +groups except `bad` user and `bad` group). The order is important: the first match (positive or negative) left to right, +is used. + +### Changing the prefix of Netdata metrics + +Netdata sends all metrics prefixed with `netdata_`. You can change this in `netdata.conf`, like this: + +```conf +[backend] + prefix = netdata +``` + +It can also be changed from the URL, by appending `&prefix=netdata`. + +### Metric Units + +The default source `average` adds the unit of measurement to the name of each metric (e.g. `_KiB_persec`). 
To hide the +units and get the same metric names as with the other sources, append to the URL `&hideunits=yes`. + +The units were standardized in v1.12, with the effect of changing the metric names. To get the metric names as they were +before v1.12, append to the URL `&oldunits=yes` + +### Accuracy of `average` and `sum` data sources + +When the data source is set to `average` or `sum`, Netdata remembers the last access of each client accessing Prometheus +metrics and uses this last access time to respond with the `average` or `sum` of all the entries in the database since +that. This means that Prometheus servers are not losing data when they access Netdata with data source = `average` or +`sum`. + +To uniquely identify each Prometheus server, Netdata uses the IP of the client accessing the metrics. If however the IP +is not good enough for identifying a single Prometheus server (e.g. when Prometheus servers are accessing Netdata +through a web proxy, or when multiple Prometheus servers are NATed to a single IP), each Prometheus may append +`&server=NAME` to the URL. This `NAME` is used by Netdata to uniquely identify each Prometheus server and keep track of +its last access time. + +[![analytics](https://ww |