proc integrations (#15494)

Co-authored-by: ilyam8 <ilya@netdata.cloud>
author: Costa Tsaousis <costa@netdata.cloud> 2023-07-26 01:06:57 +0300
committer: GitHub <noreply@github.com> 2023-07-26 01:06:57 +0300
commit: 065091c3f5d908c5e06ed5ac817999f7dc965b90 (patch)
tree: 70e29c0d74eaf33f69e9e393732d3d051fd3da5d /collectors/proc.plugin
parent: accc426c8ab08a75c9e6470a4a89482fb4c4eb17 (diff)
6 files changed, 894 insertions, 192 deletions
diff --git a/collectors/proc.plugin/multi_metadata.yaml b/collectors/proc.plugin/multi_metadata.yaml
index 04e66ddfe6..bdbce89f98 100644
--- a/collectors/proc.plugin/multi_metadata.yaml
+++ b/collectors/proc.plugin/multi_metadata.yaml
@@ -20,27 +20,50 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides a collection of statistics about the system such as CPU utilization, process counts and more."
+        metrics_description: |
+          CPU utilization, states and frequencies and key Linux system performance metrics.
+          
+          The `/proc/stat` file provides various types of system statistics:
+          
+          - The overall system CPU usage statistics
+          - Per CPU core statistics
+          - The total context switching of the system
+          - The total number of processes running
+          - The total CPU interrupts
+          - The total CPU softirqs
+          
+          The collector also reads:
+          
+          - `/proc/schedstat` for statistics about the process scheduler in the Linux kernel.
+          - `/sys/devices/system/cpu/[X]/thermal_throttle/core_throttle_count` to get the count of thermal throttling events for a specific CPU core on Linux systems.
+          - `/sys/devices/system/cpu/[X]/thermal_throttle/package_throttle_count` to get the count of thermal throttling events for a specific CPU package on a Linux system.
+          - `/sys/devices/system/cpu/[X]/cpufreq/scaling_cur_freq` to get the current operating frequency of a specific CPU core.
+          - `/sys/devices/system/cpu/[X]/cpufreq/stats/time_in_state` to get the amount of time the CPU has spent in each of its available frequency states.
+          - `/sys/devices/system/cpu/[X]/cpuidle/state[X]/name` to get the names of the idle states for each CPU core in a Linux system.
+          - `/sys/devices/system/cpu/[X]/cpuidle/state[X]/time` to get the total time each specific CPU core has spent in each idle state since the system was started.
         method_description: ""
       supported_platforms:
-        include: []
+        include: [ "linux" ]
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
         auto_detection:
-          description: ""
+          description: |
+            The collector auto-detects all metrics. No configuration is needed.
         limits:
           description: ""
         performance_impact:
-          description: ""
+          description: |
+            The collector disables CPU frequency and idle state monitoring when there are more than 128 CPU cores available.
     setup:
       prerequisites:
         list: []
       configuration:
         file:
-          name: ""
+          section_name: "plugin:proc:/proc/stat"
+          name: "netdata.conf"
           description: ""
         options:
           description: ""
@@ -187,12 +210,43 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: ""
+        metrics_description: |
+          Entropy, a measure of the randomness or unpredictability of data.
+          
+          In the context of cryptography, entropy is used to generate random numbers or keys that are essential for
+          secure communication and encryption. Without a good source of entropy, cryptographic protocols can become
+          vulnerable to attacks that exploit the predictability of the generated keys.
+          
+          In most operating systems, entropy is generated by collecting random events from various sources, such as
+          hardware interrupts, mouse movements, keyboard presses, and disk activity. These events are fed into a pool
+          of entropy, which is then used to generate random numbers when needed.
+          
+          The `/dev/random` device in Linux is one such source of entropy, and it provides an interface for programs
+          to access the pool of entropy. When a program requests random numbers, it reads from the `/dev/random` device,
+          which blocks until enough entropy is available to generate the requested numbers. This ensures that the
+          generated numbers are truly random and not predictable. 
+          
+          However, if the pool of entropy gets depleted, the `/dev/random` device may block indefinitely, causing
+          programs that rely on random numbers to slow down or even freeze. This is especially problematic for
+          cryptographic protocols that require a continuous stream of random numbers, such as SSL/TLS and SSH.
+          
+          To avoid this issue, some systems use a hardware random number generator (RNG) to generate high-quality
+          entropy. A hardware RNG generates random numbers by measuring physical phenomena, such as thermal noise or
+          radioactive decay. These sources of randomness are considered to be more reliable and unpredictable than
+          software-based sources.
+          
+          One such hardware RNG is the Trusted Platform Module (TPM), which is a dedicated hardware chip that is used
+          for cryptographic operations and secure boot. The TPM contains a built-in hardware RNG that generates
+          high-quality entropy, which can be used to seed the pool of entropy in the operating system.
+          
+          Alternatively, software-based solutions such as `Haveged` can be used to generate additional entropy by
+          exploiting sources of randomness in the system, such as CPU utilization and network traffic. These solutions
+          can help to mitigate the risk of entropy depletion, but they may not be as reliable as hardware-based solutions.
         method_description: ""
       supported_platforms:
-        include: []
+        include: [ "linux" ]
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -264,12 +318,23 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides the amount of time the system has been up (running)."
+        metrics_description: |
+          The amount of time the system has been up (running).
+          
+          Uptime is a critical aspect of overall system performance:
+          
+          - **Availability**: Uptime monitoring can show whether a server is consistently available or experiences frequent downtimes.
+          - **Performance Monitoring**: While server uptime alone doesn't provide detailed performance data, analyzing the duration and frequency of downtimes can help identify patterns or trends.
+          - **Proactive problem detection**: If server uptime monitoring reveals unexpected downtimes or a decreasing uptime trend, it can serve as an early warning sign of potential problems.
+          - **Root cause analysis**: When investigating server downtime, the uptime metric alone may not provide enough information to pinpoint the exact cause.
+          - **Load balancing**: Uptime data can indirectly indicate load balancing issues if certain servers have significantly lower uptimes than others.
+          - **Optimize maintenance efforts**: Servers with consistently low uptimes or frequent downtimes may require more attention.
+          - **Compliance requirements**: Server uptime data can be used to demonstrate compliance with regulatory requirements or SLAs that mandate a minimum level of server availability.
         method_description: ""
       supported_platforms:
-        include: []
+        include: [ "linux" ]
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -340,12 +405,33 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides information about process, memory, swap space utilization and many more."
+        metrics_description: |
+          Linux Virtual memory subsystem.
+          
+          Information about memory management, indicating how effectively the kernel allocates and frees
+          memory resources in response to system demands.
+          
+          Monitors page faults, which occur when a process requests a portion of its memory that isn't
+          immediately available. Monitoring these events can help diagnose inefficiencies in memory management and
+          provide insights into application behavior.
+          
+          Tracks swapping activity — a vital aspect of memory management where the kernel moves data from RAM to
+          swap space, and vice versa, based on memory demand and usage. It also monitors the utilization of zswap,
+          a compressed cache for swap pages, and provides insights into its usage and performance implications.
+          
+          In the context of virtualized environments, it tracks the ballooning mechanism which is used to balance
+          memory resources between host and guest systems.
+          
+          For systems using NUMA architecture, it provides insights into the local and remote memory accesses, which
+          can impact the performance based on the memory access times.
+
+          The collector also watches for 'Out of Memory' kills, a drastic measure taken by the system when it runs out
+          of memory resources.
         method_description: ""
       supported_platforms:
-        include: []
+        include: [ "linux" ]
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -535,7 +621,37 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Monitor Interrupts metrics for efficient processor interrupt handling."
+        metrics_description: |
+          Monitors `/proc/interrupts`, a file organized by CPU and then by the type of interrupt.
+          The numbers reported are the counts of the interrupts that have occurred of each type.
+
+          An interrupt is a signal to the processor emitted by hardware or software indicating an event that needs
+          immediate attention. The processor then interrupts its current activities and executes the interrupt handler
+          to deal with the event. This is part of the way a computer multitasks and handles concurrent processing.
+          
+          The types of interrupts include:
+
+          - **I/O interrupts**: These are caused by I/O devices like the keyboard, mouse, printer, etc. For example, when
+            you type something on the keyboard, an interrupt is triggered so the processor can handle the new input.
+
+          - **Timer interrupts**: These are generated at regular intervals by the system's timer circuit. They are
+            primarily used to switch the CPU among different tasks.
+
+          - **Software interrupts**: These are generated by a program requiring disk I/O operations, or other system resources.
+
+          - **Hardware interrupts**: These are caused by hardware conditions such as power failure, overheating, etc.
+          
+          Monitoring `/proc/interrupts` can be used for:
+
+          - **Performance tuning**: If an interrupt is happening very frequently, it could be a sign that a device is not
+            configured correctly, or there is a software bug causing unnecessary interrupts. This could lead to system
+            performance degradation.
+
+          - **System troubleshooting**: If you're seeing a lot of unexpected interrupts, it could be a sign of a hardware problem.
+
+          - **Understanding system behavior**: More generally, keeping an eye on what interrupts are occurring can help you
+            understand what your system is doing. It can provide insights into the system's interaction with hardware,
+            drivers, and other parts of the kernel.
         method_description: ""
       supported_platforms:
         include: []
@@ -621,12 +737,35 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides the system load average for the last 1, 5, and 15 minutes."
+        metrics_description: |
+          The `/proc/loadavg` file provides information about the system load average.
+          
+          The load average is a measure of the amount of computational work that a system performs. It is a
+          representation of the average system load over a period of time.
+
+          This file contains three numbers representing the system load averages for the last 1, 5, and 15 minutes,
+          respectively. It also includes the number of currently running processes and the total number of processes.
+          
+          Monitoring the load average can be used for:
+
+          - **System performance**: If the load average is too high, it may indicate that your system is overloaded.
+            On a system with a single CPU, if the load average is 1, it means the single CPU is fully utilized. If the
+            load averages are consistently higher than the number of CPUs/cores, it may indicate that your system is
+            overloaded and tasks are waiting for CPU time.
+
+          - **Troubleshooting**: If the load average is unexpectedly high, it can be a sign of a problem. This could be
+            due to a runaway process, a software bug, or a hardware issue.
+
+          - **Capacity planning**: By monitoring the load average over time, you can understand the trends in your
+            system's workload. This can help with capacity planning and scaling decisions.
+
+          Remember that load average not only considers CPU usage, but also includes processes waiting for disk I/O.
+          Therefore, high load averages could be due to I/O contention as well as CPU contention.
         method_description: ""
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -726,12 +865,35 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration measures system pressure, which can indicate resource shortages in CPU, memory, or I/O."
+        metrics_description: |
+          Introduced in Linux kernel 4.20, `/proc/pressure` exposes Pressure Stall Information (PSI).
+          PSI is a feature that allows the system to track the amount of time tasks are stalled due to
+          resource contention, such as CPU, memory, or I/O.
+
+          The collector monitors 4 separate files for CPU, memory, I/O, and IRQ:
+
+          - **cpu**: Tracks the amount of time tasks are stalled due to CPU contention.
+          - **memory**: Tracks the amount of time tasks are stalled due to memory contention.
+          - **io**: Tracks the amount of time tasks are stalled due to I/O contention.
+          - **irq**: Tracks the amount of time tasks are stalled due to IRQ contention.
+          
+          Each of them provides metrics for stall time over the last 10 seconds, 1 minute, and 5 minutes.
+          
+          Monitoring the /proc/pressure files can provide important insights into system performance and capacity planning:
+          
+          - **Identifying resource contention**: If these metrics are consistently high, it indicates that tasks are
+            frequently being stalled due to lack of resources, which can significantly degrade system performance.
+          
+          - **Troubleshooting performance issues**: If a system is experiencing performance issues, these metrics can
+            help identify whether resource contention is the cause.
+
+          - **Capacity planning**: By monitoring these metrics over time, you can understand trends in resource
+            utilization and make informed decisions about when to add more resources to your system.
         method_description: ""
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -878,7 +1040,29 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Monitor SoftIRQs metrics for efficient software interrupt operations."
+        metrics_description: |
+          In the Linux kernel, handling of hardware interrupts is split into two halves: the top half and the bottom half.
+          The top half is the routine that responds immediately to an interrupt, while the bottom half is deferred to be processed later.
+          
+          Softirqs are a mechanism in the Linux kernel used to handle the bottom halves of interrupts, which can be
+          deferred and processed later in a context where it's safe to enable interrupts.
+          
+          The actual work of handling the interrupt is offloaded to a softirq and executed later when the system
+          decides it's a good time to process them. This helps to keep the system responsive by not blocking the top
+          half for too long, which could lead to missed interrupts.
+          
+          Monitoring `/proc/softirqs` is useful for:
+
+          - **Performance tuning**: A high rate of softirqs could indicate a performance issue. For instance, a high
+            rate of network softirqs (`NET_RX` and `NET_TX`) could indicate a network performance issue.
+
+          - **Troubleshooting**: If a system is behaving unexpectedly, checking the softirqs could provide clues about
+            what is going on. For example, a sudden increase in block device softirqs (BLOCK) might indicate a problem
+            with a disk.
+
+          - **Understanding system behavior**: Knowing what types of softirqs are happening can help you understand what
+            your system is doing, particularly in terms of how it's interacting with hardware and how it's handling
+            interrupts.
         method_description: ""
       supported_platforms:
         include: []
@@ -963,7 +1147,29 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides statistics on Softnet, such as processed events, dropped events and more."
+        metrics_description: |
+          `/proc/net/softnet_stat` provides statistics that relate to the handling of network packets by softirq.
+          
+          It provides information about:
+  
+          - Total number of processed packets (`processed`).
+          - Number of dropped packets (`dropped`) and times ksoftirq ran out of quota (`squeezed`).
+          - Times net_rx_action was rescheduled.
+          - Number of times processed all lists before quota.
+          - Number of times did not process all lists due to quota.
+          - Number of times net_rx_action was rescheduled for GRO (Generic Receive Offload) cells.
+          - Number of times GRO cells were processed.
+          
+          Monitoring the /proc/net/softnet_stat file can be useful for:
+          
+          - **Network performance monitoring**: By tracking the total number of processed packets and how many packets
+            were dropped, you can gain insights into your system's network performance.
+          
+          - **Troubleshooting**: If you're experiencing network-related issues, this collector can provide valuable clues.
+            For instance, a high number of dropped packets may indicate a network problem.
+          
+          - **Capacity planning**: If your system is consistently processing near its maximum capacity of network
+            packets, it might be time to consider upgrading your network infrastructure.
         method_description: ""
       supported_platforms:
         include: []
@@ -1069,12 +1275,28 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Monitor memory usage metrics for efficient system memory management."
+        metrics_description: |
+          `/proc/meminfo` provides detailed information about the system's current memory usage. It includes information
+          about different types of memory: RAM, Swap, ZSwap, HugePages, Transparent HugePages (THP), Kernel memory,
+          SLAB memory, memory mappings, and more.
+          
+          Monitoring /proc/meminfo can be useful for:
+
+          - **Performance Tuning**: Understanding your system's memory usage can help you make decisions about system
+            tuning and optimization. For example, if your system is frequently low on free memory, it might benefit
+            from more RAM.
+          
+          - **Troubleshooting**: If your system is experiencing problems, `/proc/meminfo` can provide clues about
+            whether memory usage is a factor. For example, if your system is slow and cached swap is high, it could
+            mean that your system is swapping out a lot of memory to disk, which can degrade performance.
+          
+          - **Capacity Planning**: By monitoring memory usage over time, you can understand trends and make informed
+            decisions about future capacity needs.
         method_description: ""
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -1152,13 +1374,26 @@ modules:
               chart_type: area
               dimensions:
                 - name: avail
-            - name: system.swap
+            - name: mem.swap
               description: System Swap
               unit: "MiB"
               chart_type: stacked
               dimensions:
                 - name: free
                 - name: used
+            - name: mem.swap_cached
+              description: Swap Memory Cached in RAM
+              unit: "MiB"
+              chart_type: stacked
+              dimensions:
+                - name: cached
+            - name: mem.zswap
+              description: Zswap Usage
+              unit: "MiB"
+              chart_type: stacked
+              dimensions:
+                - name: in-ram
+                - name: on-disk
             - name: mem.hwcorrupt
               description: Corrupted Memory detected by ECC
               unit: "MiB"
@@ -1198,7 +1433,7 @@ modules:
               dimensions:
                 - name: reclaimable
                 - name: unreclaimable
-            - name: mem.hugepage
+            - name: mem.hugepages
               description: Dedicated HugePages Memory
               unit: "MiB"
               chart_type: stacked
@@ -1207,13 +1442,59 @@ modules:
                 - name: used
                 - name: surplus
                 - name: reserved
-            - name: mem.transparent_hugepages
+            - name: mem.thp
               description: Transparent HugePages Memory
               unit: "MiB"
               chart_type: stacked
               dimensions:
                 - name: anonymous
                 - name: shmem
+            - name: mem.thp_details
+              description: Details of Transparent HugePages Usage
+              unit: "MiB"
+              chart_type: line
+              dimensions:
+                - name: ShmemPmdMapped
+                - name: FileHugePages
+                - name: FilePmdMapped
+            - name: mem.reclaiming
+              description: Memory Reclaiming
+              unit: "MiB"
+              chart_type: line
+              dimensions:
+                - name: Active
+                - name: Inactive
+                - name: Active(anon)
+                - name: Inactive(anon)
+                - name: Active(file)
+                - name: Inactive(file)
+                - name: Unevictable
+                - name: Mlocked
+            - name: mem.high_low
+              description: High and Low Used and Free Memory Areas
+              unit: "MiB"
+              chart_type: stacked
+              dimensions:
+                - name: high_used
+                - name: low_used
+                - name: high_free
+                - name: low_free
+            - name: mem.cma
+              description: Contiguous Memory Allocator (CMA) Memory
+              unit: "MiB"
+              chart_type: stacked
+              dimensions:
+                - name: used
+                - name: free
+            - name: mem.directmaps
+              description: Direct Memory Mappings
+              unit: "MiB"
+              chart_type: stacked
+              dimensions:
+                - name: 4k
+                - name: 2m
+                - name: 4m
+                - name: 1g
   - meta:
       plugin_name: proc.plugin
       module_name: /proc/pagetypeinfo
@@ -1238,7 +1519,7 @@ modules:
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -1326,7 +1607,20 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration monitors system memory errors detected and corrected by ECC RAM."
+        metrics_description: |
+          The Error Detection and Correction (EDAC) subsystem detects and reports errors in the system's memory,
+          primarily ECC (Error-Correcting Code) memory errors.
+          
+          The collector provides data for:
+          
+          - Per memory controller (MC): correctable and uncorrectable errors. These can be of 2 kinds:
+            - errors related to a DIMM
+            - errors that cannot be associated with a DIMM
+          
+          - Per memory DIMM: correctable and uncorrectable errors. There are 2 kinds:
+            - memory controllers that can identify the physical DIMMs and report errors directly for them,
+            - memory controllers that report errors for memory address ranges that can be linked to DIMMs.
+              In this case the DIMMs reported may be more than the physical DIMMs installed.
         method_description: ""
       supported_platforms:
         include: []
@@ -1423,7 +1717,7 @@ modules:
             - name: dimm_location
               description: Location of the memory module.
             - name: dimm_mem_type
-              description: Type of the memory module. Usually either buffered or unbuffered memory.
+              description: Type of the memory module.
             - name: size
               description: The amount of memory in megabytes that this memory module manages.
           metrics:
@@ -1453,7 +1747,19 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Monitor NUMA metrics for efficient non-uniform memory access operations."
+        metrics_description: |
+          Information about NUMA (Non-Uniform Memory Access) nodes on the system.
+          
+          NUMA is a method of configuring a cluster of microprocessors in a multiprocessing system so that they can
+          share memory locally, improving performance and the ability of the system to be expanded. NUMA is used in a
+          symmetric multiprocessing (SMP) system.
+
+          In a NUMA system, processors, memory, and I/O devices are grouped together into cells, also known as nodes.
+          Each node has its own memory and set of I/O devices, and one or more processors. While a processor can access
+          memory in any of the nodes, it does so faster when accessing memory within its own node.
+
+          The collector provides statistics on memory allocations for processes running on the NUMA nodes, revealing the
+          efficiency of memory allocations in multi-node systems.
         method_description: ""
       supported_platforms:
         include: []
@@ -1530,15 +1836,23 @@ modules:
         description: ""
       keywords:
         - ksm
+        - samepage
+        - merging
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Examine KSM metrics for insights into memory deduplication operations."
+        metrics_description: |
+          Kernel Samepage Merging (KSM) is a memory-saving feature in Linux that enables the kernel to examine the
+          memory of different processes and identify identical pages. It then merges these identical pages into a
+          single page that the processes share. This is particularly useful for virtualization, where multiple virtual
+          machines might be running the same operating system or applications and have many identical pages.
+          
+          The collector provides information about the operation and effectiveness of KSM on your system.
         method_description: ""
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -1622,7 +1936,11 @@ modules:
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration monitors ZRAM usage, compression ratios, and more."
+        metrics_description: |
+          zRAM, or compressed RAM, is a Linux kernel module that uses a portion of your system's RAM as a block device.
+          The data written to this block device is compressed and stored in memory.
+          
+          The collector provides information about the operation and the effectiveness of zRAM on your system.
         method_description: ""
       supported_platforms:
         include: []
@@ -1715,15 +2033,30 @@ modules:
       keywords:
         - ipc
         - semaphores
+        - shared memory
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "Monitor Inter Process Communication performance for optimal process interaction."
+        metrics_description: |
+          IPC stands for Inter-Process Communication. It is a mechanism which allows processes to communicate with each
+          other and synchronize their actions.
+          
+          This collector exposes information about:
+
+          - Message Queues: This allows messages to be exchanged between processes. It's a more flexible method that
+            allows messages to be placed onto a queue and read at a later time.
+
+          - Shared Memory: This method allows for the fastest form of IPC because processes can exchange data by
+            reading/writing into shared memory segments.
+
+          - Semaphores: They are used to synchronize the operations performed by independent processes. So, if multiple
+            processes are trying to access a single shared resource, semaphores can ensure that only one process
+            accesses the resource at a given time.
         method_description: ""
       supported_platforms:
         include: []
         exclude: []
-      multi_instance: true
+      multi_instance: false
       additional_permissions:
         description: ""
       default_behavior:
@@ -1828,13 +2161,18 @@ modules:
         description: ""
       keywords:
         - disk
+        - disks
         - io
         - bcache
         - block devices
       most_popular: false
     overview:
       data_collection:
-        metrics_description: "This integration provides statistics about disk and Bcache I/O."
+        metrics_description: |
+          Detailed statistics for each of your system's disk devices and partitions.
+          The data is reported by the kernel and can be used to monitor disk activity on a Linux system.
+          
+          Get valuable insight into how your disks are performing and where potential bottlenecks might be.
         method_description: ""
       supported_platforms:
         include: []
diff --git a/collectors/proc.plugin/proc_meminfo.c b/collectors/proc.plugin/proc_meminfo.c
index 6988c70e01..cd1ba872c7 100644
--- a/collectors/proc.plugin/proc_meminfo.c
+++ b/collectors/proc.plugin/proc_meminfo.c
@@ -9,58 +9,92 @@ int do_proc_meminfo(int update_every, usec_t dt) {
     (void)dt;
 
     static procfile *ff = NULL;
-    static int do_ram = -1, do_swap = -1, do_hwcorrupt = -1, do_committed = -1, do_writeback = -1, do_kernel = -1, do_slab = -1, do_hugepages = -1, do_transparent_hugepages = -1;
-    static int do_percpu = 0;
+    static int do_ram = -1
+            , do_swap = -1
+            , do_hwcorrupt = -1
+            , do_committed = -1
+            , do_writeback = -1
+            , do_kernel = -1
+            , do_slab = -1
+            , do_hugepages = -1
+            , do_transparent_hugepages = -1
+            , do_reclaiming = -1
+            , do_high_low = -1
+            , do_cma = -1
+            , do_directmap = -1;
 
     static ARL_BASE *arl_base = NULL;
-    static ARL_ENTRY *arl_hwcorrupted = NULL, *arl_memavailable = NULL;
+    static ARL_ENTRY *arl_hwcorrupted = NULL, *arl_memavailable = NULL, *arl_hugepages_total = NULL,
+        *arl_zswapped = NULL, *arl_high_low = NULL, *arl_cma_total = NULL,
+        *arl_directmap4k = NULL, *arl_directmap2m = NULL, *arl_directmap4m = NULL, *arl_directmap1g = NULL;
 
     static unsigned long long
-            MemTotal = 0,
-            MemFree = 0,
-            MemAvailable = 0,
-            Buffers = 0,
-            Cached = 0,
-            //SwapCached = 0,
-            //Active = 0,
-            //Inactive = 0,
-            //ActiveAnon = 0,
-            //InactiveAnon = 0,
-            //ActiveFile = 0,
-            //InactiveFile = 0,
-            //Unevictable = 0,
-            //Mlocked = 0,
-            SwapTotal = 0,
-            SwapFree = 0,
-            Dirty = 0,
-            Writeback = 0,
-            //AnonPages = 0,
-            //Mapped = 0,
-            Shmem = 0,
-            Slab = 0,
-            SReclaimable = 0,
-            SUnreclaim = 0,
-            KernelStack = 0,
-            PageTables = 0,
-            NFS_Unstable = 0,
-            Bounce = 0,
-            WritebackTmp = 0,
-            //CommitLimit = 0,
-            Committed_AS = 0,
-            //VmallocTotal = 0,
-            VmallocUsed = 0,
-            //VmallocChunk = 0,
-            Percpu = 0,
-            AnonHugePages = 0,
-            ShmemHugePages = 0,
-            HugePages_Total = 0,
-            HugePages_Free = 0,
-            HugePages_Rsvd = 0,
-            HugePages_Surp = 0,
-            Hugepagesize = 0,
-            //DirectMap4k = 0,
-            //DirectMap2M = 0,
-            HardwareCorrupted = 0;
+              MemTotal = 0
+            , MemFree = 0
+            , MemAvailable = 0
+            , Buffers = 0
+            , Cached = 0
+            , SwapCached = 0
+            , Active = 0
+            , Inactive = 0
+            , ActiveAnon = 0
+            , InactiveAnon = 0
+            , ActiveFile = 0
+            , InactiveFile = 0
+            , Unevictable = 0
+            , Mlocked = 0
+            , HighTotal = 0
+            , HighFree  = 0
+            , LowTotal = 0
+            , LowFree = 0
+            , MmapCopy = 0
+            , SwapTotal = 0
+            , SwapFree = 0
+            , Zswap = 0
+            , Zswapped = 0
+            , Dirty = 0
+            , Writeback = 0
+            , AnonPages = 0
+            , Mapped = 0
+            , Shmem = 0
+            , KReclaimable = 0
+            , Slab = 0
+            , SReclaimable = 0
+            , SUnreclaim = 0
+            , KernelStack = 0
+            , ShadowCallStack = 0
+            , PageTables = 0
+            , SecPageTables = 0
+            , NFS_Unstable = 0
+            , Bounce = 0
+            , WritebackTmp = 0
+            , CommitLimit = 0
+            , Committed_AS = 0
+            , VmallocTotal = 0
+            , VmallocUsed = 0
+            , VmallocChunk = 0
+            , Percpu = 0
+            //, EarlyMemtestBad = 0
+            , HardwareCorrupted = 0
+            , AnonHugePages = 0
+            , ShmemHugePages = 0
+            , ShmemPmdMapped = 0
+            , FileHugePages = 0
+            , FilePmdMapped = 0
+            , CmaTotal = 0
author	Costa Tsaousis <costa@netdata.cloud>	2023-07-26 01:06:57 +0300
committer	GitHub <noreply@github.com>	2023-07-26 01:06:57 +0300
commit	065091c3f5d908c5e06ed5ac817999f7dc965b90 (patch)
tree	70e29c0d74eaf33f69e9e393732d3d051fd3da5d /collectors/proc.plugin
parent	accc426c8ab08a75c9e6470a4a89482fb4c4eb17 (diff)