summaryrefslogtreecommitdiffstats
path: root/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml
blob: 546f9c6c854e9870c60af8d96cce9fb786431b8f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
plugin_name: go.d.plugin
modules:
  - meta:
      id: collector-go.d.plugin-nvme
      plugin_name: go.d.plugin
      module_name: nvme
      monitored_instance:
        name: NVMe devices
        link: ""
        icon_filename: nvme.svg
        categories:
          - data-collection.storage-mount-points-and-filesystems
      keywords:
        - nvme
      related_resources:
        integrations:
          list: []
      info_provided_to_referring_integrations:
        description: ""
      most_popular: false
    overview:
      data_collection:
        metrics_description: >
          This collector monitors the health of NVMe devices.
          It relies on the [`nvme`](https://github.com/linux-nvme/nvme-cli#nvme-cli) CLI tool but avoids directly executing the binary.
          Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.
          This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.
        method_description: ""
      supported_platforms:
        include: []
        exclude: []
      multi_instance: true
      additional_permissions:
        description: ""
      default_behavior:
        auto_detection:
          description: ""
        limits:
          description: ""
        performance_impact:
          description: ""
    setup:
      prerequisites:
        list:
          - title: Install nvme-cli
            description: |
              See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager.
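          # NOTE(review): `metrics_description` states that nvme is run through
          # ndsudo with no need for sudo; confirm this sudoers step is still required.
          - title: Allow netdata to execute nvme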
            description: |
              Add the netdata user to `/etc/sudoers` (use `which nvme` to find the full path to the binary):

              ```bash
              netdata ALL=(root) NOPASSWD: /usr/sbin/nvme
              ```
      configuration:
        file:
          name: go.d/nvme.conf
        options:
          description: |
            The following options can be defined globally: update_every, autodetection_retry.
          folding:
            title: Config options
            enabled: true
          list:
            - name: update_every
              description: Data collection frequency.
              default_value: 10
              required: false
            - name: autodetection_retry
              description: Recheck interval in seconds. Zero means no recheck will be scheduled.
              default_value: 0
              required: false
            - name: timeout
              description: nvme binary execution timeout.
              default_value: 2
              required: false
        examples:
          folding:
            title: Config
            enabled: true
          list:
            - name: Custom update_every
              description: Allows you to override the default data collection interval.
              config: |
                jobs:
                  - name: nvme
                    update_every: 5  # Collect NVMe metrics every 5 seconds
    troubleshooting:
      problems:
        list: []
    alerts:
      - name: nvme_device_critical_warnings_state
        metric: nvme.device_critical_warnings_state
        info: "NVMe device ${label:device} has critical warnings"
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf
    metrics:
      folding:
        title: Metrics
        enabled: false
      description: ""
      availability: []
      scopes:
        - name: device
          description: These metrics refer to the NVMe device.
          labels:
            - name: device
              description: NVMe device name
          metrics:
            - name: nvme.device_estimated_endurance_perc
              description: Estimated endurance
              unit: '%'
              chart_type: line
              dimensions:
                - name: used
            - name: nvme.device_available_spare_perc
              description: Remaining spare capacity
              unit: '%'
              chart_type: line
              dimensions:
                - name: spare
            - name: nvme.device_composite_temperature
              description: Composite temperature
              unit: celsius
              chart_type: line
              dimensions:
                - name: temperature
            - name: nvme.device_io_transferred_count
              description: Amount of data transferred to and from device
              unit: bytes
              chart_type: area
              dimensions:
                - name: read
                - name: written
            - name: nvme.device_power_cycles_count
              description: Power cycles
              unit: cycles
              chart_type: line
              dimensions:
                - name: power
            - name: nvme.device_power_on_time
              description: Power-on time
              unit: seconds
              chart_type: line
              dimensions:
                - name: power-on
            - name: nvme.device_critical_warnings_state
              description: Critical warnings state
              unit: state
              chart_type: line
              dimensions:
                - name: available_spare
                - name: temp_threshold
                - name: nvm_subsystem_reliability
                - name: read_only
                - name: volatile_mem_backup_failed
                - name: persistent_memory_read_only
            - name: nvme.device_unsafe_shutdowns_count
              description: Unsafe shutdowns
              unit: shutdowns
              chart_type: line
              dimensions:
                - name: unsafe
            - name: nvme.device_media_errors_rate
              description: Media and data integrity errors
              unit: errors/s
              chart_type: line
              dimensions:
                - name: media
            - name: nvme.device_error_log_entries_rate
              description: Error log entries
              unit: entries/s
              chart_type: line
              dimensions:
                - name: error_log
            - name: nvme.device_warning_composite_temperature_time
              description: Warning composite temperature time
              unit: seconds
              chart_type: line
              dimensions:
                - name: wctemp
            - name: nvme.device_critical_composite_temperature_time
              description: Critical composite temperature time
              unit: seconds
              chart_type: line
              dimensions:
                - name: cctemp
            - name: nvme.device_thermal_mgmt_temp1_transitions_rate
              description: Thermal management temp1 transitions
              unit: transitions/s
              chart_type: line
              dimensions:
                - name: temp1
            - name: nvme.device_thermal_mgmt_temp2_transitions_rate
              description: Thermal management temp2 transitions
              unit: transitions/s
              chart_type: line
              dimensions:
                - name: temp2
            - name: nvme.device_thermal_mgmt_temp1_time
              description: Thermal management temp1 time
              unit: seconds
              chart_type: line
              dimensions:
                - name: temp1
            - name: nvme.device_thermal_mgmt_temp2_time
              description: Thermal management temp2 time
              unit: seconds
              chart_type: line
              dimensions:
                - name: temp2