author    Guido <guido.scatena@unipi.it>  2020-08-04 18:09:06 +0200
committer GitHub <noreply@github.com>     2020-08-04 19:09:06 +0300
commit    211137df801d1b3e4d4d484a10c4ec42d8640040 (patch)
tree      79ecb281e5d6a9b616f3dea939ab8717d38b7a4c /collectors
parent    101011f2f0186774af11f0a708e7e91d43f7ded7 (diff)
nvidia_smi: charts for memory used by each user and number of distinct users (#9372)
Co-authored-by: ilyam8 <ilya@netdata.cloud>
Diffstat (limited to 'collectors')
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/README.md            |  37
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py  | 115
2 files changed, 123 insertions(+), 29 deletions(-)
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index 9da51ea6a8..9d11e4a363 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -9,36 +9,33 @@ sidebar_label: "Nvidia GPUs"
Monitors performance metrics (memory usage, fan speed, PCIe bandwidth utilization, temperature, etc.) using the `nvidia-smi` CLI tool.
-**Requirements and Notes:**
+## Requirements and Notes
- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support it. This mostly means the newer high-end models used for AI/ML and crypto, or the Pro range. Read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
-
- You must enable this plugin, as it is disabled by default due to minor performance issues.
-
- On some systems, when the GPU is idle the `nvidia-smi` tool unloads, so there is added latency when it is next queried. If your GPUs run under a constant workload, this is unlikely to be an issue.
-
- Currently the `nvidia-smi` tool is queried via the CLI. Updating the plugin to use the NVIDIA C/C++ API directly should resolve this latency issue. See the discussion here: <https://github.com/netdata/netdata/pull/4357>
-
- Contributions are welcome.
-
- Make sure the `netdata` user can execute `/usr/bin/nvidia-smi`, or wherever your binary is installed.
-
- If the `nvidia-smi` process [is not killed after a netdata restart](https://github.com/netdata/netdata/issues/7143), you need to turn off `loop_mode`.
-
- `poll_seconds` is how often the tool is polled, in seconds, as an integer.
-It produces:
-
-1. Per GPU
-
- - GPU utilization
- - memory allocation
- - memory utilization
- - fan speed
- - power usage
- - temperature
- - clock speed
- - PCI bandwidth
+## Charts
+
+It produces the following charts:
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of Users on GPU in `num`
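
As noted in the requirements above, the collector gathers all of these metrics by shelling out to `nvidia-smi` and parsing its XML output. A minimal standalone sketch of that approach follows; the exact XML element names are assumptions based on typical `nvidia-smi -x -q` output and may vary between driver versions:

```python
# Minimal sketch: query nvidia-smi for XML output and pull a few per-GPU values.
import subprocess
import xml.etree.ElementTree as et


def query_gpus():
    # '-x -q' asks nvidia-smi for the full query report in XML format
    xml_data = subprocess.check_output(['nvidia-smi', '-x', '-q'])
    root = et.fromstring(xml_data)
    gpus = []
    for gpu in root.findall('gpu'):
        gpus.append({
            'product_name': gpu.find('product_name').text,
            'fan_speed': gpu.find('fan_speed').text,
            'memory_used': gpu.find('fb_memory_usage/used').text,
            'processes': len(gpu.findall('processes/process_info')),
        })
    return gpus


if __name__ == '__main__':
    for g in query_gpus():
        print(g)
```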
## Configuration
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 01a8f1c7bd..a3be7b3ce1 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -2,9 +2,12 @@
# Description: nvidia-smi netdata python.d module
# Original Author: Steven Noonan (tycho)
# Author: Ilya Mashchenko (ilyam8)
+# User Memory Stat Author: Guido Scatena (scatenag)
import subprocess
import threading
+import os
+
import xml.etree.ElementTree as et
from bases.FrameworkServices.SimpleService import SimpleService
@@ -30,6 +33,8 @@ TEMPERATURE = 'temperature'
CLOCKS = 'clocks'
POWER = 'power'
PROCESSES_MEM = 'processes_mem'
+USER_MEM = 'user_mem'
+USER_NUM = 'user_num'
ORDER = [
PCI_BANDWIDTH,
@@ -42,6 +47,8 @@ ORDER = [
CLOCKS,
POWER,
PROCESSES_MEM,
+ USER_MEM,
+ USER_NUM,
]
@@ -114,6 +121,16 @@ def gpu_charts(gpu):
'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
'lines': []
},
+ USER_MEM: {
+ 'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
+ 'lines': []
+ },
+ USER_NUM: {
+        'options': [None, 'Number of Users on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
+ 'lines': [
+ ['user_num', 'users'],
+ ]
+ },
}
idx = gpu.num
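
For reference, the `options` list in a python.d chart definition is ordered `[name, title, units, family, context, chart_type]`. The sketch below spells out the fields of the new `USER_NUM` entry; the family value is only an example, since the real module fills it in per GPU (`fam`):

```python
# Annotated form of the USER_NUM chart definition above
# (field order follows the python.d chart convention).
USER_NUM_CHART = {
    'options': [
        None,                       # name (None: derived from the chart id)
        'Number of Users on GPU',   # title
        'num',                      # units
        'gpu0',                     # family (per-GPU, e.g. the value of `fam`)
        'nvidia_smi.user_num',      # context
        'line',                     # chart type
    ],
    'lines': [
        ['user_num', 'users'],      # dimension id, dimension name
    ],
}
```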
@@ -226,6 +243,50 @@ def handle_value_error(method):
return on_call
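+# NETDATA_HOST_PREFIX is set when netdata runs inside a container; the host's
+# /etc/passwd and /proc are then reachable under this prefix.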
+HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
+ETC_PASSWD_PATH = '/etc/passwd'
+PROC_PATH = '/proc'
+
+if HOST_PREFIX:
+ ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
+ PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
+
+
+def read_passwd_file():
+ data = dict()
+ with open(ETC_PASSWD_PATH, 'r') as f:
+ for line in f:
+ line = line.strip()
+ if line.startswith("#"):
+ continue
+ fields = line.split(":")
+ # name, passwd, uid, gid, comment, home_dir, shell
+ if len(fields) != 7:
+ continue
+        # uid, gid
+ fields[2], fields[3] = int(fields[2]), int(fields[3])
+ data[fields[2]] = fields
+ return data
+
+
+def read_passwd_file_safe():
+ try:
+ return read_passwd_file()
+ except (OSError, IOError):
+ return dict()
+
+
+def get_username_by_pid_safe(pid, passwd_file):
+ if not passwd_file:
+ return ''
+ path = os.path.join(PROC_PATH, pid)
+ try:
+ uid = os.stat(path).st_uid
+ return passwd_file[uid][0]
+ except (OSError, IOError, KeyError):
+ return ''
+
+
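
A quick usage sketch of the helpers above, with a hypothetical PID (without `NETDATA_HOST_PREFIX` set, this reads the local `/etc/passwd` and `/proc` directly):

```python
# Resolve a process owner the same way the module does: stat /proc/<pid>
# to get the uid, then look the uid up in the parsed passwd file.
passwd = read_passwd_file_safe()
print(get_username_by_pid_safe('12345', passwd))  # e.g. 'alice', or '' on any failure
```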
class GPU:
def __init__(self, num, root):
self.num = num
@@ -303,15 +364,22 @@ class GPU:
@handle_attr_error
def processes(self):
- p_nodes = self.root.find('processes').findall('process_info')
- ps = []
- for p in p_nodes:
- ps.append({
- 'pid': p.find('pid').text,
- 'process_name': p.find('process_name').text,
- 'used_memory': int(p.find('used_memory').text.split()[0]),
+ processes_info = self.root.find('processes').findall('process_info')
+ if not processes_info:
+ return list()
+
+ passwd_file = read_passwd_file_safe()
+ processes = list()
+
+ for info in processes_info:
+ pid = info.find('pid').text
+ processes.append({
+ 'pid': int(pid),
+ 'process_name': info.find('process_name').text,
+ 'used_memory': int(info.find('used_memory').text.split()[0]),
+ 'username': get_username_by_pid_safe(pid, passwd_file),
})
- return ps
+ return processes
def data(self):
data = {
@@ -332,7 +400,17 @@ class GPU:
'power_draw': self.power_draw(),
}
processes = self.processes() or []
- data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
+ users = set()
+ for p in processes:
+ data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
+ if p['username']:
+ users.add(p['username'])
+ key = 'user_mem_{0}'.format(p['username'])
+ if key in data:
+ data[key] += p['used_memory']
+ else:
+ data[key] = p['used_memory']
+ data['user_num'] = len(users)
return dict(
('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
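
The per-user accumulation above could equivalently be written with `collections.defaultdict`; a standalone sketch, assuming a `processes` list shaped like the one returned by `processes()`:

```python
from collections import defaultdict


def per_user_memory(processes):
    # processes: list of dicts with 'pid', 'used_memory' (MiB) and 'username'
    # keys, as returned by GPU.processes() above.
    mem = defaultdict(int)
    for p in processes:
        if p['username']:
            mem['user_mem_{0}'.format(p['username'])] += p['used_memory']
    # per-user totals plus the number of distinct users seen
    return dict(mem), len(mem)
```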
@@ -379,6 +457,7 @@ class Service(SimpleService):
gpu = GPU(idx, root)
data.update(gpu.data())
self.update_processes_mem_chart(gpu)
+ self.update_processes_user_mem_chart(gpu)
return data or None
@@ -397,6 +476,24 @@ class Service(SimpleService):
if dim.id not in active_dim_ids:
chart.del_dimension(dim.id, hide=False)
+ def update_processes_user_mem_chart(self, gpu):
+ ps = gpu.processes()
+ if not ps:
+ return
+ chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
+ active_dim_ids = []
+ for p in ps:
+ if not p.get('username'):
+ continue
+ dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
+ active_dim_ids.append(dim_id)
+ if dim_id not in chart:
+ chart.add_dimension([dim_id, '{0}'.format(p['username'])])
+
+ for dim in chart:
+ if dim.id not in active_dim_ids:
+ chart.del_dimension(dim.id, hide=False)
+
def check(self):
if not self.poller.has_smi():
self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))