| field     | value                                                                            | date                      |
|-----------|----------------------------------------------------------------------------------|---------------------------|
| author    | Guido <guido.scatena@unipi.it>                                                   | 2020-08-04 18:09:06 +0200 |
| committer | GitHub <noreply@github.com>                                                      | 2020-08-04 19:09:06 +0300 |
| commit    | 211137df801d1b3e4d4d484a10c4ec42d8640040 (patch)                                 |                           |
| tree      | 79ecb281e5d6a9b616f3dea939ab8717d38b7a4c /collectors/python.d.plugin/nvidia_smi  |                           |
| parent    | 101011f2f0186774af11f0a708e7e91d43f7ded7 (diff)                                  |                           |
nvidia_smi: charts for memory used by each user and number of distinct users (#9372)
Co-authored-by: ilyam8 <ilya@netdata.cloud>
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/README.md           |  37 |
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 115 |
2 files changed, 123 insertions, 29 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index 9da51ea6a8..9d11e4a363 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -9,36 +9,33 @@ sidebar_label: "Nvidia GPUs"
 
 Monitors performance metrics (memory usage, fan speed, pcie bandwidth utilization, temperature, etc.) using `nvidia-smi` cli tool.
 
-**Requirements and Notes:**
+## Requirements and Notes
 
 - You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. Mostly the newer high end models used for AI / ML and Crypto or Pro range, read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
-
 - You must enable this plugin as its disabled by default due to minor performance issues.
-
 - On some systems when the GPU is idle the `nvidia-smi` tool unloads and there is added latency again when it is next queried. If you are running GPUs under constant workload this isn't likely to be an issue.
-
 - Currently the `nvidia-smi` tool is being queried via cli. Updating the plugin to use the nvidia c/c++ API directly should resolve this issue. See discussion here: <https://github.com/netdata/netdata/pull/4357>
-
 - Contributions are welcome.
-
 - Make sure `netdata` user can execute `/usr/bin/nvidia-smi` or wherever your binary is.
-
 - If `nvidia-smi` process [is not killed after netdata restart](https://github.com/netdata/netdata/issues/7143) you need to off `loop_mode`.
-
 - `poll_seconds` is how often in seconds the tool is polled for as an integer.
 
-It produces:
-
-1. Per GPU
-
-   - GPU utilization
-   - memory allocation
-   - memory utilization
-   - fan speed
-   - power usage
-   - temperature
-   - clock speed
-   - PCI bandwidth
+## Charts
+
+It produces the following charts:
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of User on GPU in `num`
 
 ## Configuration
 
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 01a8f1c7bd..a3be7b3ce1 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -2,9 +2,12 @@
 # Description: nvidia-smi netdata python.d module
 # Original Author: Steven Noonan (tycho)
 # Author: Ilya Mashchenko (ilyam8)
+# User Memory Stat Author: Guido Scatena (scatenag)
 
 import subprocess
 import threading
+import os
+
 import xml.etree.ElementTree as et
 
 from bases.FrameworkServices.SimpleService import SimpleService
@@ -30,6 +33,8 @@ TEMPERATURE = 'temperature'
 CLOCKS = 'clocks'
 POWER = 'power'
 PROCESSES_MEM = 'processes_mem'
+USER_MEM = 'user_mem'
+USER_NUM = 'user_num'
 
 ORDER = [
     PCI_BANDWIDTH,
@@ -42,6 +47,8 @@ ORDER = [
     CLOCKS,
     POWER,
     PROCESSES_MEM,
+    USER_MEM,
+    USER_NUM,
 ]
 
 
@@ -114,6 +121,16 @@ def gpu_charts(gpu):
             'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
             'lines': []
         },
+        USER_MEM: {
+            'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
+            'lines': []
+        },
+        USER_NUM: {
+            'options': [None, 'Number of User on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
+            'lines': [
+                ['user_num', 'users'],
+            ]
+        },
     }
 
     idx = gpu.num
@@ -226,6 +243,50 @@ def handle_value_error(method):
     return on_call
 
 
+HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
+ETC_PASSWD_PATH = '/etc/passwd'
+PROC_PATH = '/proc'
+
+if HOST_PREFIX:
+    ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
+    PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
+
+
+def read_passwd_file():
+    data = dict()
+    with open(ETC_PASSWD_PATH, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+            fields = line.split(":")
+            # name, passwd, uid, gid, comment, home_dir, shell
+            if len(fields) != 7:
+                continue
+            # uid, guid
+            fields[2], fields[3] = int(fields[2]), int(fields[3])
+            data[fields[2]] = fields
+    return data
+
+
+def read_passwd_file_safe():
+    try:
+        return read_passwd_file()
+    except (OSError, IOError):
+        return dict()
+
+
+def get_username_by_pid_safe(pid, passwd_file):
+    if not passwd_file:
+        return ''
+    path = os.path.join(PROC_PATH, pid)
+    try:
+        uid = os.stat(path).st_uid
+        return passwd_file[uid][0]
+    except (OSError, IOError, KeyError):
+        return ''
+
+
 class GPU:
     def __init__(self, num, root):
         self.num = num
@@ -303,15 +364,22 @@ class GPU:
 
     @handle_attr_error
     def processes(self):
-        p_nodes = self.root.find('processes').findall('process_info')
-        ps = []
-        for p in p_nodes:
-            ps.append({
-                'pid': p.find('pid').text,
-                'process_name': p.find('process_name').text,
-                'used_memory': int(p.find('used_memory').text.split()[0]),
+        processes_info = self.root.find('processes').findall('process_info')
+        if not processes_info:
+            return list()
+
+        passwd_file = read_passwd_file_safe()
+        processes = list()
+
+        for info in processes_info:
+            pid = info.find('pid').text
+            processes.append({
+                'pid': int(pid),
+                'process_name': info.find('process_name').text,
+                'used_memory': int(info.find('used_memory').text.split()[0]),
+                'username': get_username_by_pid_safe(pid, passwd_file),
             })
-        return ps
+        return processes
 
     def data(self):
         data = {
@@ -332,7 +400,17 @@
             'power_draw': self.power_draw(),
         }
         processes = self.processes() or []
-        data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
+        users = set()
+        for p in processes:
+            data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
+            if p['username']:
+                users.add(p['username'])
+                key = 'user_mem_{0}'.format(p['username'])
+                if key in data:
+                    data[key] += p['used_memory']
+                else:
+                    data[key] = p['used_memory']
+        data['user_num'] = len(users)
 
         return dict(
             ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
@@ -379,6 +457,7 @@
             gpu = GPU(idx, root)
             data.update(gpu.data())
             self.update_processes_mem_chart(gpu)
+            self.update_processes_user_mem_chart(gpu)
 
         return data or None
 
@@ -397,6 +476,24 @@
             if dim.id not in active_dim_ids:
                 chart.del_dimension(dim.id, hide=False)
 
+    def update_processes_user_mem_chart(self, gpu):
+        ps = gpu.processes()
+        if not ps:
+            return
+        chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
+        active_dim_ids = []
+        for p in ps:
+            if not p.get('username'):
+                continue
+            dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
+            active_dim_ids.append(dim_id)
+            if dim_id not in chart:
+                chart.add_dimension([dim_id, '{0}'.format(p['username'])])
+
+        for dim in chart:
+            if dim.id not in active_dim_ids:
+                chart.del_dimension(dim.id, hide=False)
+
     def check(self):
         if not self.poller.has_smi():
             self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))