diff options
author | 胡玮文 <huww98@outlook.com> | 2019-04-05 15:59:14 +0800 |
---|---|---|
committer | Ilya Mashchenko <ilyamaschenko@gmail.com> | 2019-04-05 10:59:14 +0300 |
commit | 01b91117caecce0d50020a51673b9d3b3b0e01f0 (patch) | |
tree | fb95e138d9d5f497f5efcd97d7939bf5b7092bc9 /collectors/python.d.plugin/nvidia_smi | |
parent | d9d41e6b9e23db9aa48481dd7efa4c317273c9fb (diff) |
Add memory free and per process memory usage to nvidia_smi (#5796)
<!--
Describe the change in summary section, including rationale and design decisions.
Include "Fixes #nnn" if you are fixing an existing issue.
In "Component Name" section write which component is changed in this PR. This
will help us review your PR quicker.
If you have more information you want to add, write them in "Additional
Information" section. This is usually used to help others understand your
motivation behind this change. A step-by-step reproduction of the problem is
helpful if there is no related issue.
-->
##### Summary
Add memory free
Add per process memory usage
##### Component Name
nvidia_smi
##### Additional Information
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 68 |
1 files changed, 57 insertions, 11 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py index 4dc67f1334..f7b7020e01 100644 --- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py +++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py @@ -26,10 +26,11 @@ FAN_SPEED = 'fan_speed' GPU_UTIL = 'gpu_utilization' MEM_UTIL = 'mem_utilization' ENCODER_UTIL = 'encoder_utilization' -MEM_ALLOCATED = 'mem_allocated' +MEM_USAGE = 'mem_usage' TEMPERATURE = 'temperature' CLOCKS = 'clocks' POWER = 'power' +PROCESSES_MEM = 'processes_mem' ORDER = [ PCI_BANDWIDTH, @@ -37,10 +38,11 @@ ORDER = [ GPU_UTIL, MEM_UTIL, ENCODER_UTIL, - MEM_ALLOCATED, + MEM_USAGE, TEMPERATURE, CLOCKS, POWER, + PROCESSES_MEM, ] @@ -80,10 +82,11 @@ def gpu_charts(gpu): ['decoder_util', 'decoder'], ] }, - MEM_ALLOCATED: { - 'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'], + MEM_USAGE: { + 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'], 'lines': [ - ['fb_memory_usage', 'used'], + ['fb_memory_free', 'free'], + ['fb_memory_used', 'used'], ] }, TEMPERATURE: { @@ -107,6 +110,10 @@ def gpu_charts(gpu): ['power_draw', 'power', 1, 100], ] }, + PROCESSES_MEM: { + 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'], + 'lines': [] + }, } idx = gpu.num @@ -260,10 +267,14 @@ class GPU: return self.root.find('utilization').find('decoder_util').text.split()[0] @handle_attr_error - def fb_memory_usage(self): + def fb_memory_used(self): return self.root.find('fb_memory_usage').find('used').text.split()[0] @handle_attr_error + def fb_memory_free(self): + return self.root.find('fb_memory_usage').find('free').text.split()[0] + + @handle_attr_error def temperature(self): return self.root.find('temperature').find('gpu_temp').text.split()[0] @@ -288,6 +299,18 @@ class GPU: def power_draw(self): return 
float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100 + @handle_attr_error + def processes(self): + p_nodes = self.root.find('processes').findall('process_info') + ps = [] + for p in p_nodes: + ps.append({ + 'pid': p.find('pid').text, + 'process_name': p.find('process_name').text, + 'used_memory': int(p.find('used_memory').text.split()[0]), + }) + return ps + def data(self): data = { 'rx_util': self.rx_util(), @@ -297,7 +320,8 @@ class GPU: 'memory_util': self.memory_util(), 'encoder_util': self.encoder_util(), 'decoder_util': self.decoder_util(), - 'fb_memory_usage': self.fb_memory_usage(), + 'fb_memory_used': self.fb_memory_used(), + 'fb_memory_free': self.fb_memory_free(), 'gpu_temp': self.temperature(), 'graphics_clock': self.graphics_clock(), 'video_clock': self.video_clock(), @@ -305,12 +329,13 @@ class GPU: 'mem_clock': self.mem_clock(), 'power_draw': self.power_draw(), } + processes = self.processes() or [] + data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes}) return dict( ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE ) - class Service(SimpleService): def __init__(self, configuration=None, name=None): super(Service, self).__init__(configuration=configuration, name=name) @@ -320,11 +345,16 @@ class Service(SimpleService): self.poller = NvidiaSMIPoller(poll) def get_data(self): + if not self.poller.is_started(): + self.poller.start() + if not self.poller.is_alive(): self.debug('poller is off') return None last_data = self.poller.data() + if not last_data: + return None parsed = self.parse_xml(last_data) if parsed is None: @@ -332,10 +362,27 @@ class Service(SimpleService): data = dict() for idx, root in enumerate(parsed.findall('gpu')): - data.update(GPU(idx, root).data()) + gpu = GPU(idx, root) + data.update(gpu.data()) + self.update_processes_mem_chart(gpu) return data or None + def update_processes_mem_chart(self, gpu): + ps = gpu.processes() 
+ if not ps: + return + chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)] + active_dim_ids = [] + for p in ps: + dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid']) + active_dim_ids.append(dim_id) + if dim_id not in chart: + chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])]) + for dim in chart: + if dim.id not in active_dim_ids: + chart.del_dimension(dim.id, hide=False) + def check(self): if not self.poller.has_smi(): self.error("couldn't find '{0}' binary".format(NVIDIA_SMI)) @@ -355,7 +402,6 @@ class Service(SimpleService): return False self.create_charts(gpus) - self.poller.start() return True @@ -363,7 +409,7 @@ class Service(SimpleService): try: return et.fromstring(data) except et.ParseError as error: - self.error(error) + self.error('xml parse failed: "{0}", error: {1}'.format(data, error)) return None |