summaryrefslogtreecommitdiffstats
path: root/collectors/python.d.plugin/nvidia_smi
diff options
context:
space:
mode:
author: 胡玮文 <huww98@outlook.com> 2019-04-05 15:59:14 +0800
committer: Ilya Mashchenko <ilyamaschenko@gmail.com> 2019-04-05 10:59:14 +0300
commit: 01b91117caecce0d50020a51673b9d3b3b0e01f0 (patch)
tree: fb95e138d9d5f497f5efcd97d7939bf5b7092bc9 /collectors/python.d.plugin/nvidia_smi
parent: d9d41e6b9e23db9aa48481dd7efa4c317273c9fb (diff)
Add memory free and per process memory usage to nvidia_smi (#5796)
<!-- Describe the change in summary section, including rationale and design decisions. Include "Fixes #nnn" if you are fixing an existing issue. In "Component Name" section write which component is changed in this PR. This will help us review your PR quicker. If you have more information you want to add, write them in "Additional Information" section. This is usually used to help others understand your motivation behind this change. A step-by-step reproduction of the problem is helpful if there is no related issue. --> ##### Summary Add memory free Add per process memory usage ##### Component Name nvidia_smi ##### Additional Information
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py68
1 file changed, 57 insertions, 11 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 4dc67f1334..f7b7020e01 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -26,10 +26,11 @@ FAN_SPEED = 'fan_speed'
GPU_UTIL = 'gpu_utilization'
MEM_UTIL = 'mem_utilization'
ENCODER_UTIL = 'encoder_utilization'
-MEM_ALLOCATED = 'mem_allocated'
+MEM_USAGE = 'mem_usage'
TEMPERATURE = 'temperature'
CLOCKS = 'clocks'
POWER = 'power'
+PROCESSES_MEM = 'processes_mem'
ORDER = [
PCI_BANDWIDTH,
@@ -37,10 +38,11 @@ ORDER = [
GPU_UTIL,
MEM_UTIL,
ENCODER_UTIL,
- MEM_ALLOCATED,
+ MEM_USAGE,
TEMPERATURE,
CLOCKS,
POWER,
+ PROCESSES_MEM,
]
@@ -80,10 +82,11 @@ def gpu_charts(gpu):
['decoder_util', 'decoder'],
]
},
- MEM_ALLOCATED: {
- 'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'],
+ MEM_USAGE: {
+ 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
'lines': [
- ['fb_memory_usage', 'used'],
+ ['fb_memory_free', 'free'],
+ ['fb_memory_used', 'used'],
]
},
TEMPERATURE: {
@@ -107,6 +110,10 @@ def gpu_charts(gpu):
['power_draw', 'power', 1, 100],
]
},
+ PROCESSES_MEM: {
+ 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
+ 'lines': []
+ },
}
idx = gpu.num
@@ -260,10 +267,14 @@ class GPU:
return self.root.find('utilization').find('decoder_util').text.split()[0]
@handle_attr_error
- def fb_memory_usage(self):
+ def fb_memory_used(self):
return self.root.find('fb_memory_usage').find('used').text.split()[0]
@handle_attr_error
+ def fb_memory_free(self):
+ return self.root.find('fb_memory_usage').find('free').text.split()[0]
+
+ @handle_attr_error
def temperature(self):
return self.root.find('temperature').find('gpu_temp').text.split()[0]
@@ -288,6 +299,18 @@ class GPU:
def power_draw(self):
return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
+ @handle_attr_error
+ def processes(self):
+ p_nodes = self.root.find('processes').findall('process_info')
+ ps = []
+ for p in p_nodes:
+ ps.append({
+ 'pid': p.find('pid').text,
+ 'process_name': p.find('process_name').text,
+ 'used_memory': int(p.find('used_memory').text.split()[0]),
+ })
+ return ps
+
def data(self):
data = {
'rx_util': self.rx_util(),
@@ -297,7 +320,8 @@ class GPU:
'memory_util': self.memory_util(),
'encoder_util': self.encoder_util(),
'decoder_util': self.decoder_util(),
- 'fb_memory_usage': self.fb_memory_usage(),
+ 'fb_memory_used': self.fb_memory_used(),
+ 'fb_memory_free': self.fb_memory_free(),
'gpu_temp': self.temperature(),
'graphics_clock': self.graphics_clock(),
'video_clock': self.video_clock(),
@@ -305,12 +329,13 @@ class GPU:
'mem_clock': self.mem_clock(),
'power_draw': self.power_draw(),
}
+ processes = self.processes() or []
+ data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
return dict(
('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
)
-
class Service(SimpleService):
def __init__(self, configuration=None, name=None):
super(Service, self).__init__(configuration=configuration, name=name)
@@ -320,11 +345,16 @@ class Service(SimpleService):
self.poller = NvidiaSMIPoller(poll)
def get_data(self):
+ if not self.poller.is_started():
+ self.poller.start()
+
if not self.poller.is_alive():
self.debug('poller is off')
return None
last_data = self.poller.data()
+ if not last_data:
+ return None
parsed = self.parse_xml(last_data)
if parsed is None:
@@ -332,10 +362,27 @@ class Service(SimpleService):
data = dict()
for idx, root in enumerate(parsed.findall('gpu')):
- data.update(GPU(idx, root).data())
+ gpu = GPU(idx, root)
+ data.update(gpu.data())
+ self.update_processes_mem_chart(gpu)
return data or None
+ def update_processes_mem_chart(self, gpu):
+ ps = gpu.processes()
+ if not ps:
+ return
+ chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
+ active_dim_ids = []
+ for p in ps:
+ dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
+ active_dim_ids.append(dim_id)
+ if dim_id not in chart:
+ chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
+ for dim in chart:
+ if dim.id not in active_dim_ids:
+ chart.del_dimension(dim.id, hide=False)
+
def check(self):
if not self.poller.has_smi():
self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -355,7 +402,6 @@ class Service(SimpleService):
return False
self.create_charts(gpus)
- self.poller.start()
return True
@@ -363,7 +409,7 @@ class Service(SimpleService):
try:
return et.fromstring(data)
except et.ParseError as error:
- self.error(error)
+ self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
return None