summaryrefslogtreecommitdiffstats
path: root/nvidia.go
blob: 7f2c743cbec7fdc620a8fe435e83408eea93ba0d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package nvidia

// TODO(optimization): cache the most recent nvidia-smi result together with a
// timestamp, and only query again once the cached result is older than some
// threshold.

import (
	"strconv"
	"strings"
	"time"

	"github.com/rai-project/nvidia-smi"
	//"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
	"github.com/xxxserxxx/gotop/v3/devices"
)

// init registers this package's NVIDIA collectors with the devices package
// when the package is loaded: updateNvidiaTemp as a temperature source,
// updateNvidiaMem as a memory source, and updateNvidiaUsage through the CPU
// registration hook.
func init() {
	devices.RegisterTemp(updateNvidiaTemp)
	devices.RegisterMem(updateNvidiaMem)
	// GPU utilization is registered via RegisterCPU; presumably it is then
	// reported alongside CPU usage — confirm against the devices package.
	devices.RegisterCPU(updateNvidiaUsage)
}

// updateNvidiaTemp writes the temperature of every GPU reported by nvidiasmi
// into temps, keyed by "<product name> <index>".
//
// GPUs whose temperature is reported as "N/A" are skipped. A trailing " C"
// unit is stripped from the reading before it is parsed (presumably degrees
// Celsius — confirm against nvidiasmi); any fractional part is truncated.
//
// The returned map is never nil. A failure to query nvidiasmi is stored under
// the key "nvidia"; a reading that cannot be parsed is stored under that
// GPU's name. An empty map means every reading was processed successfully.
func updateNvidiaTemp(temps map[string]int) map[string]error {
	errs := make(map[string]error)
	info, err := nvidiasmi.New()
	if err != nil {
		errs["nvidia"] = err
		return errs
	}
	if !info.HasGPU() {
		return errs
	}
	for i := range info.GPUS {
		gpu := info.GPUS[i]
		if gpu.GpuTemp == "N/A" {
			// The GPU does not export a temperature measure.
			continue
		}
		name := gpu.ProductName + " " + strconv.Itoa(i)
		// The second argument of ParseFloat is a bit size (32 or 64), not a
		// numeric base.
		temperature, err := strconv.ParseFloat(strings.TrimSuffix(gpu.GpuTemp, " C"), 64)
		if err != nil {
			errs[name] = err
			continue
		}
		temps[name] = int(temperature)
	}
	return errs
}

// updateNvidiaMem writes a devices.MemoryInfo for every GPU reported by
// nvidiasmi into mems.
//
// Entries are keyed by the product name immediately followed by the GPU index
// (no separator). NOTE(review): updateNvidiaTemp and updateNvidiaUsage put a
// space between name and index; the key is left unchanged here so existing
// labels stay stable — confirm whether the difference is intentional.
//
// GPUs that report "N/A" for memory utilization, total or used memory are
// skipped. The three values are parsed with strconv.Atoi, which only accepts
// bare integers; a value carrying a unit suffix is recorded as an error.
// NOTE(review): confirm the exact string format nvidiasmi returns.
//
// When only some of the figures are non-zero the missing ones are derived
// from the utilization percentage:
//   - total and used both zero: total becomes 100 and used becomes the
//     percentage, i.e. usage is expressed on a 0-100 scale;
//   - only used zero: used = total * percentage / 100;
//   - only total zero: total = used / (percentage / 100), unless the
//     percentage is zero, in which case total stays zero.
//
// The returned map is never nil. A failure to query nvidiasmi is stored under
// the key "nvidia"; parse failures are stored under the GPU key suffixed with
// "Mem", "Total" or "Used" depending on which field failed.
func updateNvidiaMem(mems map[string]devices.MemoryInfo) map[string]error {
	errs := make(map[string]error)
	info, err := nvidiasmi.New()
	if err != nil {
		errs["nvidia"] = err
		return errs
	}
	if !info.HasGPU() {
		return errs
	}
	for i := range info.GPUS {
		gpu := info.GPUS[i]
		if gpu.MemoryUtil == "N/A" || gpu.Total == "N/A" || gpu.Used == "N/A" {
			// The GPU does not export sufficient memory measures.
			continue
		}
		name := gpu.ProductName + strconv.Itoa(i)
		mem, err := strconv.Atoi(gpu.MemoryUtil)
		if err != nil {
			errs[name+"Mem"] = err
			continue
		}
		total, err := strconv.Atoi(gpu.Total)
		if err != nil {
			errs[name+"Total"] = err
			continue
		}
		used, err := strconv.Atoi(gpu.Used)
		if err != nil {
			errs[name+"Used"] = err
			continue
		}
		switch {
		case total == 0 && used == 0:
			// No absolute figures at all: fall back to a 0-100 scale.
			total = 100
			used = mem
		case total != 0 && used == 0:
			used = int(float64(total) * (float64(mem) / 100))
		case total == 0 && used != 0 && mem != 0:
			// Guarded by mem != 0: dividing by a zero percentage yields +Inf,
			// and converting that to int is implementation-defined. With a
			// zero percentage the total cannot be derived and stays 0.
			total = int(float64(used) / (float64(mem) / 100))
		}
		mems[name] = devices.MemoryInfo{
			Total:       uint64(total),
			Used:        uint64(used),
			UsedPercent: float64(mem),
		}
	}
	return errs
}

// updateNvidiaUsage stores the utilization figure of every GPU reported by
// nvidiasmi in cpus, keyed by "<product name> <index>". The duration and
// boolean parameters are part of the callback signature and are ignored.
//
// GPUs that report "N/A" for utilization are skipped. The returned map is
// never nil: a failure to query nvidiasmi is recorded under "nvidia", and a
// utilization value that strconv.Atoi cannot parse is recorded under that
// GPU's key.
func updateNvidiaUsage(cpus map[string]int, _ time.Duration, _ bool) map[string]error {
	failures := map[string]error{}

	smi, queryErr := nvidiasmi.New()
	if queryErr != nil {
		failures["nvidia"] = queryErr
		return failures
	}
	if !smi.HasGPU() {
		return failures
	}

	for idx, card := range smi.GPUS {
		if card.GpuUtil == "N/A" {
			// This GPU does not export a utilization measure.
			continue
		}
		label := card.ProductName + " " + strconv.Itoa(idx)
		if pct, parseErr := strconv.Atoi(card.GpuUtil); parseErr != nil {
			failures[label] = parseErr
		} else {
			cpus[label] = pct
		}
	}
	return failures
}