summaryrefslogtreecommitdiffstats
path: root/nvidia.go
blob: baf1f52e52c92da91aa7bec8342d0546ea32c970 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package nvidia

import (
	"bytes"
	"encoding/csv"
	"os/exec"
	"strconv"
	"sync"
	"time"

	//"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
	"github.com/xxxserxxx/gotop/v4/devices"
)

// init allocates the package-level cache maps and registers this package's
// callbacks with the devices package: the temperature, memory and usage
// readers, followed by the startup hook.
func init() {
	_temps = make(map[string]int)
	_mems = make(map[string]devices.MemoryInfo)
	_cpus = make(map[string]int)
	errors = make(map[string]error)
	devices.RegisterTemp(updateNvidiaTemp)
	devices.RegisterMem(updateNvidiaMem)
	devices.RegisterCPU(updateNvidiaUsage)

	// lock is deliberately not assigned here: the zero value of sync.Mutex is
	// already an unlocked, ready-to-use mutex.
	devices.RegisterStartup(startup)
}

// updateNvidiaTemp copies every cached GPU temperature into temps, keyed by
// device name, and returns the errors currently recorded for this package.
//
// The returned map is a copy made while lock is held, so the caller never
// iterates the live errors map that update keeps mutating in the background.
func updateNvidiaTemp(temps map[string]int) map[string]error {
	lock.Lock()
	defer lock.Unlock()
	for k, v := range _temps {
		temps[k] = v
	}
	errs := make(map[string]error, len(errors))
	for k, v := range errors {
		errs[k] = v
	}
	return errs
}

// updateNvidiaMem copies every cached GPU memory record into mems, keyed by
// device name, and returns the errors currently recorded for this package.
//
// The returned map is a copy made while lock is held, so the caller never
// iterates the live errors map that update keeps mutating in the background.
func updateNvidiaMem(mems map[string]devices.MemoryInfo) map[string]error {
	lock.Lock()
	defer lock.Unlock()
	for k, v := range _mems {
		mems[k] = v
	}
	errs := make(map[string]error, len(errors))
	for k, v := range errors {
		errs[k] = v
	}
	return errs
}

// updateNvidiaUsage copies every cached GPU utilization value into cpus, keyed
// by device name, and returns the errors currently recorded for this package.
// The second parameter is ignored.
//
// The returned map is a copy made while lock is held, so the caller never
// iterates the live errors map that update keeps mutating in the background.
func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
	lock.Lock()
	defer lock.Unlock()
	for k, v := range _cpus {
		cpus[k] = v
	}
	errs := make(map[string]error, len(errors))
	for k, v := range errors {
		errs[k] = v
	}
	return errs
}

// refreshError reports an "nvidia-refresh" setting that parses as a duration
// but is zero or negative and therefore cannot drive the polling ticker. The
// string is the offending setting as supplied.
type refreshError string

// Error implements the error interface.
func (e refreshError) Error() string {
	return "nvidia-refresh must be a positive duration, got " + strconv.Quote(string(e))
}

// startup launches the background goroutine that keeps the cached metrics
// fresh by calling update.
//
// The polling interval defaults to one second and can be overridden with the
// "nvidia-refresh" entry of vars, in time.ParseDuration syntax. An error is
// returned, and no goroutine is started, when that entry cannot be parsed or
// is not a positive duration.
func startup(vars map[string]string) error {
	refresh := time.Second
	if v, ok := vars["nvidia-refresh"]; ok {
		d, err := time.ParseDuration(v)
		if err != nil {
			return err
		}
		// time.Tick returns a nil channel for d <= 0, which would leave the
		// poller blocked forever without ever collecting anything.
		if d <= 0 {
			return refreshError(v)
		}
		refresh = d
	}
	go func() {
		// Collect once right away so data is available before the first tick.
		update()
		// The poller lives for the rest of the process, so the ticker behind
		// time.Tick is intentionally never stopped.
		for range time.Tick(refresh) {
			update()
		}
	}()
	return nil
}

// Cached metrics written by update and copied out by the callbacks registered
// in init. Device keys have the form "<name>.<index>".
var (
	// _temps holds the value parsed from the temperature.gpu column.
	_temps map[string]int
	// _mems holds the record built from the memory.total and memory.used columns.
	_mems  map[string]devices.MemoryInfo
	// _cpus holds the value parsed from the utilization.gpu column.
	_cpus  map[string]int
	// errors holds the recorded failures: key "nvidia" for a failed query or
	// CSV parse, a device key for a field that could not be converted.
	// NOTE: within this package the name shadows the standard library's
	// errors package.
	errors map[string]error
)

// lock is held by the reader callbacks while they copy the caches, and by
// update while it stores freshly parsed rows.
var lock sync.Mutex

// update updates the cached NVidia metric data: name, index,
// temperature.gpu, utilization.gpu, utilization.memory, memory.total, memory.free, memory.used
func update() {
	bs, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
		"--format=csv,noheader,nounits").Output()
	if err != nil {
		errors["nvidia"] = err
		return
	}

	csvReader := csv.NewReader(bytes.NewReader(bs))
	csvReader.TrimLeadingSpace = true
	records, err := csvReader.ReadAll()
	if err != nil {
		errors["nvidia"] = err
		return
	}

	lock.Lock()
	defer lock.Unlock()
	for _, row := range records {
		name := row[0] + "." + row[1]
		if _temps[name], err = strconv.Atoi(row[2]); err != nil {
			errors[name] = err
		}
		if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
			errors[name] = err
		}
		t, err := strconv.Atoi(row[4])
		if err != nil {
			errors[name] = err
		}
		u, err := strconv.Atoi(row[5])
		if err != nil {
			errors[name] = err
		}
		_mems[name] = devices.MemoryInfo{
			Total:       uint64(t),
			Used:        uint64(u),
			UsedPercent: float64(u) / float64(t),
		}
	}
}