summaryrefslogtreecommitdiffstats
path: root/devices/nvidia.go
blob: 0e50dba65cb68ae0bb2ff748afa37f11833e8fef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
package devices

import (
	"bytes"
	"encoding/csv"
	"errors"
	"fmt"
	"os/exec"
	"strconv"
	"sync"
	"time"

	"github.com/xxxserxxx/opflag"
)

// init sets up the plugin's command-line flag and registers the plugin's
// startup hook with the main code.
//
// The functions Register*(f) tell gotop which of these plugin functions to
// call to update data; the RegisterStartup() function sets the function
// that gotop will call when everything else has been done and the plugin
// should start collecting data. Only the startup hook is registered here;
// the data update functions are registered later, from startNVidia.
//
// In this plugin, one call to the nvidia program returns *all* the data
// we're looking for, but gotop will call each update function during each
// cycle. This means that the nvidia program would be called 3 (or more)
// times per update, which isn't very efficient. Therefore, we make this
// code more complex to run a job in the background that runs the nvidia
// tool periodically and puts the results into maps; the update functions
// then just sync data from those maps into the return data.
func init() {
	// Bind the boolean "nvidia" flag (default false) to the package-level
	// nvidia variable, which startNVidia checks before doing any work.
	opflag.BoolVarP(&nvidia, "nvidia", "", false, "Enable NVidia GPU support")
	RegisterStartup(startNVidia)
}

// updateNvidiaTemp copies data from the local _temps cache into the passed-in
// return-value map. It is called once per cycle by gotop.
//
// The returned map is a snapshot of the cached errors, copied while
// nvidiaLock is held. Handing out the shared _errors map itself would let
// the caller iterate over it after the lock is released, while the
// background job may be writing to it.
func updateNvidiaTemp(temps map[string]int) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()
	for k, v := range _temps {
		temps[k] = v
	}
	errs := make(map[string]error, len(_errors))
	for k, e := range _errors {
		errs[k] = e
	}
	return errs
}

// updateNvidiaMem copies data from the local _mems cache into the passed-in
// return-value map. It is called once per cycle by gotop.
//
// The returned map is a snapshot of the cached errors, copied while
// nvidiaLock is held. Handing out the shared _errors map itself would let
// the caller iterate over it after the lock is released, while the
// background job may be writing to it.
func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()
	for k, v := range _mems {
		mems[k] = v
	}
	errs := make(map[string]error, len(_errors))
	for k, e := range _errors {
		errs[k] = e
	}
	return errs
}

// updateNvidiaUsage copies data from the local _cpus cache into the passed-in
// return-value map. It is called once per cycle by gotop. The second
// argument is ignored by this plugin.
//
// The returned map is a snapshot of the cached errors, copied while
// nvidiaLock is held. Handing out the shared _errors map itself would let
// the caller iterate over it after the lock is released, while the
// background job may be writing to it.
func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()
	for k, v := range _cpus {
		cpus[k] = v
	}
	errs := make(map[string]error, len(_errors))
	for k, e := range _errors {
		errs[k] = e
	}
	return errs
}

// startNVidia is called once by gotop. When NVidia support is enabled it
// checks that the nvidia tool can be run, initializes the caches, registers
// the update*() functions, and forks a goroutine that calls the nvidia tool
// periodically to refresh the cached cpu, memory, and temperature values
// that the update*() functions return to gotop.
//
// The vars argument contains command-line arguments to allow the plugin
// to change runtime options; the only option currently supported is the
// `nvidia-refresh` arg, which is expected to be a positive time.Duration
// value and sets how frequently the nvidia tool is called to refresh the
// data. When the option is absent the refresh period is one second.
//
// It returns nil when NVidia support is disabled or when startup succeeded.
// It returns an error when `nvidia-smi -L` cannot be run, or when
// `nvidia-refresh` is not a valid, positive duration; in those cases nothing
// is registered and no background job is started.
func startNVidia(vars map[string]string) error {
	if !nvidia {
		return nil
	}
	if _, err := exec.Command("nvidia-smi", "-L").Output(); err != nil {
		return fmt.Errorf("NVidia GPU error: %w", err)
	}

	// Get the refresh period from the passed-in command-line/config file
	// options. This is validated before anything is registered so that a
	// bad option does not leave update functions registered for a plugin
	// that failed to start.
	refresh := time.Second
	if v, ok := vars["nvidia-refresh"]; ok {
		d, err := time.ParseDuration(v)
		if err != nil {
			return err
		}
		// time.Tick returns a nil channel for non-positive durations, which
		// would leave the caches frozen after the first update.
		if d <= 0 {
			return errors.New("NVidia GPU error: nvidia-refresh must be a positive duration, got " + v)
		}
		refresh = d
	}

	_temps = make(map[string]int)
	_mems = make(map[string]MemoryInfo)
	_cpus = make(map[string]int)
	_errors = make(map[string]error)
	RegisterTemp(updateNvidiaTemp)
	RegisterMem(updateNvidiaMem)
	RegisterCPU(updateNvidiaUsage)

	// update once to populate the device names, for the widgets.
	update()
	// Fork off a long-running job to call the nvidia tool periodically,
	// parse out the values, and put them in the cache. The job runs until
	// the process exits, so the ticker is never stopped.
	go func() {
		for range time.Tick(refresh) {
			update()
		}
	}()
	return nil
}

// Caches for the output from the nvidia tool; the update*() functions pull
// from these and return the values to gotop when requested. The maps are
// allocated in startNVidia and filled in by update(), which keys each device
// as "<name>.<index>".
var (
	// _temps holds the parsed temperature.gpu value per device.
	_temps map[string]int
	// _mems holds the memory totals and usage per device.
	_mems map[string]MemoryInfo
	// _cpus holds the parsed utilization.gpu value per device.
	_cpus map[string]int
	// A cache of errors generated by the background job running the nvidia tool;
	// these errors are returned to gotop when it calls the update*() functions.
	// Per-device parse errors are stored under the device key; failures to run
	// the tool or read its output are stored under the key "nvidia".
	_errors map[string]error
)

// nvidiaLock is held by the update*() functions while they read the caches
// and by update() while it stores parsed rows into them.
var nvidiaLock sync.Mutex

// update calls the nvidia tool, parses the output, and caches the results
// in the various _* maps. The metric data parsed is: name, index,
// temperature.gpu, utilization.gpu, utilization.memory, memory.total,
// memory.free, memory.used
//
// If this function encounters an error calling `nvidia-smi`, it caches the
// error and returns immediately. We expect exec errors only when the tool
// isn't available, or when it fails for some reason; no exec error cases
// are recoverable. This does **not** stop the cache job; that will continue
// to run and continue to call update().
func update() {
	bs, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
		"--format=csv,noheader,nounits").Output()
	if err != nil {
		_errors["nvidia"] = err
		//bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206")
		return
	}
	csvReader := csv.NewReader(bytes.NewReader(bs))
	csvReader.TrimLeadingSpace = true
	records, err := csvReader.ReadAll()
	if err != nil {
		_errors["nvidia"] = err
		return
	}

	// Ensure we're not trying to modify the caches while they're being read by the update() functions.
	nvidiaLock.Lock()
	defer nvidiaLock.Unlock()
	// Errors during parsing are recorded, but do not stop parsing.
	for _, row := range records {
		// The name of the devices is the nvidia-smi "<name>.<index>"
		name := row[0] + "." + row[1]
		if _temps[name], err = strconv.Atoi(row[2]); err != nil {
			_errors[name] = err
		}
		if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
			_errors[name] = err
		}
		t, err := strconv.Atoi(row[4])
		if err != nil {
			_errors[name] = err
		}
		u, err := strconv.Atoi(row[5])
		if err != nil {
			_errors[name] = err
		}
		_mems[name] = MemoryInfo{
			Total:       1048576 * uint64(t),
			Used:        1048576 * uint64(u),
			UsedPercent: (float64(u) / float64(t)) * 100.0,
		}
	}
}

// nvidia is bound to the "nvidia" command-line flag in init(); startNVidia
// does nothing unless it is true.
var nvidia bool