summary refs log tree commit diff stats
path: root/devices/nvidia.go
diff options
context:
space:
mode:
Diffstat (limited to 'devices/nvidia.go')
-rw-r--r-- devices/nvidia.go 184
1 files changed, 184 insertions, 0 deletions
diff --git a/devices/nvidia.go b/devices/nvidia.go
new file mode 100644
index 0000000..0e50dba
--- /dev/null
+++ b/devices/nvidia.go
@@ -0,0 +1,184 @@
+package devices
+
+import (
+ "bytes"
+ "encoding/csv"
+ "errors"
+ "fmt"
+ "os/exec"
+ "strconv"
+ "sync"
+ "time"
+
+ "github.com/xxxserxxx/opflag"
+)
+
+// init sets up variables and registers this plug-in with the main code: it
+// binds the `nvidia` boolean flag to the nvidia variable and registers
+// startNVidia as the plug-in's start-up function.
+//
+// The functions Register*(f) tell gotop which of these plugin functions to
+// call to update data; the RegisterStartup() function sets the function
+// that gotop will call when everything else has been done and the plugin
+// should start collecting data.
+//
+// In this plugin, one call to the nvidia program returns *all* the data
+// we're looking for, but gotop will call each update function during each
+// cycle. This means that the nvidia program would be called 3 (or more)
+// times per update, which isn't very efficient. Therefore, we make this
+// code more complex to run a job in the background that runs the nvidia
+// tool periodically and puts the results into maps; the update functions
+// then just sync data from those maps into the return data.
+func init() {
+ opflag.BoolVarP(&nvidia, "nvidia", "", false, "Enable NVidia GPU support")
+ RegisterStartup(startNVidia)
+}
+
+// updateNvidiaTemp copies the cached GPU temperatures from _temps into the
+// passed-in temps map, keyed by device name. It is called once per cycle by
+// gotop.
+//
+// The returned map is a snapshot of the cached errors, copied while
+// nvidiaLock is held. Returning _errors itself would hand the caller a map
+// that update() keeps writing to after the lock is released, which is a
+// data race on the map.
+func updateNvidiaTemp(temps map[string]int) map[string]error {
+	nvidiaLock.Lock()
+	defer nvidiaLock.Unlock()
+	for k, v := range _temps {
+		temps[k] = v
+	}
+	errs := make(map[string]error, len(_errors))
+	for k, v := range _errors {
+		errs[k] = v
+	}
+	return errs
+}
+
+// updateNvidiaMem copies the cached GPU memory figures from _mems into the
+// passed-in mems map, keyed by device name. It is called once per cycle by
+// gotop.
+//
+// The returned map is a snapshot of the cached errors, copied while
+// nvidiaLock is held. Returning _errors itself would hand the caller a map
+// that update() keeps writing to after the lock is released, which is a
+// data race on the map.
+func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error {
+	nvidiaLock.Lock()
+	defer nvidiaLock.Unlock()
+	for k, v := range _mems {
+		mems[k] = v
+	}
+	errs := make(map[string]error, len(_errors))
+	for k, v := range _errors {
+		errs[k] = v
+	}
+	return errs
+}
+
+// updateNvidiaUsage copies the cached GPU utilization values from _cpus
+// into the passed-in cpus map, keyed by device name. It is called once per
+// cycle by gotop. The second (bool) parameter is ignored.
+//
+// The returned map is a snapshot of the cached errors, copied while
+// nvidiaLock is held. Returning _errors itself would hand the caller a map
+// that update() keeps writing to after the lock is released, which is a
+// data race on the map.
+func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error {
+	nvidiaLock.Lock()
+	defer nvidiaLock.Unlock()
+	for k, v := range _cpus {
+		cpus[k] = v
+	}
+	errs := make(map[string]error, len(_errors))
+	for k, v := range _errors {
+		errs[k] = v
+	}
+	return errs
+}
+
+// startNVidia is called once by gotop, and forks a goroutine to call the
+// nvidia tool periodically and update the cached cpu, memory, and
+// temperature values that are used by the update*() functions to return
+// data to gotop.
+//
+// The vars argument contains command-line arguments to allow the plugin
+// to change runtime options; the only option currently supported is the
+// `nvidia-refresh` arg, which is expected to be a positive time.Duration
+// value and sets how frequently the nvidia tool is called to refresh the
+// data. When the option is absent the refresh period is one second.
+//
+// It returns nil without doing anything when NVidia support was not
+// enabled. It returns an error when `nvidia-smi -L` cannot be run, or when
+// `nvidia-refresh` does not parse as a duration or is not positive. In every
+// error case nothing has been registered with gotop and no goroutine has
+// been started.
+func startNVidia(vars map[string]string) error {
+	if !nvidia {
+		return nil
+	}
+	// Probe for the tool before doing anything else.
+	if _, err := exec.Command("nvidia-smi", "-L").Output(); err != nil {
+		return fmt.Errorf("NVidia GPU error: %w", err)
+	}
+
+	// Get the refresh period from the passed-in command-line/config file
+	// options. This is validated before any callback is registered so that
+	// a bad value does not leave the plugin half-initialised.
+	refresh := time.Second
+	if v, ok := vars["nvidia-refresh"]; ok {
+		d, err := time.ParseDuration(v)
+		if err != nil {
+			return err
+		}
+		// time.Tick returns a nil channel for non-positive durations, which
+		// would make the refresh loop below block forever without updating.
+		if d <= 0 {
+			return errors.New("nvidia-refresh must be a positive duration: " + v)
+		}
+		refresh = d
+	}
+
+	_errors = make(map[string]error)
+	_temps = make(map[string]int)
+	_mems = make(map[string]MemoryInfo)
+	_cpus = make(map[string]int)
+	RegisterTemp(updateNvidiaTemp)
+	RegisterMem(updateNvidiaMem)
+	RegisterCPU(updateNvidiaUsage)
+
+	// update once to populate the device names, for the widgets.
+	update()
+	// Fork off a long-running job to call the nvidia tool periodically,
+	// parse out the values, and put them in the cache. The job runs for the
+	// lifetime of the process.
+	go func() {
+		for range time.Tick(refresh) {
+			update()
+		}
+	}()
+	return nil
+}
+
+// Shared state for the nvidia plug-in. The background job started by
+// startNVidia fills the caches from the nvidia tool's output, and the
+// update*() functions copy from them when gotop asks for data. Cache keys
+// are device names of the form "<name>.<index>".
+var (
+	// nvidiaLock guards the caches below.
+	nvidiaLock sync.Mutex
+
+	// _temps holds the parsed temperature.gpu value per device.
+	_temps map[string]int
+	// _mems holds the memory totals and usage per device.
+	_mems map[string]MemoryInfo
+	// _cpus holds the parsed utilization.gpu value per device.
+	_cpus map[string]int
+	// _errors is a cache of errors generated by the background job running
+	// the nvidia tool; these errors are returned to gotop when it calls the
+	// update*() functions.
+	_errors map[string]error
+)
+
+// update calls the nvidia tool, parses the output, and caches the results
+// in the various _* maps. The metric data parsed is: name, index,
+// temperature.gpu, utilization.gpu, utilization.memory, memory.total,
+// memory.free, memory.used
+//
+// If this function encounters an error calling `nvidia-smi`, it caches the
+// error and returns immediately. We expect exec errors only when the tool
+// isn't available, or when it fails for some reason; no exec error cases
+// are recoverable. This does **not** stop the cache job; that will continue
+// to run and continue to call update().
+func update() {
+ bs, err := exec.Command(
+ "nvidia-smi",
+ "--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used",
+ "--format=csv,noheader,nounits").Output()
+ if err != nil {
+ _errors["nvidia"] = err
+ //bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206")
+ return
+ }
+ csvReader := csv.NewReader(bytes.NewReader(bs))
+ csvReader.TrimLeadingSpace = true
+ records, err := csvReader.ReadAll()
+ if err != nil {
+ _errors["nvidia"] = err
+ return
+ }
+
+ // Ensure we're not trying to modify the caches while they're being read by the update() functions.
+ nvidiaLock.Lock()
+ defer nvidiaLock.Unlock()
+ // Errors during parsing are recorded, but do not stop parsing.
+ for _, row := range records {
+ // The name of the devices is the nvidia-smi "<name>.<index>"
+ name := row[0] + "." + row[1]
+ if _temps[name], err = strconv.Atoi(row[2]); err != nil {
+ _errors[name] = err
+ }
+ if _cpus[name], err = strconv.Atoi(row[3]); err != nil {
+ _errors[name] = err
+ }
+ t, err := strconv.Atoi(row[4])
+ if err != nil {
+ _errors[name] = err
+ }
+ u, err := strconv.Atoi(row[5])
+ if err != nil {
+ _errors[name] = err
+ }
+ _mems[name] = MemoryInfo{
+ Total: 1048576 * uint64(t),
+ Used: 1048576 * uint64(u),
+ UsedPercent: (float64(u) / float64(t)) * 100.0,
+ }
+ }
+}
+
+// nvidia is bound to the `nvidia` command-line flag in init(); startNVidia
+// returns immediately without starting anything when it is false.
+var nvidia bool