diff options
Diffstat (limited to 'devices/nvidia.go')
-rw-r--r-- | devices/nvidia.go | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/devices/nvidia.go b/devices/nvidia.go new file mode 100644 index 0000000..0e50dba --- /dev/null +++ b/devices/nvidia.go @@ -0,0 +1,184 @@ +package devices + +import ( + "bytes" + "encoding/csv" + "errors" + "fmt" + "os/exec" + "strconv" + "sync" + "time" + + "github.com/xxxserxxx/opflag" +) + +// Set up variables and register this plug-in with the main code. +// The functions Register*(f) tell gotop which of these plugin functions to +// call to update data; the RegisterStartup() function sets the function +// that gotop will call when everything else has been done and the plugin +// should start collecting data. +// +// In this plugin, one call to the nvidia program returns *all* the data +// we're looking for, but gotop will call each update function during each +// cycle. This means that the nvidia program would be called 3 (or more) +// times per update, which isn't very efficient. Therefore, we make this +// code more complex to run a job in the background that runs the nvidia +// tool periodically and puts the results into hashes; the update functions +// then just sync data from those hashes into the return data. +func init() { + opflag.BoolVarP(&nvidia, "nvidia", "", false, "Enable NVidia GPU support") + RegisterStartup(startNVidia) +} + +// updateNvidiaTemp copies data from the local _temps cache into the passed-in +// return-value map. It is called once per cycle by gotop. +func updateNvidiaTemp(temps map[string]int) map[string]error { + nvidiaLock.Lock() + defer nvidiaLock.Unlock() + for k, v := range _temps { + temps[k] = v + } + return _errors +} + +// updateNvidiaMem copies data from the local _mems cache into the passed-in +// return-value map. It is called once per cycle by gotop. +func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error { + nvidiaLock.Lock() + defer nvidiaLock.Unlock() + for k, v := range _mems { + mems[k] = v + } + return _errors +} + +// updateNvidiaUsage copies data from the local _cpus cache into the passed-in +// return-value map. It is called once per cycle by gotop. +func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error { + nvidiaLock.Lock() + defer nvidiaLock.Unlock() + for k, v := range _cpus { + cpus[k] = v + } + return _errors +} + +// startNVidia is called once by gotop, and forks a thread to call the nvidia +// tool periodically and update the cached cpu, memory, and temperature +// values that are used by the update*() functions to return data to gotop. +// +// The vars argument contains command-line arguments to allow the plugin +// to change runtime options; the only option currently supported is the +// `nvidia-refresh` arg, which is expected to be a time.Duration value and +// sets how frequently the nvidia tool is called to refresh the date. +func startNVidia(vars map[string]string) error { + if !nvidia { + return nil + } + _, err := exec.Command("nvidia-smi", "-L").Output() + if err != nil { + return errors.New(fmt.Sprintf("NVidia GPU error: %s", err)) + } + _errors = make(map[string]error) + _temps = make(map[string]int) + _mems = make(map[string]MemoryInfo) + _cpus = make(map[string]int) + _errors = make(map[string]error) + RegisterTemp(updateNvidiaTemp) + RegisterMem(updateNvidiaMem) + RegisterCPU(updateNvidiaUsage) + + nvidiaLock = sync.Mutex{} + // Get the refresh period from the passed-in command-line/config + // file options + refresh := time.Second + if v, ok := vars["nvidia-refresh"]; ok { + if refresh, err = time.ParseDuration(v); err != nil { + return err + } + } + // update once to populate the device names, for the widgets. + update() + // Fork off a long-running job to call the nvidia tool periodically, + // parse out the values, and put them in the cache. + go func() { + timer := time.Tick(refresh) + for range timer { + update() + } + }() + return nil +} + +// Caches for the output from the nvidia tool; the update() functions pull +// from these and return the values to gotop when requested. +var ( + _temps map[string]int + _mems map[string]MemoryInfo + _cpus map[string]int + // A cache of errors generated by the background job running the nvidia tool; + // these errors are returned to gotop when it calls the update() functions. + _errors map[string]error +) + +var nvidiaLock sync.Mutex + +// update calls the nvidia tool, parses the output, and caches the results +// in the various _* maps. The metric data parsed is: name, index, +// temperature.gpu, utilization.gpu, utilization.memory, memory.total, +// memory.free, memory.used +// +// If this function encounters an error calling `nvidia-smi`, it caches the +// error and returns immediately. We expect exec errors only when the tool +// isn't available, or when it fails for some reason; no exec error cases +// are recoverable. This does **not** stop the cache job; that will continue +// to run and continue to call update(). +func update() { + bs, err := exec.Command( + "nvidia-smi", + "--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used", + "--format=csv,noheader,nounits").Output() + if err != nil { + _errors["nvidia"] = err + //bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206") + return + } + csvReader := csv.NewReader(bytes.NewReader(bs)) + csvReader.TrimLeadingSpace = true + records, err := csvReader.ReadAll() + if err != nil { + _errors["nvidia"] = err + return + } + + // Ensure we're not trying to modify the caches while they're being read by the update() functions. + nvidiaLock.Lock() + defer nvidiaLock.Unlock() + // Errors during parsing are recorded, but do not stop parsing. + for _, row := range records { + // The name of the devices is the nvidia-smi "<name>.<index>" + name := row[0] + "." + row[1] + if _temps[name], err = strconv.Atoi(row[2]); err != nil { + _errors[name] = err + } + if _cpus[name], err = strconv.Atoi(row[3]); err != nil { + _errors[name] = err + } + t, err := strconv.Atoi(row[4]) + if err != nil { + _errors[name] = err + } + u, err := strconv.Atoi(row[5]) + if err != nil { + _errors[name] = err + } + _mems[name] = MemoryInfo{ + Total: 1048576 * uint64(t), + Used: 1048576 * uint64(u), + UsedPercent: (float64(u) / float64(t)) * 100.0, + } + } +} + +var nvidia bool |