package devices import ( "bytes" "encoding/csv" "errors" "fmt" "os/exec" "strconv" "sync" "time" ) // Set up variables and register this plug-in with the main code. // The functions Register*(f) tell gotop which of these plugin functions to // call to update data; the RegisterStartup() function sets the function // that gotop will call when everything else has been done and the plugin // should start collecting data. // // In this plugin, one call to the nvidia program returns *all* the data // we're looking for, but gotop will call each update function during each // cycle. This means that the nvidia program would be called 3 (or more) // times per update, which isn't very efficient. Therefore, we make this // code more complex to run a job in the background that runs the nvidia // tool periodically and puts the results into hashes; the update functions // then just sync data from those hashes into the return data. func init() { RegisterStartup(startNVidia) } // updateNvidiaTemp copies data from the local _temps cache into the passed-in // return-value map. It is called once per cycle by gotop. func updateNvidiaTemp(temps map[string]int) map[string]error { nvidiaLock.Lock() defer nvidiaLock.Unlock() for k, v := range _temps { temps[k] = v } return _errors } // updateNvidiaMem copies data from the local _mems cache into the passed-in // return-value map. It is called once per cycle by gotop. func updateNvidiaMem(mems map[string]MemoryInfo) map[string]error { nvidiaLock.Lock() defer nvidiaLock.Unlock() for k, v := range _mems { mems[k] = v } return _errors } // updateNvidiaUsage copies data from the local _cpus cache into the passed-in // return-value map. It is called once per cycle by gotop. func updateNvidiaUsage(cpus map[string]int, _ bool) map[string]error { nvidiaLock.Lock() defer nvidiaLock.Unlock() for k, v := range _cpus { cpus[k] = v } return _errors } // startNVidia is called once by gotop, and forks a thread to call the nvidia // tool periodically and update the cached cpu, memory, and temperature // values that are used by the update*() functions to return data to gotop. // // The vars argument contains command-line arguments to allow the plugin // to change runtime options; the only option currently supported is the // `nvidia-refresh` arg, which is expected to be a time.Duration value and // sets how frequently the nvidia tool is called to refresh the date. func startNVidia(vars map[string]string) error { if vars["nvidia"] != "true" { return nil } _, err := exec.Command("nvidia-smi", "-L").Output() if err != nil { return errors.New(fmt.Sprintf("NVidia GPU error: %s", err)) } _errors = make(map[string]error) _temps = make(map[string]int) _mems = make(map[string]MemoryInfo) _cpus = make(map[string]int) _errors = make(map[string]error) RegisterTemp(updateNvidiaTemp) RegisterMem(updateNvidiaMem) RegisterCPU(updateNvidiaUsage) nvidiaLock = sync.Mutex{} // Get the refresh period from the passed-in command-line/config // file options refresh := time.Second if v, ok := vars["nvidia-refresh"]; ok { if refresh, err = time.ParseDuration(v); err != nil { return err } } // update once to populate the device names, for the widgets. updateNvidia() // Fork off a long-running job to call the nvidia tool periodically, // parse out the values, and put them in the cache. go func() { timer := time.Tick(refresh) for range timer { updateNvidia() } }() return nil } // Caches for the output from the nvidia tool; the update() functions pull // from these and return the values to gotop when requested. var ( _temps map[string]int _mems map[string]MemoryInfo _cpus map[string]int // A cache of errors generated by the background job running the nvidia tool; // these errors are returned to gotop when it calls the update() functions. _errors map[string]error ) var nvidiaLock sync.Mutex // updateNvidia calls the nvidia tool, parses the output, and caches the results // in the various _* maps. The metric data parsed is: name, index, // temperature.gpu, utilization.gpu, utilization.memory, memory.total, // memory.free, memory.used // // If this function encounters an error calling `nvidia-smi`, it caches the // error and returns immediately. We expect exec errors only when the tool // isn't available, or when it fails for some reason; no exec error cases // are recoverable. This does **not** stop the cache job; that will continue // to run and continue to call updateNvidia(). func updateNvidia() { bs, err := exec.Command( "nvidia-smi", "--query-gpu=name,index,temperature.gpu,utilization.gpu,memory.total,memory.used", "--format=csv,noheader,nounits").Output() if err != nil { _errors["nvidia"] = err //bs = []byte("GeForce GTX 1080 Ti, 0, 31, 9, 11175, 206") return } csvReader := csv.NewReader(bytes.NewReader(bs)) csvReader.TrimLeadingSpace = true records, err := csvReader.ReadAll() if err != nil { _errors["nvidia"] = err return } // Ensure we're not trying to modify the caches while they're being read by the update() functions. nvidiaLock.Lock() defer nvidiaLock.Unlock() // Errors during parsing are recorded, but do not stop parsing. for _, row := range records { // The name of the devices is the nvidia-smi "." name := row[0] + "." + row[1] if _temps[name], err = strconv.Atoi(row[2]); err != nil { _errors[name] = err } if _cpus[name], err = strconv.Atoi(row[3]); err != nil { _errors[name] = err } t, err := strconv.Atoi(row[4]) if err != nil { _errors[name] = err } u, err := strconv.Atoi(row[5]) if err != nil { _errors[name] = err } _mems[name] = MemoryInfo{ Total: 1048576 * uint64(t), Used: 1048576 * uint64(u), UsedPercent: (float64(u) / float64(t)) * 100.0, } } }