diff options
author | Maxime Schmitt <maxime.schmitt91@gmail.com> | 2024-02-25 19:49:28 +0100 |
---|---|---|
committer | Maxime Schmitt <maxime.schmitt91@gmail.com> | 2024-02-25 21:51:08 +0100 |
commit | a7a00275f140789d67561a2c6bcb18f4a5d1249f (patch) | |
tree | 5abce5b0d5132cf371265738e17d26dd1e985b50 | |
parent | 4989f313284fbbe6e470c5bfb142acca770176b0 (diff) |
Support for recent NVML API
-rw-r--r-- | src/extract_gpuinfo_nvidia.c | 337 |
1 file changed, 263 insertions, 74 deletions
diff --git a/src/extract_gpuinfo_nvidia.c b/src/extract_gpuinfo_nvidia.c index 4bf7bae..541318e 100644 --- a/src/extract_gpuinfo_nvidia.c +++ b/src/extract_gpuinfo_nvidia.c @@ -71,6 +71,11 @@ static nvmlReturn_t (*nvmlDeviceGetMaxPcieLinkWidth)(nvmlDevice_t device, unsign typedef enum { NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, + NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, + NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4, + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5, + NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6, } nvmlTemperatureThresholds_t; static nvmlReturn_t (*nvmlDeviceGetTemperatureThreshold)(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, @@ -100,9 +105,18 @@ typedef struct { unsigned long long total; unsigned long long free; unsigned long long used; -} nvmlMemory_t; +} nvmlMemory_v1_t; -static nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_t *memory); +typedef struct { + unsigned int version; + unsigned long long total; + unsigned long long reserved; + unsigned long long free; + unsigned long long used; +} nvmlMemory_v2_t; + +static nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t device, nvmlMemory_v1_t *memory); +static nvmlReturn_t (*nvmlDeviceGetMemoryInfo_v2)(nvmlDevice_t device, nvmlMemory_v2_t *memory); static nvmlReturn_t (*nvmlDeviceGetCurrPcieLinkGeneration)(nvmlDevice_t device, unsigned int *currLinkGen); @@ -140,15 +154,56 @@ static nvmlReturn_t (*nvmlDeviceGetDecoderUtilization)(nvmlDevice_t device, unsi typedef struct { unsigned int pid; unsigned long long usedGpuMemory; - // unsigned int gpuInstanceId; // not supported by older NVIDIA drivers - // unsigned int computeInstanceId; // not supported by older NVIDIA drivers -} nvmlProcessInfo_t; +} nvmlProcessInfo_v1_t; -static nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses)(nvmlDevice_t device, unsigned int *infoCount, - nvmlProcessInfo_t *infos); +typedef struct { + 
unsigned int pid; + unsigned long long usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; +} nvmlProcessInfo_v2_t; -static nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t device, unsigned int *infoCount, - nvmlProcessInfo_t *infos); +typedef struct { + unsigned int pid; + unsigned long long usedGpuMemory; + unsigned int gpuInstanceId; + unsigned int computeInstanceId; + // This is present in https://github.com/NVIDIA/DCGM/blob/master/sdk/nvidia/nvml/nvml.h#L294 but not the latest driver nvml.h + // unsigned long long usedGpuCcProtectedMemory; +} nvmlProcessInfo_v3_t; + +static nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v1)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v1_t *infos); +static nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v2)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v2_t *infos); +static nvmlReturn_t (*nvmlDeviceGetGraphicsRunningProcesses_v3)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v3_t *infos); + +static nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v1)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v1_t *infos); +static nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v2)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v2_t *infos); +static nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses_v3)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v3_t *infos); + +static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses_v1)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v1_t *infos); +static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses_v2)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v2_t *infos); +static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses_v3)(nvmlDevice_t device, unsigned int *infoCount, + nvmlProcessInfo_v3_t *infos); + +// Common interface passing void* +static nvmlReturn_t 
(*nvmlDeviceGetGraphicsRunningProcesses[4])(nvmlDevice_t device, unsigned int *infoCount, + void *infos); +static nvmlReturn_t (*nvmlDeviceGetComputeRunningProcesses[4])(nvmlDevice_t device, unsigned int *infoCount, + void *infos); +static nvmlReturn_t (*nvmlDeviceGetMPSComputeRunningProcesses[4])(nvmlDevice_t device, unsigned int *infoCount, + void *infos); + +#define NVML_DEVICE_MIG_DISABLE 0x0 +#define NVML_DEVICE_MIG_ENABLE 0x1 +nvmlReturn_t (*nvmlDeviceGetMigMode)(nvmlDevice_t device, unsigned int *currentMode, unsigned int *pendingMode); static void *libnvidia_ml_handle; @@ -177,6 +232,7 @@ struct gpu_info_nvidia { struct list_head allocate_list; nvmlDevice_t gpuhandle; + bool isInMigMode; unsigned long long last_utilization_timestamp; }; @@ -286,8 +342,10 @@ static bool gpuinfo_nvidia_init(void) { if (!nvmlDeviceGetUtilizationRates) goto init_error_clean_exit; + // Get v2 and fallback to v1 + nvmlDeviceGetMemoryInfo_v2 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMemoryInfo_v2"); nvmlDeviceGetMemoryInfo = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMemoryInfo"); - if (!nvmlDeviceGetMemoryInfo) + if (!nvmlDeviceGetMemoryInfo_v2 && !nvmlDeviceGetMemoryInfo) goto init_error_clean_exit; nvmlDeviceGetCurrPcieLinkGeneration = dlsym(libnvidia_ml_handle, "nvmlDeviceGetCurrPcieLinkGeneration"); @@ -326,16 +384,49 @@ static bool gpuinfo_nvidia_init(void) { if (!nvmlDeviceGetDecoderUtilization) goto init_error_clean_exit; - nvmlDeviceGetGraphicsRunningProcesses = dlsym(libnvidia_ml_handle, "nvmlDeviceGetGraphicsRunningProcesses"); - if (!nvmlDeviceGetGraphicsRunningProcesses) + nvmlDeviceGetGraphicsRunningProcesses_v3 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetGraphicsRunningProcesses_v3"); + nvmlDeviceGetGraphicsRunningProcesses_v2 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetGraphicsRunningProcesses_v2"); + nvmlDeviceGetGraphicsRunningProcesses_v1 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetGraphicsRunningProcesses"); + if (!nvmlDeviceGetGraphicsRunningProcesses_v3 && 
!nvmlDeviceGetGraphicsRunningProcesses_v2 && + !nvmlDeviceGetGraphicsRunningProcesses_v1) goto init_error_clean_exit; - nvmlDeviceGetComputeRunningProcesses = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses"); - if (!nvmlDeviceGetComputeRunningProcesses) + nvmlDeviceGetGraphicsRunningProcesses[1] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v1; + nvmlDeviceGetGraphicsRunningProcesses[2] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v2; + nvmlDeviceGetGraphicsRunningProcesses[3] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetGraphicsRunningProcesses_v3; + + nvmlDeviceGetComputeRunningProcesses_v3 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses_v3"); + nvmlDeviceGetComputeRunningProcesses_v2 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses_v2"); + nvmlDeviceGetComputeRunningProcesses_v1 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetComputeRunningProcesses"); + if (!nvmlDeviceGetComputeRunningProcesses_v3 && !nvmlDeviceGetComputeRunningProcesses_v2 && + !nvmlDeviceGetComputeRunningProcesses_v1) goto init_error_clean_exit; - // This one might not be available + nvmlDeviceGetComputeRunningProcesses[1] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v1; + nvmlDeviceGetComputeRunningProcesses[2] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v2; + nvmlDeviceGetComputeRunningProcesses[3] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetComputeRunningProcesses_v3; + + // These functions were not available in older NVML libs; don't error if not present + nvmlDeviceGetMPSComputeRunningProcesses_v3 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMPSComputeRunningProcesses_v3"); + nvmlDeviceGetMPSComputeRunningProcesses_v2 = dlsym(libnvidia_ml_handle, 
"nvmlDeviceGetMPSComputeRunningProcesses_v2"); + nvmlDeviceGetMPSComputeRunningProcesses_v1 = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMPSComputeRunningProcesses"); + + nvmlDeviceGetMPSComputeRunningProcesses[1] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v1; + nvmlDeviceGetMPSComputeRunningProcesses[2] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v2; + nvmlDeviceGetMPSComputeRunningProcesses[3] = + (nvmlReturn_t(*)(nvmlDevice_t, unsigned int *, void *))nvmlDeviceGetMPSComputeRunningProcesses_v3; + + // These ones might not be available nvmlDeviceGetProcessUtilization = dlsym(libnvidia_ml_handle, "nvmlDeviceGetProcessUtilization"); + nvmlDeviceGetMigMode = dlsym(libnvidia_ml_handle, "nvmlDeviceGetMigMode"); last_nvml_return_status = nvmlInit(); if (last_nvml_return_status != NVML_SUCCESS) { @@ -514,13 +605,28 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) { SET_VALID(gpuinfo_decoder_rate_valid, dynamic_info->valid); // Device memory info (total,used,free) - nvmlMemory_t memory_info; - last_nvml_return_status = nvmlDeviceGetMemoryInfo(device, &memory_info); - if (last_nvml_return_status == NVML_SUCCESS) { - SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total); - SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used); - SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free); - SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total); + bool got_meminfo = false; + if (nvmlDeviceGetMemoryInfo_v2) { + nvmlMemory_v2_t memory_info; + memory_info.version = 2; + last_nvml_return_status = nvmlDeviceGetMemoryInfo_v2(device, &memory_info); + if (last_nvml_return_status == NVML_SUCCESS) { + got_meminfo = true; + SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total); + SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used); + 
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free); + SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total); + } + } + if (!got_meminfo && nvmlDeviceGetMemoryInfo) { + nvmlMemory_v1_t memory_info; + last_nvml_return_status = nvmlDeviceGetMemoryInfo(device, &memory_info); + if (last_nvml_return_status == NVML_SUCCESS) { + SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.total); + SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.used); + SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.free); + SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, memory_info.used * 100 / memory_info.total); + } } // Pcie generation used by the device @@ -562,6 +668,15 @@ static void gpuinfo_nvidia_refresh_dynamic_info(struct gpu_info *_gpu_info) { last_nvml_return_status = nvmlDeviceGetEnforcedPowerLimit(device, &dynamic_info->power_draw_max); if (last_nvml_return_status == NVML_SUCCESS) SET_VALID(gpuinfo_power_draw_max_valid, dynamic_info->valid); + + // MIG mode + if (nvmlDeviceGetMigMode) { + unsigned currentMode, pendingMode; + last_nvml_return_status = nvmlDeviceGetMigMode(device, &currentMode, &pendingMode); + if (last_nvml_return_status == NVML_SUCCESS) { + SET_GPUINFO_DYNAMIC(dynamic_info, multi_instance_mode, currentMode == NVML_DEVICE_MIG_ENABLE); + } + } } static void gpuinfo_nvidia_get_process_utilization(struct gpu_info_nvidia *gpu_info, unsigned num_processes_recovered, @@ -609,69 +724,143 @@ static void gpuinfo_nvidia_get_process_utilization(struct gpu_info_nvidia *gpu_i gpu_info->last_utilization_timestamp = newest_timestamp_candidate; free(samples); } + // Mark the ones w/o update since last sample period to 0% usage + for (unsigned j = 0; j < num_processes_recovered; ++j) { + if (!IS_VALID(gpuinfo_process_gpu_usage_valid, processes[j].valid)) + SET_GPUINFO_PROCESS(&processes[j], gpu_usage, 0); + if (!IS_VALID(gpuinfo_process_encode_usage_valid, processes[j].valid)) + 
SET_GPUINFO_PROCESS(&processes[j], encode_usage, 0); + if (!IS_VALID(gpuinfo_process_decode_usage_valid, processes[j].valid)) + SET_GPUINFO_PROCESS(&processes[j], decode_usage, 0); + } } static void gpuinfo_nvidia_get_running_processes(struct gpu_info *_gpu_info) { struct gpu_info_nvidia *gpu_info = container_of(_gpu_info, struct gpu_info_nvidia, base); nvmlDevice_t device = gpu_info->gpuhandle; + bool validProcessGathering = false; + for (unsigned version = 3; !validProcessGathering && version > 0; version--) { + // Get the size of the actual function being used + size_t sizeof_nvmlProcessInfo; + switch (version) { + case 3: + sizeof_nvmlProcessInfo = sizeof(nvmlProcessInfo_v3_t); + break; + case 2: + sizeof_nvmlProcessInfo = sizeof(nvmlProcessInfo_v2_t); + break; + default: + sizeof_nvmlProcessInfo = sizeof(nvmlProcessInfo_v1_t); + break; + } - _gpu_info->processes_count = 0; - static size_t array_size = 0; - static nvmlProcessInfo_t *retrieved_infos = NULL; - unsigned graphical_count = 0, compute_count = 0, recovered_count; -retry_query_graphical: - recovered_count = array_size; - last_nvml_return_status = nvmlDeviceGetGraphicsRunningProcesses(device, &recovered_count, retrieved_infos); - if (last_nvml_return_status == NVML_ERROR_INSUFFICIENT_SIZE) { - array_size += COMMON_PROCESS_LINEAR_REALLOC_INC; - retrieved_infos = reallocarray(retrieved_infos, array_size, sizeof(*retrieved_infos)); - if (!retrieved_infos) { - perror("Could not re-allocate memory: "); - exit(EXIT_FAILURE); + _gpu_info->processes_count = 0; + static size_t array_size = 0; + static char *retrieved_infos = NULL; + unsigned graphical_count = 0, compute_count = 0, recovered_count; + if (nvmlDeviceGetGraphicsRunningProcesses[version]) { + retry_query_graphical: + recovered_count = array_size; + last_nvml_return_status = + nvmlDeviceGetGraphicsRunningProcesses[version](device, &recovered_count, retrieved_infos); + if (last_nvml_return_status == NVML_ERROR_INSUFFICIENT_SIZE) { + array_size += 
COMMON_PROCESS_LINEAR_REALLOC_INC; + retrieved_infos = reallocarray(retrieved_infos, array_size, sizeof_nvmlProcessInfo); + if (!retrieved_infos) { + perror("Could not re-allocate memory: "); + exit(EXIT_FAILURE); + } + goto retry_query_graphical; + } + if (last_nvml_return_status == NVML_SUCCESS) { + validProcessGathering = true; + graphical_count = recovered_count; + } } - goto retry_query_graphical; - } - if (last_nvml_return_status == NVML_SUCCESS) { - graphical_count = recovered_count; - } -retry_query_compute: - recovered_count = array_size - graphical_count; - last_nvml_return_status = - nvmlDeviceGetComputeRunningProcesses(device, &recovered_count, retrieved_infos + graphical_count); - if (last_nvml_return_status == NVML_ERROR_INSUFFICIENT_SIZE) { - array_size += COMMON_PROCESS_LINEAR_REALLOC_INC; - retrieved_infos = reallocarray(retrieved_infos, array_size, sizeof(*retrieved_infos)); - if (!retrieved_infos) { - perror("Could not re-allocate memory: "); - exit(EXIT_FAILURE); + + if (nvmlDeviceGetComputeRunningProcesses[version]) { + retry_query_compute: + recovered_count = array_size - graphical_count; + last_nvml_return_status = nvmlDeviceGetComputeRunningProcesses[version]( + device, &recovered_count, retrieved_infos + graphical_count * sizeof_nvmlProcessInfo); + if (last_nvml_return_status == NVML_ERROR_INSUFFICIENT_SIZE) { + array_size += COMMON_PROCESS_LINEAR_REALLOC_INC; + retrieved_infos = reallocarray(retrieved_infos, array_size, sizeof_nvmlProcessInfo); + if (!retrieved_infos) { + perror("Could not re-allocate memory: "); + exit(EXIT_FAILURE); + } + goto retry_query_compute; + } + if (last_nvml_return_status == NVML_SUCCESS) { + validProcessGathering = true; + compute_count = recovered_count; + } } - goto retry_query_compute; - } - if (last_nvml_return_status == NVML_SUCCESS) { - compute_count = recovered_count; - } - _gpu_info->processes_count = graphical_count + compute_count; - if (_gpu_info->processes_count > 0) { - if 
(_gpu_info->processes_count > _gpu_info->processes_array_size) { - _gpu_info->processes_array_size = _gpu_info->processes_count + COMMON_PROCESS_LINEAR_REALLOC_INC; - _gpu_info->processes = - reallocarray(_gpu_info->processes, _gpu_info->processes_array_size, sizeof(*_gpu_info->processes)); - if (!_gpu_info->processes) { - perror("Could not allocate memory: "); - exit(EXIT_FAILURE); + if (nvmlDeviceGetMPSComputeRunningProcesses[version]) { + retry_query_compute_MPS: + recovered_count = array_size - graphical_count - compute_count; + last_nvml_return_status = nvmlDeviceGetMPSComputeRunningProcesses[version]( + device, &recovered_count, retrieved_infos + (graphical_count + compute_count) * sizeof_nvmlProcessInfo); + if (last_nvml_return_status == NVML_ERROR_INSUFFICIENT_SIZE) { + array_size += COMMON_PROCESS_LINEAR_REALLOC_INC; + retrieved_infos = reallocarray(retrieved_infos, array_size, sizeof_nvmlProcessInfo); + if (!retrieved_infos) { + perror("Could not re-allocate memory: "); + exit(EXIT_FAILURE); + } + goto retry_query_compute_MPS; + } + if (last_nvml_return_status == NVML_SUCCESS) { + validProcessGathering = true; + compute_count += recovered_count; } } - memset(_gpu_info->processes, 0, _gpu_info->processes_count * sizeof(*_gpu_info->processes)); - for (unsigned i = 0; i < graphical_count + compute_count; ++i) { - if (i < graphical_count) - _gpu_info->processes[i].type = gpu_process_graphical; - else - _gpu_info->processes[i].type = gpu_process_compute; - _gpu_info->processes[i].pid = retrieved_infos[i].pid; - _gpu_info->processes[i].gpu_memory_usage = retrieved_infos[i].usedGpuMemory; - SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[i].valid); + + if (!validProcessGathering) + continue; + + _gpu_info->processes_count = graphical_count + compute_count; + if (_gpu_info->processes_count > 0) { + if (_gpu_info->processes_count > _gpu_info->processes_array_size) { + _gpu_info->processes_array_size = _gpu_info->processes_count + 
COMMON_PROCESS_LINEAR_REALLOC_INC; + _gpu_info->processes = + reallocarray(_gpu_info->processes, _gpu_info->processes_array_size, sizeof(*_gpu_info->processes)); + if (!_gpu_info->processes) { + perror("Could not allocate memory: "); + exit(EXIT_FAILURE); + } + } + memset(_gpu_info->processes, 0, _gpu_info->processes_count * sizeof(*_gpu_info->processes)); + for (unsigned i = 0; i < graphical_count + compute_count; ++i) { + if (i < graphical_count) + _gpu_info->processes[i].type = gpu_process_graphical; + else + _gpu_info->processes[i].type = gpu_process_compute; + switch (version) { + case 2: { + nvmlProcessInfo_v2_t *pinfo = (nvmlProcessInfo_v2_t *)retrieved_infos; + _gpu_info->processes[i].pid = pinfo[i].pid; + _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory; + } break; + case 3: { + nvmlProcessInfo_v3_t *pinfo = (nvmlProcessInfo_v3_t *)retrieved_infos; + _gpu_info->processes[i].pid = pinfo[i].pid; + _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory; + } break; + default: { + nvmlProcessInfo_v1_t *pinfo = (nvmlProcessInfo_v1_t *)retrieved_infos; + _gpu_info->processes[i].pid = pinfo[i].pid; + _gpu_info->processes[i].gpu_memory_usage = pinfo[i].usedGpuMemory; + } break; + } + SET_VALID(gpuinfo_process_gpu_memory_usage_valid, _gpu_info->processes[i].valid); + } } } - gpuinfo_nvidia_get_process_utilization(gpu_info, _gpu_info->processes_count, _gpu_info->processes); + // If the GPU is in MIG mode; process utilization is not supported + if (!(IS_VALID(gpuinfo_multi_instance_mode_valid, gpu_info->base.dynamic_info.valid) && + !gpu_info->base.dynamic_info.multi_instance_mode)) + gpuinfo_nvidia_get_process_utilization(gpu_info, _gpu_info->processes_count, _gpu_info->processes); } |