src/extract_gpuinfo_ascend.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

/*
 * Copyright (C) 2023 klayer <klayer@163.com>
 *
 * This file is part of Nvtop and adapted from Ascend DCMI from Huawei Technologies Co., Ltd.
 *
 * Nvtop is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Nvtop is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with nvtop.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>

#include "ascend/dcmi_interface_api.h"
#include "list.h"
#include "nvtop/common.h"
#include "nvtop/extract_gpuinfo_common.h"

// Multiplier (1024 * 1024) applied to the HBM size/usage values reported by DCMI.
// NOTE(review): the name suggests a KB->GB conversion, but it is used as a
// multiplier; presumably DCMI reports MB and this yields bytes -- confirm.
#define KB_TO_GB (1024 * 1024)
// Return code that the DCMI calls in this file are compared against for success.
#define DCMI_SUCCESS 0
// List length passed to dcmi_get_card_list().
#define MAX_DEVICE_NUM 64
// NOTE(review): the two constants below are not referenced in the part of the
// file reviewed here; presumably they are used by the process enumeration code.
#define MAX_PROC_NUM 32
#define PROC_ALLOC_INC 16

// Return code of the most recent DCMI call made by this backend.
static int last_dcmi_return_status = DCMI_SUCCESS;
// NOTE(review): not referenced in the part of the file reviewed here.
static const char *unknown_error = "unknown Ascend DCMI error";
// Message handed out by gpuinfo_ascend_last_error_string(); empty when no error is recorded.
static const char *local_error_string = "";

// Per-device record for the Ascend backend.
struct gpu_info_ascend {
  // Generic device record shared with the rest of nvtop; the callbacks receive a pointer to this member.
  struct gpu_info base;
  // Link used to chain the record onto the `allocations` list.
  struct list_head allocate_list;
};

// Records registered by gpuinfo_ascend_get_device_handles(); every entry is
// passed to free() by gpuinfo_ascend_shutdown().
static LIST_HEAD(allocations);

// Forward declarations of the vendor callbacks so they can be referenced by
// the gpu_vendor_ascend table before their definitions.
static bool gpuinfo_ascend_init(void);
static void gpuinfo_ascend_shutdown(void);
static const char *gpuinfo_ascend_last_error_string(void);
static bool gpuinfo_ascend_get_device_handles(struct list_head *devices, unsigned *count);
static void gpuinfo_ascend_populate_static_info(struct gpu_info *_gpu_info);
static void gpuinfo_ascend_refresh_dynamic_info(struct gpu_info *_gpu_info);
static void gpuinfo_ascend_get_running_processes(struct gpu_info *_gpu_info);

// Helpers that store/retrieve a (card id, device id) pair in a device's pdev
// string using the textual form "<card>-<device>".
static void _encode_card_device_id_to_pdev(char *pdev, int card_id, int device_id);
static void _decode_card_device_id_from_pdev(const char *pdev, int *card_id, int *device_id);

// Callback table describing the Ascend backend. It is handed to
// register_gpu_vendor() by the constructor defined right after it, and each
// device record created by this backend points back to it through base.vendor.
struct gpu_vendor gpu_vendor_ascend = {
    .init = gpuinfo_ascend_init,
    .shutdown = gpuinfo_ascend_shutdown,
    .last_error_string = gpuinfo_ascend_last_error_string,
    .get_device_handles = gpuinfo_ascend_get_device_handles,
    .populate_static_info = gpuinfo_ascend_populate_static_info,
    .refresh_dynamic_info = gpuinfo_ascend_refresh_dynamic_info,
    .refresh_running_processes = gpuinfo_ascend_get_running_processes,
    .name = "Ascend",
};

// Registers the Ascend vendor table. The constructor attribute makes the
// toolchain run this automatically before main(), so no explicit call is needed.
__attribute__((constructor)) static void init_extract_gpuinfo_ascend(void) { register_gpu_vendor(&gpu_vendor_ascend); }

/*
 * Initialise the DCMI library.
 *
 * The return code of dcmi_init() is remembered in last_dcmi_return_status.
 * Returns true when that code equals DCMI_SUCCESS, false otherwise.
 */
static bool gpuinfo_ascend_init(void) {
  const int status = dcmi_init();

  last_dcmi_return_status = status;
  if (status != DCMI_SUCCESS) {
    return false;
  }
  return true;
}

static void gpuinfo_ascend_shutdown(void) {
  local_error_string = "";

  struct gpu_info_ascend *allocated, *tmp;
  list_for_each_entry_safe(allocated, tmp, &allocations, allocate_list) {
    list_del(&allocated->allocate_list);
    free(allocated);
  }
}

/*
 * Return the message describing the last error recorded by this backend.
 * The string is empty when no error has been recorded.
 */
static const char *gpuinfo_ascend_last_error_string(void) {
  const char *const message = local_error_string;
  return message;
}

/*
 * Enumerate the Ascend devices exposed by DCMI and append one record per
 * device to `devices`.
 *
 * devices: list that receives the `base.list` node of every discovered device.
 * count:   set to the number of records appended; only written on success.
 *
 * Returns true on success. On failure returns false, leaves `devices` and
 * `count` untouched and stores a message in local_error_string.
 *
 * The records are allocated as one array; its first element is chained onto
 * `allocations` so that gpuinfo_ascend_shutdown() can free the whole array.
 */
static bool gpuinfo_ascend_get_device_handles(struct list_head *devices, unsigned *count) {
  int num_cards = 0;
  int card_list[MAX_CARD_NUM] = {0};
  // Pass the real capacity of card_list so DCMI can never write past its end.
  const int card_list_len = (int)(sizeof(card_list) / sizeof(card_list[0]));

  last_dcmi_return_status = dcmi_get_card_list(&num_cards, card_list, card_list_len);
  if (last_dcmi_return_status != DCMI_SUCCESS) {
    local_error_string = "Failed to get card num";
    return false;
  }
  if (num_cards <= 0) {
    local_error_string = "Not found NPU(s)";
    return false;
  }
  // Never trust a reported count larger than what the buffer can hold.
  if (num_cards > card_list_len) {
    num_cards = card_list_len;
  }

  int num_devices = 0;
  int card_device_list[MAX_CARD_NUM] = {0};
  for (int i = 0; i < num_cards; ++i) {
    int num_card_devices = 0;
    last_dcmi_return_status = dcmi_get_device_num_in_card(card_list[i], &num_card_devices);
    if (last_dcmi_return_status != DCMI_SUCCESS) {
      local_error_string = "Failed to get device num of card";
      return false;
    }
    // A negative count would corrupt the total; treat it as "no device".
    if (num_card_devices < 0) {
      num_card_devices = 0;
    }
    num_devices += num_card_devices;
    card_device_list[i] = num_card_devices;
  }

  // Without this guard calloc(0, ...) could hand back a zero-sized block
  // whose first element is then written to below.
  if (num_devices == 0) {
    local_error_string = "Not found NPU(s)";
    return false;
  }

  struct gpu_info_ascend *gpu_infos = calloc((size_t)num_devices, sizeof(*gpu_infos));
  if (!gpu_infos) {
    local_error_string = strerror(errno);
    return false;
  }

  // Only the first element is tracked: it shares its address with the
  // allocation, which is what shutdown passes to free().
  list_add(&gpu_infos[0].allocate_list, &allocations);

  *count = 0;
  for (int i = 0; i < num_cards; ++i) {
    for (int j = 0; j < card_device_list[i]; ++j) {
      gpu_infos[*count].base.vendor = &gpu_vendor_ascend;
      // Encode the card id reported by DCMI, not its position in the list:
      // the later DCMI queries are addressed by card id.
      _encode_card_device_id_to_pdev(gpu_infos[*count].base.pdev, card_list[i], j);
      list_add_tail(&gpu_infos[*count].base.list, devices);
      *count += 1;
    }
  }

  return true;
}

/*
 * Write the pair (card_id, device_id) into `pdev` as the text
 * "<card>-<device>". Both ids are narrowed to `short` before formatting.
 * `pdev` must be large enough for the formatted text and its terminator.
 */
static void _encode_card_device_id_to_pdev(char *pdev, int card_id, int device_id) {
  const short card = (short)card_id;
  const short device = (short)device_id;

  sprintf(pdev, "%d-%d", card, device);
}

/*
 * Parse a "<card>-<device>" string (as produced by
 * _encode_card_device_id_to_pdev) back into its two ids.
 *
 * pdev:      NUL-terminated text to parse.
 * card_id:   receives the card id, or -1 when `pdev` is malformed.
 * device_id: receives the device id, or -1 when `pdev` is malformed.
 *
 * Both outputs are always written, so callers never read indeterminate
 * values even if the text cannot be parsed completely.
 */
static void _decode_card_device_id_from_pdev(const char *pdev, int *card_id, int *device_id) {
  int card = -1;
  int device = -1;

  // sscanf reports how many fields it converted; anything but 2 is a failure.
  if (sscanf(pdev, "%d-%d", &card, &device) != 2) {
    card = -1;
    device = -1;
  }

  *card_id = card;
  *device_id = device;
}

static void gpuinfo_ascend_populate_static_info(struct gpu_info *_gpu_info) {
  struct gpu_info_ascend *gpu_info = container_of(_gpu_info, struct gpu_info_ascend, base);
  struct gpuinfo_static_info *static_info = &gpu_info->base.static_info;
  static_info->integrated_graphics = false;
  RESET_ALL(static_info->valid);

  int card_id, device_id;
  _decode_card_device_id_from_pdev(_gpu_info->pdev, &card_id, &device_id);

  struct dcmi_chip_info *chip_info = malloc(sizeof(struct dcmi_chip_info));
  last_dcmi_return_status = dcmi_get_device_chip_info(card_id, device_id, chip_info);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    // assume Ascend only use ASCII code for chip name
    strcpy(static_info->device_name, (char *) chip_info->chip_name);
    SET_VALID(gpuinfo_device_name_valid, static_info->valid);
  }
  free(chip_info);
  // todo: it seems that other static infos are not supported by Ascend DCMI for now, will add if possible in future
}

static void gpuinfo_ascend_refresh_dynamic_info(struct gpu_info *_gpu_info) {
  struct gpu_info_ascend *gpu_info = container_of(_gpu_info, struct gpu_info_ascend, base);
  struct gpuinfo_dynamic_info *dynamic_info = &gpu_info->base.dynamic_info;
  RESET_ALL(dynamic_info->valid);
  dynamic_info->encode_decode_shared = false;

  int card_id, device_id;
  _decode_card_device_id_from_pdev(_gpu_info->pdev, &card_id, &device_id);

  unsigned aicore_freq;
  last_dcmi_return_status = dcmi_get_device_frequency(card_id, device_id, DCMI_FREQ_AICORE_CURRENT_, &aicore_freq);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->gpu_clock_speed = aicore_freq;
    SET_VALID(gpuinfo_gpu_clock_speed_valid, dynamic_info->valid);
  }

  unsigned aicore_max_freq;
  last_dcmi_return_status = dcmi_get_device_frequency(card_id, device_id, DCMI_FREQ_AICORE_MAX, &aicore_max_freq);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->gpu_clock_speed_max = aicore_max_freq;
    SET_VALID(gpuinfo_gpu_clock_speed_max_valid, dynamic_info->valid);
  }

  unsigned hbm_freq;
  last_dcmi_return_status = dcmi_get_device_frequency(card_id, device_id, DCMI_FREQ_HBM, &hbm_freq);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->mem_clock_speed = hbm_freq;
    SET_VALID(gpuinfo_mem_clock_speed_valid, dynamic_info->valid);
  }

  unsigned aicore_util_rate;
  last_dcmi_return_status = dcmi_get_device_utilization_rate(card_id, device_id, DCMI_UTILIZATION_RATE_AICORE, &aicore_util_rate);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->gpu_util_rate = aicore_util_rate;
    SET_VALID(gpuinfo_gpu_util_rate_valid, dynamic_info->valid);
  }

  struct dsmi_hbm_info_stru hbm_info;
  last_dcmi_return_status = dcmi_get_hbm_info(card_id, device_id, &hbm_info);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, hbm_info.memory_size * KB_TO_GB);
    SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, hbm_info.memory_usage * KB_TO_GB);
    SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, (hbm_info.memory_size - hbm_info.memory_usage) * KB_TO_GB);
    SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate, hbm_info.memory_usage * 100 / hbm_info.memory_size);
  }

  int device_temperature;
  last_dcmi_return_status = dcmi_get_device_temperature(card_id, device_id, &device_temperature);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->gpu_temp = device_temperature;
    SET_VALID(gpuinfo_gpu_temp_valid, dynamic_info->valid);
  }

  int power_usage;
  last_dcmi_return_status = dcmi_get_device_power_info(card_id, device_id, &power_usage);
  if (last_dcmi_return_status == DCMI_SUCCESS) {
    dynamic_info->power_draw = power_usage * 100;
    SET_VALID(gpuinfo_power_draw_valid, dynamic_info->valid);
  }
}

static void gpuinfo_ascend_get_running_processes(struct gpu_info *_gpu_info) {
  int card_id, device_id;
  _decode_card_device_id_from_pdev(_gpu_info->pdev, &card_id, &device_id);