Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

accel/amdxdna: Support getting last hardware error

Add a new parameter, DRM_AMDXDNA_HW_LAST_ASYNC_ERR, to the get array IOCTL.
When the hardware reports an error, the driver saves the error information
and a timestamp. This new get array parameter retrieves the last error.

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://lore.kernel.org/r/20251014234119.628453-1-lizhi.hou@amd.com

Lizhi Hou b291e4f1 83f81f54

+159 -19
+78 -17
drivers/accel/amdxdna/aie2_error.c
··· 13 13 14 14 #include "aie2_msg_priv.h" 15 15 #include "aie2_pci.h" 16 + #include "amdxdna_error.h" 16 17 #include "amdxdna_mailbox.h" 17 18 #include "amdxdna_pci_drv.h" 18 19 ··· 47 46 AIE_MEM_MOD = 0, 48 47 AIE_CORE_MOD, 49 48 AIE_PL_MOD, 49 + AIE_UNKNOWN_MOD, 50 50 }; 51 51 52 52 enum aie_error_category { ··· 145 143 EVENT_CATEGORY(74U, AIE_ERROR_LOCK), 146 144 }; 147 145 146 + static const enum amdxdna_error_num aie_cat_err_num_map[] = { 147 + [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION, 148 + [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP, 149 + [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM, 150 + [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS, 151 + [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS, 152 + [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, 153 + [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC, 154 + [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK, 155 + [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA, 156 + [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, 157 + [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN, 158 + }; 159 + 160 + static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1); 161 + 162 + static const enum amdxdna_error_module aie_err_mod_map[] = { 163 + [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY, 164 + [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE, 165 + [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL, 166 + [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN, 167 + }; 168 + 169 + static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1); 170 + 148 171 static enum aie_error_category 149 172 aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) 150 173 { ··· 203 176 if (event_id != lut[i].event_id) 204 177 continue; 205 178 179 + if (lut[i].category > AIE_ERROR_UNKNOWN) 180 + return AIE_ERROR_UNKNOWN; 181 + 206 182 return lut[i].category; 207 183 } 208 184 209 185 return AIE_ERROR_UNKNOWN; 186 + } 187 + 188 + static void aie2_update_last_async_error(struct 
amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) 189 + { 190 + struct aie_error *errs = err_info; 191 + enum amdxdna_error_module err_mod; 192 + enum aie_error_category aie_err; 193 + enum amdxdna_error_num err_num; 194 + struct aie_error *last_err; 195 + 196 + last_err = &errs[num_err - 1]; 197 + if (last_err->mod_type >= AIE_UNKNOWN_MOD) { 198 + err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN]; 199 + err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD]; 200 + } else { 201 + aie_err = aie_get_error_category(last_err->row, 202 + last_err->event_id, 203 + last_err->mod_type); 204 + err_num = aie_cat_err_num_map[aie_err]; 205 + err_mod = aie_err_mod_map[last_err->mod_type]; 206 + } 207 + 208 + ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod); 209 + ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real()); 210 + ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col); 210 211 } 211 212 212 213 static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) ··· 319 264 } 320 265 321 266 mutex_lock(&xdna->dev_lock); 267 + aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt); 268 + 322 269 /* Re-sent this event to firmware */ 323 270 if (aie2_error_event_send(e)) 324 271 XDNA_WARN(xdna, "Unable to register async event"); 325 272 mutex_unlock(&xdna->dev_lock); 326 - } 327 - 328 - int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev) 329 - { 330 - struct amdxdna_dev *xdna = ndev->xdna; 331 - struct async_event *e; 332 - int i, ret; 333 - 334 - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); 335 - for (i = 0; i < ndev->async_events->event_cnt; i++) { 336 - e = &ndev->async_events->event[i]; 337 - ret = aie2_error_event_send(e); 338 - if (ret) 339 - return ret; 340 - } 341 - 342 - return 0; 343 273 } 344 274 345 275 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) ··· 381 341 e->size = ASYNC_BUF_SIZE; 382 342 e->resp.status = MAX_AIE2_STATUS_CODE; 
383 343 INIT_WORK(&e->work, aie2_error_worker); 344 + 345 + ret = aie2_error_event_send(e); 346 + if (ret) 347 + goto free_wq; 384 348 } 385 349 386 350 ndev->async_events = events; ··· 393 349 events->event_cnt, events->size); 394 350 return 0; 395 351 352 + free_wq: 353 + destroy_workqueue(events->wq); 396 354 free_buf: 397 355 dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, 398 356 events->addr, DMA_FROM_DEVICE); 399 357 free_events: 400 358 kfree(events); 401 359 return ret; 360 + } 361 + 362 + int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args) 363 + { 364 + struct amdxdna_dev *xdna = ndev->xdna; 365 + 366 + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); 367 + 368 + args->num_element = 1; 369 + args->element_size = sizeof(ndev->last_async_err); 370 + if (copy_to_user(u64_to_user_ptr(args->buffer), 371 + &ndev->last_async_err, args->element_size)) 372 + return -EFAULT; 373 + 374 + return 0; 402 375 }
+3
drivers/accel/amdxdna/aie2_pci.c
··· 924 924 case DRM_AMDXDNA_HW_CONTEXT_ALL: 925 925 ret = aie2_query_ctx_status_array(client, args); 926 926 break; 927 + case DRM_AMDXDNA_HW_LAST_ASYNC_ERR: 928 + ret = aie2_get_array_async_error(xdna->dev_handle, args); 929 + break; 927 930 default: 928 931 XDNA_ERR(xdna, "Not supported request parameter %u", args->param); 929 932 ret = -EOPNOTSUPP;
+4 -1
drivers/accel/amdxdna/aie2_pci.h
··· 190 190 191 191 enum aie2_dev_status dev_status; 192 192 u32 hwctx_num; 193 + 194 + struct amdxdna_async_error last_async_err; 193 195 }; 194 196 195 197 #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \ ··· 255 253 /* aie2_error.c */ 256 254 int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev); 257 255 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev); 258 - int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev); 259 256 int aie2_error_async_msg_thread(void *data); 257 + int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, 258 + struct amdxdna_drm_get_array *args); 260 259 261 260 /* aie2_message.c */ 262 261 int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
+59
drivers/accel/amdxdna/amdxdna_error.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Copyright (C) 2025, Advanced Micro Devices, Inc. 4 + */ 5 + 6 + #ifndef _AMDXDNA_ERROR_H_ 7 + #define _AMDXDNA_ERROR_H_ 8 + 9 + #include <linux/bitfield.h> 10 + #include <linux/bits.h> 11 + 12 + #define AMDXDNA_ERR_DRV_AIE 4 13 + #define AMDXDNA_ERR_SEV_CRITICAL 3 14 + #define AMDXDNA_ERR_CLASS_AIE 2 15 + 16 + #define AMDXDNA_ERR_NUM_MASK GENMASK_U64(15, 0) 17 + #define AMDXDNA_ERR_DRV_MASK GENMASK_U64(23, 16) 18 + #define AMDXDNA_ERR_SEV_MASK GENMASK_U64(31, 24) 19 + #define AMDXDNA_ERR_MOD_MASK GENMASK_U64(39, 32) 20 + #define AMDXDNA_ERR_CLASS_MASK GENMASK_U64(47, 40) 21 + 22 + enum amdxdna_error_num { 23 + AMDXDNA_ERROR_NUM_AIE_SATURATION = 3, 24 + AMDXDNA_ERROR_NUM_AIE_FP, 25 + AMDXDNA_ERROR_NUM_AIE_STREAM, 26 + AMDXDNA_ERROR_NUM_AIE_ACCESS, 27 + AMDXDNA_ERROR_NUM_AIE_BUS, 28 + AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, 29 + AMDXDNA_ERROR_NUM_AIE_ECC, 30 + AMDXDNA_ERROR_NUM_AIE_LOCK, 31 + AMDXDNA_ERROR_NUM_AIE_DMA, 32 + AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, 33 + AMDXDNA_ERROR_NUM_UNKNOWN = 15, 34 + }; 35 + 36 + enum amdxdna_error_module { 37 + AMDXDNA_ERROR_MODULE_AIE_CORE = 3, 38 + AMDXDNA_ERROR_MODULE_AIE_MEMORY, 39 + AMDXDNA_ERROR_MODULE_AIE_SHIM, 40 + AMDXDNA_ERROR_MODULE_AIE_NOC, 41 + AMDXDNA_ERROR_MODULE_AIE_PL, 42 + AMDXDNA_ERROR_MODULE_UNKNOWN = 8, 43 + }; 44 + 45 + #define AMDXDNA_ERROR_ENCODE(err_num, err_mod) \ 46 + (FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) | \ 47 + FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) | \ 48 + FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \ 49 + FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) | \ 50 + FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE)) 51 + 52 + #define AMDXDNA_EXTRA_ERR_COL_MASK GENMASK_U64(7, 0) 53 + #define AMDXDNA_EXTRA_ERR_ROW_MASK GENMASK_U64(15, 8) 54 + 55 + #define AMDXDNA_EXTRA_ERR_ENCODE(row, col) \ 56 + (FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) | \ 57 + FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row)) 58 + 
59 + #endif /* _AMDXDNA_ERROR_H_ */
+2 -1
drivers/accel/amdxdna/amdxdna_pci_drv.c
··· 27 27 /* 28 28 * 0.0: Initial version 29 29 * 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY 30 + * 0.2: Support getting last hardware error 30 31 */ 31 32 #define AMDXDNA_DRIVER_MAJOR 0 32 - #define AMDXDNA_DRIVER_MINOR 1 33 + #define AMDXDNA_DRIVER_MINOR 2 33 34 34 35 /* 35 36 * Bind the driver based on (vendor_id, device_id) pair and later use the
+13
include/uapi/drm/amdxdna_accel.h
··· 523 523 __u32 pad; 524 524 }; 525 525 526 + /** 527 + * struct amdxdna_async_error - XDNA async error structure 528 + */ 529 + struct amdxdna_async_error { 530 + /** @err_code: Error code. */ 531 + __u64 err_code; 532 + /** @ts_us: Timestamp. */ 533 + __u64 ts_us; 534 + /** @ex_err_code: Extra error code */ 535 + __u64 ex_err_code; 536 + }; 537 + 526 538 #define DRM_AMDXDNA_HW_CONTEXT_ALL 0 539 + #define DRM_AMDXDNA_HW_LAST_ASYNC_ERR 2 527 540 528 541 /** 529 542 * struct amdxdna_drm_get_array - Get information array.