Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

accel/amdxdna: Support retrieving hardware context debug information

The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.

When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.

Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://patch.msgid.link/20260317044906.1513133-1-lizhi.hou@amd.com
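
As a rough illustration of the user-space path mentioned above, the sketch below walks the
hardware context array returned by the query IOCTL and prints the health fields that
aie2_pci.c fills in. This is only a sketch: the DRM_IOCTL_AMDXDNA_GET_ARRAY macro, the
DRM_AMDXDNA_HW_CONTEXT_ALL parameter, the uapi header path, and the write-back of the
returned entry count are assumptions, not taken from this patch, and the
txn_op_idx/ctx_pc/fatal_error_* members are assumed to come from the matching UAPI change.

	/* Hypothetical user-space sketch; IOCTL and parameter names are assumptions. */
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>

	#include <drm/amdxdna_accel.h>	/* assumed uapi header path */

	#define MAX_CTX 16

	static int dump_hwctx_health(int drm_fd)
	{
		struct amdxdna_drm_hwctx_entry entries[MAX_CTX];
		struct amdxdna_drm_get_array args = {
			.param = DRM_AMDXDNA_HW_CONTEXT_ALL,	/* assumed parameter name */
			.element_size = sizeof(entries[0]),
			.num_element = MAX_CTX,
			.buffer = (uintptr_t)entries,
		};
		uint32_t i;

		memset(entries, 0, sizeof(entries));
		if (ioctl(drm_fd, DRM_IOCTL_AMDXDNA_GET_ARRAY, &args))	/* assumed macro name */
			return -1;

		/* Assumes the kernel updates num_element to the count actually returned */
		for (i = 0; i < args.num_element; i++)
			printf("hwctx %u: txn_op_idx 0x%x ctx_pc 0x%x fatal_error_type 0x%x\n",
			       i, entries[i].txn_op_idx, entries[i].ctx_pc,
			       entries[i].fatal_error_type);

		return 0;
	}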

+213 -11
+77 -8
drivers/accel/amdxdna/aie2_ctx.c
···
 
 #define HWCTX_MAX_TIMEOUT	60000 /* milliseconds */
 
+struct aie2_ctx_health {
+	struct amdxdna_ctx_health header;
+	u32 txn_op_idx;
+	u32 ctx_pc;
+	u32 fatal_error_type;
+	u32 fatal_error_exception_type;
+	u32 fatal_error_exception_pc;
+	u32 fatal_error_app_module;
+};
+
 static void aie2_job_release(struct kref *ref)
 {
 	struct amdxdna_sched_job *job;
···
 	wake_up(&job->hwctx->priv->job_free_wq);
 	if (job->out_fence)
 		dma_fence_put(job->out_fence);
+	kfree(job->aie2_job_health);
 	kfree(job);
 }
···
 	aie2_job_put(job);
 }
 
+static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
+{
+	struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
+	struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+	struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+	struct app_health_report *report = job->aie2_job_health;
+	u32 fail_cmd_idx = 0;
+
+	if (!report)
+		goto set_timeout;
+
+	XDNA_ERR(xdna, "Firmware timeout state capture:");
+	XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+	XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+	XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
+	XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+	XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+	XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+	XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
+	XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
+	XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
+	XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
+	XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
+	XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
+
+	fail_cmd_idx = report->run_list_id;
+	aie2_health = kzalloc_obj(*aie2_health);
+	if (!aie2_health)
+		goto set_timeout;
+
+	aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+	aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+	aie2_health->txn_op_idx = report->txn_op_id;
+	aie2_health->ctx_pc = report->ctx_pc;
+	aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+	aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
+	aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
+	aie2_health->fatal_error_app_module = report->fatal_info.app_module;
+
+set_timeout:
+	amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+			      aie2_health, sizeof(*aie2_health));
+}
+
 static int
 aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 {
···
 	cmd_abo = job->cmd_bo;
 
 	if (unlikely(job->job_timeout)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+		aie2_set_cmd_timeout(job);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely(!data) || unlikely(size != sizeof(u32))) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 		goto out;
 	}
···
 	if (status == AIE2_STATUS_SUCCESS)
 		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
 	else
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
 
 out:
 	aie2_sched_notify(job);
···
 	struct amdxdna_sched_job *job = handle;
 	struct amdxdna_gem_obj *cmd_abo;
 	struct amdxdna_dev *xdna;
+	u32 fail_cmd_idx = 0;
 	u32 fail_cmd_status;
-	u32 fail_cmd_idx;
 	u32 cmd_status;
 	int ret = 0;
 
 	cmd_abo = job->cmd_bo;
 
 	if (unlikely(job->job_timeout)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+		aie2_set_cmd_timeout(job);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
-		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 		goto out;
 	}
···
 		 fail_cmd_idx, fail_cmd_status);
 
 	if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 	} else {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
 	}
 
 out:
···
 {
 	struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
 	struct amdxdna_hwctx *hwctx = job->hwctx;
+	struct app_health_report *report;
 	struct amdxdna_dev *xdna;
+	int ret;
 
 	xdna = hwctx->client->xdna;
 	trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
 	job->job_timeout = true;
+
 	mutex_lock(&xdna->dev_lock);
+	report = kzalloc_obj(*report);
+	if (!report)
+		goto reset_hwctx;
+
+	ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+	if (ret)
+		kfree(report);
+	else
+		job->aie2_job_health = report;
+
+reset_hwctx:
 	aie2_hwctx_stop(xdna, hwctx, sched_job);
 
 	aie2_hwctx_restart(xdna, hwctx);
+41
drivers/accel/amdxdna/aie2_message.c
···
 
 	return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
 }
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+			  struct app_health_report *report)
+{
+	DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+	struct amdxdna_dev *xdna = ndev->xdna;
+	struct app_health_report *buf;
+	dma_addr_t dma_addr;
+	u32 buf_size;
+	int ret;
+
+	if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
+		XDNA_DBG(xdna, "App health feature not supported");
+		return -EOPNOTSUPP;
+	}
+
+	buf_size = sizeof(*report);
+	buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
+	if (IS_ERR(buf)) {
+		XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+		return PTR_ERR(buf);
+	}
+
+	req.buf_addr = dma_addr;
+	req.context_id = context_id;
+	req.buf_size = buf_size;
+
+	drm_clflush_virt_range(buf, sizeof(*report));
+	ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+	if (ret) {
+		XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
+		goto free_buf;
+	}
+
+	/* Copy the report to caller's buffer */
+	memcpy(report, buf, sizeof(*report));
+
+free_buf:
+	aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+	return ret;
+}
+52
drivers/accel/amdxdna/aie2_msg_priv.h
···
 	MSG_OP_SET_RUNTIME_CONFIG = 0x10A,
 	MSG_OP_GET_RUNTIME_CONFIG = 0x10B,
 	MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
+	MSG_OP_GET_APP_HEALTH = 0x114,
 	MSG_OP_MAX_DRV_OPCODE,
 	MSG_OP_GET_PROTOCOL_VERSION = 0x301,
 	MSG_OP_MAX_OPCODE
···
 
 struct config_debug_bo_resp {
 	enum aie2_msg_status status;
+} __packed;
+
+struct fatal_error_info {
+	__u32 fatal_type; /* Fatal error type */
+	__u32 exception_type; /* Only valid if fatal_type is a specific value */
+	__u32 exception_argument; /* Argument based on exception type */
+	__u32 exception_pc; /* Program Counter at the time of the exception */
+	__u32 app_module; /* Error module name */
+	__u32 task_index; /* Index of the task in which the error occurred */
+	__u32 reserved[128];
+};
+
+struct app_health_report {
+	__u16 major;
+	__u16 minor;
+	__u32 size;
+	__u32 context_id;
+	/*
+	 * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+	 * application. Before execution begins or after successful completion, the value is set
+	 * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+	 * opcode's PC value.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+	 */
+	__u32 dpu_pc;
+	/*
+	 * Index of the last initiated TXN opcode.
+	 * Before execution starts or after successful completion, the value is set to UINT_MAX.
+	 * If execution halts prematurely due to an error, this field retains the opcode's ID.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+	 */
+	__u32 txn_op_id;
+	/* The PC of the context at the time of the report */
+	__u32 ctx_pc;
+	struct fatal_error_info fatal_info;
+	/* Index of the most recently executed run list entry. */
+	__u32 run_list_id;
+};
+
+struct get_app_health_req {
+	__u32 context_id;
+	__u32 buf_size;
+	__u64 buf_addr;
+} __packed;
+
+struct get_app_health_resp {
+	enum aie2_msg_status status;
+	__u32 required_buffer_size;
+	__u32 reserved[7];
 } __packed;
 #endif /* _AIE2_MSG_PRIV_H_ */
+14
drivers/accel/amdxdna/aie2_pci.c
···
 	struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
 	struct amdxdna_drm_get_array *array_args = arg;
 	struct amdxdna_drm_hwctx_entry __user *buf;
+	struct app_health_report report;
+	struct amdxdna_dev_hdl *ndev;
 	u32 size;
+	int ret;
 
 	if (!array_args->num_element)
 		return -EINVAL;
···
 	tmp->latency = hwctx->qos.latency;
 	tmp->frame_exec_time = hwctx->qos.frame_exec_time;
 	tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+	ndev = hwctx->client->xdna->dev_handle;
+	ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+	if (!ret) {
+		/* Fill in app health report fields */
+		tmp->txn_op_idx = report.txn_op_id;
+		tmp->ctx_pc = report.ctx_pc;
+		tmp->fatal_error_type = report.fatal_info.fatal_type;
+		tmp->fatal_error_exception_type = report.fatal_info.exception_type;
+		tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+		tmp->fatal_error_app_module = report.fatal_info.app_module;
+	}
 
 	buf = u64_to_user_ptr(array_args->buffer);
 	size = min(sizeof(*tmp), array_args->element_size);
+5
drivers/accel/amdxdna/aie2_pci.h
···
 #include <linux/limits.h>
 #include <linux/semaphore.h>
 
+#include "aie2_msg_priv.h"
 #include "amdxdna_mailbox.h"
 
 #define AIE2_INTERVAL	20000 /* us */
···
 	AIE2_NPU_COMMAND,
 	AIE2_PREEMPT,
 	AIE2_TEMPORAL_ONLY,
+	AIE2_APP_HEALTH,
 	AIE2_FEATURE_MAX
 };
···
 	u32 min_minor;
 };
 
+#define AIE2_ALL_FEATURES GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
 #define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask)
 
 struct amdxdna_dev_priv {
···
 int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
 int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
 				struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+			  struct app_health_report *report);
 int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
+5 -1
drivers/accel/amdxdna/amdxdna_ctx.c
···
 
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
 			  struct amdxdna_sched_job *job, u32 cmd_idx,
-			  enum ert_cmd_state error_state)
+			  enum ert_cmd_state error_state,
+			  void *err_data, size_t size)
 {
 	struct amdxdna_client *client = job->hwctx->client;
 	struct amdxdna_cmd *cmd = abo->mem.kva;
···
 	}
 
 	memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+	if (err_data)
+		memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
+
 	if (cc)
 		amdxdna_gem_put_obj(abo);
 
+17 -1
drivers/accel/amdxdna/amdxdna_ctx.h
···
 	u32 prop_args[]; /* properties and regular kernel arguments */
 };
 
+#define AMDXDNA_CMD_CTX_HEALTH_V1 1
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0
+struct amdxdna_ctx_health {
+	u32 version;
+	u32 npu_gen;
+};
+
 /* Exec buffer command header format */
 #define AMDXDNA_CMD_STATE GENMASK(3, 0)
 #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
···
 	u32 result;
 };
 
+struct app_health_report;
+union amdxdna_job_priv {
+	struct app_health_report *aie2_health;
+};
+
 struct amdxdna_sched_job {
 	struct drm_sched_job base;
 	struct kref refcnt;
···
 	u64 seq;
 	struct amdxdna_drv_cmd *drv_cmd;
 	struct amdxdna_gem_obj *cmd_bo;
+	union amdxdna_job_priv priv;
 	size_t bo_cnt;
 	struct drm_gem_object *bos[] __counted_by(bo_cnt);
 };
+
+#define aie2_job_health priv.aie2_health
 
 static inline u32
 amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
···
 u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
 			  struct amdxdna_sched_job *job, u32 cmd_idx,
-			  enum ert_cmd_state error_state);
+			  enum ert_cmd_state error_state,
+			  void *err_data, size_t size);
 
 void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
 void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
+2 -1
drivers/accel/amdxdna/npu4_regs.c
···
 	{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
 	{ .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
 	{ .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
-	{ .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
+	{ .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+	{ .features = AIE2_ALL_FEATURES, .major = 7 },
 	{ 0 }
 };