Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

accel/ivpu: Perform engine reset instead of device recovery on TDR

Replace full device recovery on TDR timeout with per-context abort,
allowing individual context handling instead of resetting the entire
device.

Extend ivpu_jsm_reset_engine() to return the list of contexts impacted
by the engine reset and use that information to abort only the affected
contexts.

Only check for potentially faulty contexts when the engine reset was not
triggered by an MMU fault or a job completion error status. This prevents
misidentifying non-guilty contexts that happened to be running at the
time of the fault.

If the engine reset was triggered by a job completion timeout and it
marked no contexts, trigger full device recovery, as there is no way to
identify the guilty one.

Add an engine reset counter to debugfs to keep track of engine resets
for debugging/testing purposes.

Reviewed-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Karol Wachowski <karol.wachowski@linux.intel.com>
Link: https://patch.msgid.link/20260318093927.4080303-1-karol.wachowski@linux.intel.com

+92 -17
+12 -2
drivers/accel/ivpu/ivpu_debugfs.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * Copyright (C) 2020-2024 Intel Corporation 3 + * Copyright (C) 2020-2026 Intel Corporation 4 4 */ 5 5 6 6 #include <linux/debugfs.h> ··· 127 127 return 0; 128 128 } 129 129 130 + static int engine_reset_counter_show(struct seq_file *s, void *v) 131 + { 132 + struct ivpu_device *vdev = seq_to_ivpu(s); 133 + 134 + seq_printf(s, "%d\n", atomic_read(&vdev->pm->engine_reset_counter)); 135 + return 0; 136 + } 137 + 130 138 static const struct drm_debugfs_info vdev_debugfs_list[] = { 131 139 {"bo_list", bo_list_show, 0}, 132 140 {"fw_name", fw_name_show, 0}, ··· 145 137 {"reset_counter", reset_counter_show, 0}, 146 138 {"reset_pending", reset_pending_show, 0}, 147 139 {"firewall_irq_counter", firewall_irq_counter_show, 0}, 140 + {"engine_reset_counter", engine_reset_counter_show, 0}, 148 141 }; 149 142 150 143 static int dvfs_mode_get(void *data, u64 *dvfs_mode) ··· 361 352 static int ivpu_reset_engine_fn(void *data, u64 val) 362 353 { 363 354 struct ivpu_device *vdev = (struct ivpu_device *)data; 355 + struct vpu_jsm_msg resp; 364 356 365 - return ivpu_jsm_reset_engine(vdev, (u32)val); 357 + return ivpu_jsm_reset_engine(vdev, (u32)val, &resp); 366 358 } 367 359 368 360 DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n");
+1
drivers/accel/ivpu/ivpu_drv.c
··· 665 665 vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID; 666 666 atomic64_set(&vdev->unique_id_counter, 0); 667 667 atomic_set(&vdev->job_timeout_counter, 0); 668 + atomic_set(&vdev->faults_detected, 0); 668 669 xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); 669 670 xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); 670 671 xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
+2 -1
drivers/accel/ivpu/ivpu_drv.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 2 /* 3 - * Copyright (C) 2020-2025 Intel Corporation 3 + * Copyright (C) 2020-2026 Intel Corporation 4 4 */ 5 5 6 6 #ifndef __IVPU_DRV_H__ ··· 168 168 struct xarray submitted_jobs_xa; 169 169 struct ivpu_ipc_consumer job_done_consumer; 170 170 atomic_t job_timeout_counter; 171 + atomic_t faults_detected; 171 172 172 173 atomic64_t unique_id_counter; 173 174
+48 -2
drivers/accel/ivpu/ivpu_job.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * Copyright (C) 2020-2025 Intel Corporation 3 + * Copyright (C) 2020-2026 Intel Corporation 4 4 */ 5 5 6 6 #include <drm/drm_file.h> ··· 607 607 * status and ensure both are handled in the same way 608 608 */ 609 609 job->file_priv->has_mmu_faults = true; 610 + atomic_set(&vdev->faults_detected, 1); 610 611 queue_work(system_percpu_wq, &vdev->context_abort_work); 611 612 return true; 612 613 } ··· 1116 1115 ivpu_ipc_consumer_del(vdev, &vdev->job_done_consumer); 1117 1116 } 1118 1117 1118 + static int reset_engine_and_mark_faulty_contexts(struct ivpu_device *vdev) 1119 + { 1120 + u32 num_impacted_contexts; 1121 + struct vpu_jsm_msg resp; 1122 + int ret; 1123 + u32 i; 1124 + 1125 + ret = ivpu_jsm_reset_engine(vdev, 0, &resp); 1126 + if (ret) 1127 + return ret; 1128 + 1129 + /* 1130 + * If faults are detected, ignore guilty contexts from engine reset as NPU may not be stuck 1131 + * and could return currently running good context and faulty contexts are already marked 1132 + */ 1133 + if (atomic_cmpxchg(&vdev->faults_detected, 1, 0) == 1) 1134 + return 0; 1135 + 1136 + num_impacted_contexts = resp.payload.engine_reset_done.num_impacted_contexts; 1137 + 1138 + ivpu_warn_ratelimited(vdev, "Engine reset performed, impacted contexts: %u\n", 1139 + num_impacted_contexts); 1140 + 1141 + if (!in_range(num_impacted_contexts, 1, VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS - 1)) { 1142 + ivpu_pm_trigger_recovery(vdev, "Cannot determine guilty contexts"); 1143 + return -EIO; 1144 + } 1145 + 1146 + /* No faults detected, NPU likely got stuck. 
Mark returned contexts as guilty */ 1147 + guard(mutex)(&vdev->context_list_lock); 1148 + 1149 + for (i = 0; i < num_impacted_contexts; i++) { 1150 + u32 ssid = resp.payload.engine_reset_done.impacted_contexts[i].host_ssid; 1151 + struct ivpu_file_priv *file_priv = xa_load(&vdev->context_xa, ssid); 1152 + 1153 + if (file_priv) { 1154 + mutex_lock(&file_priv->lock); 1155 + file_priv->has_mmu_faults = true; 1156 + mutex_unlock(&file_priv->lock); 1157 + } 1158 + } 1159 + 1160 + return 0; 1161 + } 1162 + 1119 1163 void ivpu_context_abort_work_fn(struct work_struct *work) 1120 1164 { 1121 1165 struct ivpu_device *vdev = container_of(work, struct ivpu_device, context_abort_work); ··· 1173 1127 return; 1174 1128 1175 1129 if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) 1176 - if (ivpu_jsm_reset_engine(vdev, 0)) 1130 + if (reset_engine_and_mark_faulty_contexts(vdev)) 1177 1131 goto runtime_put; 1178 1132 1179 1133 mutex_lock(&vdev->context_list_lock);
+15 -4
drivers/accel/ivpu/ivpu_jsm_msg.c
··· 151 151 return ret; 152 152 } 153 153 154 - int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine) 154 + int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *resp) 155 155 { 156 156 struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_ENGINE_RESET }; 157 - struct vpu_jsm_msg resp; 158 157 int ret; 159 158 160 159 if (engine != VPU_ENGINE_COMPUTE) ··· 161 162 162 163 req.payload.engine_reset.engine_idx = engine; 163 164 164 - ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp, 165 + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, resp, 165 166 VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); 166 167 if (ret) { 167 168 ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret); 168 169 ivpu_pm_trigger_recovery(vdev, "Engine reset failed"); 170 + return ret; 169 171 } 170 172 171 - return ret; 173 + atomic_inc(&vdev->pm->engine_reset_counter); 174 + 175 + return 0; 172 176 } 173 177 174 178 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id) ··· 556 554 } 557 555 558 556 int ivpu_jsm_state_dump(struct ivpu_device *vdev) 557 + { 558 + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP }; 559 + struct vpu_jsm_msg resp; 560 + 561 + return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_STATE_DUMP_RSP, &resp, 562 + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); 563 + } 564 + 565 + int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev) 559 566 { 560 567 struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP }; 561 568
+2 -1
drivers/accel/ivpu/ivpu_jsm_msg.h
··· 14 14 u64 jobq_base, u32 jobq_size); 15 15 int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id); 16 16 int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat); 17 - int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine); 17 + int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine, struct vpu_jsm_msg *response); 18 18 int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id); 19 19 int ivpu_jsm_dyndbg_control(struct ivpu_device *vdev, char *command, size_t size); 20 20 int ivpu_jsm_trace_get_capability(struct ivpu_device *vdev, u32 *trace_destination_mask, ··· 44 44 int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us); 45 45 int ivpu_jsm_dct_disable(struct ivpu_device *vdev); 46 46 int ivpu_jsm_state_dump(struct ivpu_device *vdev); 47 + int ivpu_jsm_state_dump_no_reply(struct ivpu_device *vdev); 47 48 48 49 #endif
+2 -1
drivers/accel/ivpu/ivpu_mmu.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * Copyright (C) 2020-2024 Intel Corporation 3 + * Copyright (C) 2020-2026 Intel Corporation 4 4 */ 5 5 6 6 #include <linux/circ_buf.h> ··· 964 964 file_priv = xa_load(&vdev->context_xa, ssid); 965 965 if (file_priv) { 966 966 if (!READ_ONCE(file_priv->has_mmu_faults)) { 967 + atomic_set(&vdev->faults_detected, 1); 967 968 ivpu_mmu_dump_event(vdev, event); 968 969 WRITE_ONCE(file_priv->has_mmu_faults, true); 969 970 }
+9 -6
drivers/accel/ivpu/ivpu_pm.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * Copyright (C) 2020-2024 Intel Corporation 3 + * Copyright (C) 2020-2026 Intel Corporation 4 4 */ 5 5 6 6 #include <linux/highmem.h> ··· 166 166 ivpu_pm_reset_begin(vdev); 167 167 168 168 if (!pm_runtime_status_suspended(vdev->drm.dev)) { 169 - ivpu_jsm_state_dump(vdev); 169 + ivpu_jsm_state_dump_no_reply(vdev); 170 170 ivpu_dev_coredump(vdev); 171 171 ivpu_suspend(vdev); 172 172 } ··· 205 205 206 206 if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) { 207 207 ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n"); 208 - goto recovery; 208 + goto abort; 209 209 } 210 210 211 211 inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms); 212 212 if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) { 213 213 ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n", 214 214 inference_max_retries); 215 - goto recovery; 215 + goto abort; 216 216 } 217 217 218 218 vdev->fw->last_heartbeat = heartbeat; 219 219 ivpu_start_job_timeout_detection(vdev); 220 220 return; 221 221 222 - recovery: 222 + abort: 223 223 atomic_set(&vdev->job_timeout_counter, 0); 224 - ivpu_pm_trigger_recovery(vdev, "TDR"); 224 + ivpu_jsm_state_dump(vdev); 225 + ivpu_dev_coredump(vdev); 226 + queue_work(system_percpu_wq, &vdev->context_abort_work); 225 227 } 226 228 227 229 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev) ··· 406 404 init_rwsem(&pm->reset_lock); 407 405 atomic_set(&pm->reset_pending, 0); 408 406 atomic_set(&pm->reset_counter, 0); 407 + atomic_set(&pm->engine_reset_counter, 0); 409 408 410 409 INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work); 411 410 INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
+1
drivers/accel/ivpu/ivpu_pm.h
··· 18 18 struct rw_semaphore reset_lock; 19 19 atomic_t reset_counter; 20 20 atomic_t reset_pending; 21 + atomic_t engine_reset_counter; 21 22 u8 dct_active_percent; 22 23 }; 23 24