// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2026 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

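/*
 * Reset all SW state (command queues, IPC, FW log) and reload the firmware
 * image so that the next boot starts from the cold boot entry point.
 */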
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->last_heartbeat = 0;

	ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}

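/*
 * Use the save/restore return address published by the FW in the boot
 * parameters as the warm boot entry point. If the FW has not provided one,
 * fall back to a full cold boot.
 */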
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);

	fw->warm_boot_entry_point = bp->save_restore_ret_address;
	if (!fw->warm_boot_entry_point) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

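/*
 * Power up the HW, enable the MMU and boot the FW. If a warm boot attempt
 * fails, power back down, switch to cold boot and retry; a cold boot
 * failure is reported to the caller.
 */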
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (ivpu_fw_is_warm_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

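/*
 * Enter the reset path: disable runtime PM, mark a reset as pending and
 * take the reset lock for write. Paired with ivpu_pm_reset_complete().
 */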
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

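/*
 * Leave the reset path: force a cold boot, abort all in-flight jobs, bring
 * the device back up and re-enable runtime PM. The runtime PM status is
 * updated to match the outcome of the resume.
 */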
static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

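/*
 * Recovery worker: capture FW state and a coredump while the device is
 * still powered, then perform a full reset and notify user space via a
 * uevent.
 */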
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump_no_reply(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

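/*
 * Schedule the recovery worker unless recovery is disabled or a reset is
 * already pending. The cmpxchg on reset_pending ensures that only one
 * recovery is queued at a time.
 */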
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_dfl_wq, &vdev->pm->recovery_work);
	}
}

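/*
 * TDR worker: called when a job exceeds the TDR timeout. As long as the FW
 * heartbeat keeps progressing and the accumulated wait stays below the
 * inference timeout, detection is re-armed; otherwise the job is aborted,
 * either through full recovery (OS scheduling mode) or through a context
 * abort after dumping FW state.
 */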
static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
					     vdev->timeout.inference;
	u64 inference_max_retries;
	u64 heartbeat;

	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
		goto abort;
	}

	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%llu) exceeded\n",
			 inference_max_retries);
		goto abort;
	}

	vdev->fw->last_heartbeat = heartbeat;
	ivpu_start_job_timeout_detection(vdev);
	return;

abort:
	atomic_set(&vdev->job_timeout_counter, 0);

	if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) {
		ivpu_pm_trigger_recovery(vdev, "Job timeout");
		return;
	}

	ivpu_jsm_state_dump(vdev);
	ivpu_dev_coredump(vdev);
	queue_work(system_percpu_wq, &vdev->context_abort_work);
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
			   msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}

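/*
 * System suspend callback: wait (up to the TDR timeout) for the NPU to go
 * idle, enter D0i3 and prepare a warm boot for resume.
 */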
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

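/*
 * Runtime (autosuspend) callback. Unlike system suspend this never returns
 * an error: D0i3 entry and shutdown failures are handled by forcing a cold
 * boot (with a coredump) on the next resume instead.
 */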
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

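/*
 * PCI reset callbacks: quiesce the device before the reset and rebuild
 * state (cold boot) afterwards, reusing the same begin/complete helpers as
 * the recovery path.
 */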
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

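/*
 * Set up PM state, the recovery worker and the TDR worker. A negative
 * autosuspend delay disables runtime autosuspend; this is used when the
 * debug-only disable_recovery parameter is set.
 */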
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);
	atomic_set(&pm->engine_reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

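/*
 * Duty Cycle Throttling (DCT): the requested active percentage is split
 * into active (D0) and inactive (D0i2) time slices within a fixed
 * DCT_PERIOD_US window and sent to the FW.
 */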
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

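/*
 * Handle a DCT request signaled by the HW: read whether throttling should
 * be enabled or disabled, apply it, and report the resulting duty cycle
 * back as a U1.7 fixed-point value (128 == 100%).
 */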
void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret) {
		/* Convert percent to U1.7 format */
		u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);

		ivpu_hw_btrs_dct_set_status(vdev, enable, val);
	}
}