Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2026 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "ivpu_trace.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG)
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");
#endif

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

static unsigned long ivpu_inference_timeout_ms;
module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->last_heartbeat = 0;

	ivpu_dbg(vdev, FW_BOOT, "Cold boot entry point 0x%llx", vdev->fw->cold_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_COLDBOOT;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem_bp);

	fw->warm_boot_entry_point = bp->save_restore_ret_address;
	if (!fw->warm_boot_entry_point) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Warm boot entry point 0x%llx", fw->warm_boot_entry_point);
	fw->next_boot_mode = VPU_BOOT_TYPE_WARMBOOT;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (ivpu_fw_is_warm_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}
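
/*
 * ivpu_pm_reset_begin() above and ivpu_pm_reset_complete() below bracket every
 * full device reset in this file (the recovery work and the PCI
 * reset_prepare/reset_done callbacks): reset_begin disables runtime PM, marks
 * a reset as pending and takes reset_lock for writing; reset_complete
 * cold-boots the firmware, aborts outstanding jobs, resumes the device and
 * re-enables runtime PM.
 */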

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump_no_reply(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_dfl_wq, &vdev->pm->recovery_work);
	}
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
	unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? ivpu_inference_timeout_ms :
					     vdev->timeout.inference;
	u64 inference_max_retries;
	u64 heartbeat;

	if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= vdev->fw->last_heartbeat) {
		ivpu_err(vdev, "Job timeout detected, heartbeat not progressed\n");
		goto abort;
	}

	inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
	if (atomic_fetch_inc(&vdev->job_timeout_counter) >= inference_max_retries) {
		ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) exceeded\n",
			 inference_max_retries);
		goto abort;
	}

	vdev->fw->last_heartbeat = heartbeat;
	ivpu_start_job_timeout_detection(vdev);
	return;

abort:
	atomic_set(&vdev->job_timeout_counter, 0);

	if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) {
		ivpu_pm_trigger_recovery(vdev, "Job timeout");
		return;
	}

	ivpu_jsm_state_dump(vdev);
	ivpu_dev_coredump(vdev);
	queue_work(system_percpu_wq, &vdev->context_abort_work);
}
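
/*
 * Worked example for the retry math in ivpu_job_timeout_work() (numbers are
 * illustrative, not driver defaults): with a 2000 ms TDR period and a 60000 ms
 * inference limit, inference_max_retries = DIV_ROUND_UP(60000, 2000) = 30, so
 * even a job whose heartbeat keeps advancing is aborted after roughly
 * 30 * 2000 ms without completing.
 */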

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_percpu_wq, &vdev->pm->job_timeout_work,
			   msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
	atomic_set(&vdev->job_timeout_counter, 0);
}

int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	trace_pm("suspend");
	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");
	trace_pm("suspend done");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("resume");
	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");
	trace_pm("resume done");

	return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	trace_pm("runtime suspend");
	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");
	trace_pm("runtime suspend done");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	trace_pm("runtime resume");
	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");
	trace_pm("runtime resume done");

	return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
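
/*
 * Minimal usage sketch for the two helpers above (compiled out; illustration
 * only, not part of the driver): hardware access paths are expected to
 * bracket their work with ivpu_rpm_get()/ivpu_rpm_put(). ivpu_touch_hw() is a
 * hypothetical callee used purely for illustration.
 */
#if 0
static int ivpu_rpm_usage_example(struct ivpu_device *vdev)
{
	int ret;

	ret = ivpu_rpm_get(vdev);	/* resume the NPU and take a usage reference */
	if (ret < 0)
		return ret;

	ret = ivpu_touch_hw(vdev);	/* hypothetical work done while powered */

	ivpu_rpm_put(vdev);		/* drop the reference and arm autosuspend */
	return ret;
}
#endif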

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);
	atomic_set(&pm->engine_reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}
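
/*
 * Note on the duty-cycle arithmetic in ivpu_pm_dct_enable() above and
 * ivpu_pm_irq_dct_work_fn() below: the requested percentage is split into
 * active/inactive microseconds of the DCT_PERIOD_US window, and the status
 * reported via ivpu_hw_btrs_dct_set_status() is in U1.7 fixed point, where
 * 128 represents 1.0; for example, 50% maps to
 * DIV_ROUND_CLOSEST(50 * 128, 100) = 64 and 100% maps to 128.
 */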

void ivpu_pm_irq_dct_work_fn(struct work_struct *work)
{
	struct ivpu_device *vdev = container_of(work, struct ivpu_device, irq_dct_work);
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret) {
		/* Convert percent to U1.7 format */
		u8 val = DIV_ROUND_CLOSEST(vdev->pm->dct_active_percent * 128, 100);

		ivpu_hw_btrs_dct_set_status(vdev, enable, val);
	}

}
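
/*
 * Minimal sketch of how the callbacks in this file are typically wired up by
 * the rest of the driver (compiled out; illustration only). The actual
 * registration lives elsewhere in the ivpu driver, and the structure names
 * below are placeholders, not the driver's own symbols.
 */
#if 0
static const struct dev_pm_ops ivpu_example_pm_ops = {
	SYSTEM_SLEEP_PM_OPS(ivpu_pm_suspend_cb, ivpu_pm_resume_cb)
	RUNTIME_PM_OPS(ivpu_pm_runtime_suspend_cb, ivpu_pm_runtime_resume_cb, NULL)
};

static const struct pci_error_handlers ivpu_example_err_handlers = {
	.reset_prepare = ivpu_pm_reset_prepare_cb,
	.reset_done = ivpu_pm_reset_done_cb,
};
#endif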