Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'drm-xe-fixes-2026-03-05' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes

Cross-subsystem Changes:
- mm: Fix a hmm_range_fault() livelock / starvation problem (Thomas)

Core Changes:
- Revert "drm/pagemap: Disable device-to-device migration" (Thomas)

Driver Changes:
- Do not preempt fence signaling CS instructions (Brost)
- Some leak and finalization fixes (Shuicheng, Tomasz, Varun, Zhanjun)
- Workaround fix (Roper)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patch.msgid.link/aamGvvGRBRtX8-6u@intel.com

+100 -53
+2 -12
drivers/gpu/drm/drm_pagemap.c
···
		.start = start,
		.end = end,
		.pgmap_owner = pagemap->owner,
-		/*
-		 * FIXME: MIGRATE_VMA_SELECT_DEVICE_PRIVATE intermittently
-		 * causes 'xe_exec_system_allocator --r *race*no*' to trigger an
-		 * engine reset and a hard hang due to getting stuck on a folio
-		 * lock. This should work and needs to be root-caused. The only
-		 * downside of not selecting MIGRATE_VMA_SELECT_DEVICE_PRIVATE
-		 * is that device-to-device migrations won't work; instead,
-		 * memory will bounce through system memory. This path should be
-		 * rare and only occur when the madvise attributes of memory are
-		 * changed or atomics are being used.
-		 */
-		.flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT,
+		.flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT |
+			 MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
	};
	unsigned long i, npages = npages_in_range(start, end);
	unsigned long own_pages = 0, migrated_pages = 0;
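The revert restores MIGRATE_VMA_SELECT_DEVICE_PRIVATE to the selection mask, so pages already resident in another device's private memory become migration candidates again instead of bouncing through system memory. For orientation, a minimal sketch of the migrate_vma flow these selection flags feed into; vma, src_pfns, dst_pfns and owner are placeholder names, not drm_pagemap's actual locals:

#include <linux/migrate.h>

/* Sketch only: how MIGRATE_VMA_SELECT_* flags gate page collection. */
struct migrate_vma args = {
	.vma		= vma,
	.start		= start,
	.end		= end,
	.src		= src_pfns,		/* hypothetical pfn arrays */
	.dst		= dst_pfns,
	.pgmap_owner	= owner,
	.flags		= MIGRATE_VMA_SELECT_SYSTEM |
			  MIGRATE_VMA_SELECT_DEVICE_COHERENT |
			  MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
};
int err = migrate_vma_setup(&args);	/* collects only pages matching .flags */

if (err)
	return err;
/* ... allocate destination pages, fill args.dst, copy the data ... */
migrate_vma_pages(&args);		/* replace PTEs with the new pages */
migrate_vma_finalize(&args);		/* drop migration entries, wake waiters */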
+1
drivers/gpu/drm/xe/xe_configfs.c
···

	mutex_destroy(&dev->lock);

+	kfree(dev->config.ctx_restore_mid_bb[0].cs);
	kfree(dev->config.ctx_restore_post_bb[0].cs);
	kfree(dev);
}
+11 -12
drivers/gpu/drm/xe/xe_exec_queue.c
···
	return q;
}

+static void __xe_exec_queue_fini(struct xe_exec_queue *q)
+{
+	int i;
+
+	q->ops->fini(q);
+
+	for (i = 0; i < q->width; ++i)
+		xe_lrc_put(q->lrc[i]);
+}
+
static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags)
{
	int i, err;
···
	return 0;

err_lrc:
-	for (i = i - 1; i >= 0; --i)
-		xe_lrc_put(q->lrc[i]);
+	__xe_exec_queue_fini(q);
	return err;
-}
-
-static void __xe_exec_queue_fini(struct xe_exec_queue *q)
-{
-	int i;
-
-	q->ops->fini(q);
-
-	for (i = 0; i < q->width; ++i)
-		xe_lrc_put(q->lrc[i]);
}

struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
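Moving __xe_exec_queue_fini() above the init function lets the err_lrc path call the full teardown instead of hand-unwinding the LRC array by index. The pattern relies on teardown being safe on a partially constructed object, which the xe_lrc.h change below provides by making xe_lrc_put() NULL-tolerant. A generic sketch of the shape, with hypothetical obj/part names:

/* Sketch with hypothetical names: fini tolerates a partially
 * constructed object, so the init error path reuses it wholesale
 * instead of reverse-unwinding by index. */
static void obj_fini(struct obj *o)
{
	int i;

	for (i = 0; i < o->width; ++i)
		part_put(o->part[i]);		/* must accept NULL slots */
}

static int obj_init(struct obj *o)		/* o assumed kzalloc()ed */
{
	int i, err;

	for (i = 0; i < o->width; ++i) {
		err = part_get(&o->part[i]);
		if (err)
			goto err_fini;		/* untouched slots are NULL */
	}
	return 0;

err_fini:
	obj_fini(o);
	return err;
}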
+35 -8
drivers/gpu/drm/xe/xe_gsc_proxy.c
···
	return 0;
}

-static void xe_gsc_proxy_remove(void *arg)
+static void xe_gsc_proxy_stop(struct xe_gsc *gsc)
{
-	struct xe_gsc *gsc = arg;
	struct xe_gt *gt = gsc_to_gt(gsc);
	struct xe_device *xe = gt_to_xe(gt);
-
-	if (!gsc->proxy.component_added)
-		return;

	/* disable HECI2 IRQs */
	scoped_guard(xe_pm_runtime, xe) {
···
	}

	xe_gsc_wait_for_worker_completion(gsc);
+	gsc->proxy.started = false;
+}
+
+static void xe_gsc_proxy_remove(void *arg)
+{
+	struct xe_gsc *gsc = arg;
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	struct xe_device *xe = gt_to_xe(gt);
+
+	if (!gsc->proxy.component_added)
+		return;
+
+	/*
+	 * GSC proxy start is an async process that can be ongoing during
+	 * Xe module load/unload. Using devm managed action to register
+	 * xe_gsc_proxy_stop could cause issues if Xe module unload has
+	 * already started when the action is registered, potentially leading
+	 * to the cleanup being called at the wrong time. Therefore, instead
+	 * of registering a separate devm action to undo what is done in
+	 * proxy start, we call it from here, but only if the start has
+	 * completed successfully (tracked with the 'started' flag).
+	 */
+	if (gsc->proxy.started)
+		xe_gsc_proxy_stop(gsc);

	component_del(xe->drm.dev, &xe_gsc_proxy_component_ops);
	gsc->proxy.component_added = false;
···
 */
int xe_gsc_proxy_start(struct xe_gsc *gsc)
{
+	struct xe_gt *gt = gsc_to_gt(gsc);
	int err;

	/* enable the proxy interrupt in the GSC shim layer */
···
	 */
	err = xe_gsc_proxy_request_handler(gsc);
	if (err)
-		return err;
+		goto err_irq_disable;

	if (!xe_gsc_proxy_init_done(gsc)) {
-		xe_gt_err(gsc_to_gt(gsc), "GSC FW reports proxy init not completed\n");
-		return -EIO;
+		xe_gt_err(gt, "GSC FW reports proxy init not completed\n");
+		err = -EIO;
+		goto err_irq_disable;
	}

+	gsc->proxy.started = true;
	return 0;
+
+err_irq_disable:
+	gsc_proxy_irq_toggle(gsc, false);
+	return err;
}
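The block comment above documents the design choice: proxy start runs asynchronously, so hanging the stop off its own devm action could register it while unload is already in flight. Stop is instead folded into the existing remove callback and gated on the started flag. The devm shape being avoided would look roughly like this; xe_gsc_proxy_stop_action is a hypothetical wrapper, not code from this commit:

/* Hypothetical sketch of the rejected alternative: registering the
 * stop as its own devm action from within the async start path. devm
 * actions run in reverse registration order on device removal, but a
 * registration racing with unload can fire at the wrong time. */
static void xe_gsc_proxy_stop_action(void *arg)
{
	xe_gsc_proxy_stop(arg);
}

/* ...inside the async start path... */
err = devm_add_action_or_reset(xe->drm.dev, xe_gsc_proxy_stop_action, gsc);
if (err)
	return err;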
+2
drivers/gpu/drm/xe/xe_gsc_types.h
···
	struct mutex mutex;
	/** @proxy.component_added: whether the component has been added */
	bool component_added;
+	/** @proxy.started: whether the proxy has been started */
+	bool started;
	/** @proxy.bo: object to store message to and from the GSC */
	struct xe_bo *bo;
	/** @proxy.to_gsc: map of the memory used to send messages to the GSC */
+2 -1
drivers/gpu/drm/xe/xe_lrc.h
···
 */
static inline void xe_lrc_put(struct xe_lrc *lrc)
{
-	kref_put(&lrc->refcount, xe_lrc_destroy);
+	if (lrc)
+		kref_put(&lrc->refcount, xe_lrc_destroy);
}

/**
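The guard gives xe_lrc_put() kfree(NULL)-like semantics; without it, kref_put() would decrement a refcount through a near-NULL pointer. That is what makes the shared __xe_exec_queue_fini() above safe on a partially initialized queue. A usage sketch, assuming the lrc pointer array came from kzalloc():

/* Unwinding a zero-initialized pointer array needs no index
 * bookkeeping once the put is NULL-tolerant (sketch): */
for (i = 0; i < q->width; ++i)
	xe_lrc_put(q->lrc[i]);	/* no-op for slots that were never set */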
+3 -1
drivers/gpu/drm/xe/xe_reg_sr.c
···
	*pentry = *e;
	ret = xa_err(xa_store(&sr->xa, idx, pentry, GFP_KERNEL));
	if (ret)
-		goto fail;
+		goto fail_free;

	return 0;

+fail_free:
+	kfree(pentry);
fail:
	xe_gt_err(gt,
		  "discarding save-restore reg %04lx (clear: %08x, set: %08x, masked: %s, mcr: %s): ret=%d\n",
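The split label restores the canonical layered-unwind shape: failures before the allocation still jump to fail, while the xa_store() failure jumps to fail_free so the just-allocated entry is released. In general form, with hypothetical helper names:

/* Generic layered unwind (hypothetical store_entry()): each label
 * frees exactly what was live when the corresponding step failed. */
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
	ret = -ENOMEM;
	goto fail;		/* nothing of ours allocated yet */
}

ret = store_entry(p);
if (ret)
	goto fail_free;		/* p must be released on this path */

return 0;

fail_free:
	kfree(p);
fail:
	pr_err("discarding entry: %d\n", ret);
	return ret;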
+9
drivers/gpu/drm/xe/xe_ring_ops.c
···

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

+	/* Don't preempt fence signaling */
+	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
	if (job->user_fence.used) {
		i = emit_flush_dw(dw, i);
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
···

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

+	/* Don't preempt fence signaling */
+	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
	if (job->user_fence.used) {
		i = emit_flush_dw(dw, i);
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
···
			seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
+
+	/* Don't preempt fence signaling */
+	dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	i = emit_render_cache_flush(job, dw, i);
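MI_ARB_ON_OFF with MI_ARB_DISABLE turns off command-streamer arbitration, so once the batch buffer returns the engine cannot be preempted before the flush and fence/seqno writes land; this is the "Do not preempt fence signaling CS instructions" item from the commit message. The commit adds only the disable side here, relying on arbitration being re-enabled elsewhere in the ring. A full bracket would look roughly like this sketch, where MI_ARB_ENABLE is assumed from the MI command definitions and addr/value are placeholders:

/* Sketch of the bracket around a fence write (not the exact ring
 * layout; re-enable happens later in the real emission): */
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;	/* no preemption past this */
i = emit_flush_dw(dw, i);			/* flush before signaling */
i = emit_store_imm_ppgtt_posted(addr, value, dw, i); /* fence write lands */
dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;	/* preemptible again */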
+2 -1
drivers/gpu/drm/xe/xe_vm_madvise.c
···
				       madvise_range.num_vmas,
				       args->atomic.val)) {
			err = -EINVAL;
-			goto madv_fini;
+			goto free_vmas;
		}
	}
···
err_fini:
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);
+free_vmas:
	kfree(madvise_range.vmas);
	madvise_range.vmas = NULL;
madv_fini:
+7 -6
drivers/gpu/drm/xe/xe_wa.c
···

	{ XE_RTP_NAME("16025250150"),
	  XE_RTP_RULES(GRAPHICS_VERSION(2001)),
-	  XE_RTP_ACTIONS(SET(LSN_VC_REG2,
-			     LSN_LNI_WGT(1) |
-			     LSN_LNE_WGT(1) |
-			     LSN_DIM_X_WGT(1) |
-			     LSN_DIM_Y_WGT(1) |
-			     LSN_DIM_Z_WGT(1)))
+	  XE_RTP_ACTIONS(FIELD_SET(LSN_VC_REG2,
+				   LSN_LNI_WGT_MASK | LSN_LNE_WGT_MASK |
+				   LSN_DIM_X_WGT_MASK | LSN_DIM_Y_WGT_MASK |
+				   LSN_DIM_Z_WGT_MASK,
+				   LSN_LNI_WGT(1) | LSN_LNE_WGT(1) |
+				   LSN_DIM_X_WGT(1) | LSN_DIM_Y_WGT(1) |
+				   LSN_DIM_Z_WGT(1)))
	},

	/* Xe2_HPM */
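The workaround fix swaps SET for FIELD_SET: SET only ORs bits into the register, so stale bits inside the multi-bit WGT fields would survive, while FIELD_SET takes an explicit mask and clears the whole field before applying the new weights. The behavioral difference, sketched with hypothetical read/write accessors rather than the xe_rtp implementation:

/* Sketch with hypothetical accessors. */
u32 old = read_reg(reg);

/* SET: OR-only; bits already set inside the field leak through */
write_reg(reg, old | value);

/* FIELD_SET: clear the masked field first, then apply the value */
write_reg(reg, (old & ~mask) | value);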
+9 -1
include/linux/migrate.h
···

int migrate_huge_page_move_mapping(struct address_space *mapping,
		struct folio *dst, struct folio *src);
-void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
		__releases(ptl);
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
int folio_migrate_mapping(struct address_space *mapping,
···
static inline int set_movable_ops(const struct movable_operations *ops, enum pagetype type)
{
	return -ENOSYS;
+}
+
+static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+	__releases(ptl)
+{
+	WARN_ON_ONCE(1);
+
+	spin_unlock(ptl);
}

#endif /* CONFIG_MIGRATION */
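The rename reflects that the helper now serves device-private softleaf entries as well as migration entries, and the new !CONFIG_MIGRATION stub keeps callers buildable while honoring the __releases(ptl) contract: it must drop the lock before returning, and it WARNs because it should be unreachable without migration support. The caller contract, mirroring the mm/migrate.c hunks below, is that exactly one side drops the ptl:

/* Caller shape (mirrors mm/migrate.c below): the helper consumes the
 * held ptl on the wait path; the caller drops it on every other path. */
ptl = pmd_lock(mm, pmd);
if (!pmd_is_migration_entry(*pmd)) {
	spin_unlock(ptl);			/* caller's responsibility */
	return;
}
softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); /* drops ptl */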
+10 -5
mm/filemap.c
···

#ifdef CONFIG_MIGRATION
/**
- * migration_entry_wait_on_locked - Wait for a migration entry to be removed
- * @entry: migration swap entry.
+ * softleaf_entry_wait_on_locked - Wait for a migration entry or
+ * device_private entry to be removed.
+ * @entry: migration or device_private swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
- * Wait for a migration entry referencing the given page to be removed. This is
+ * Wait for a migration entry referencing the given page, or device_private
+ * entry referencing a device_private page to be unlocked. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
- * should be called while holding the ptl for the migration entry referencing
+ * should be called while holding the ptl for @entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
···
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
-void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
+void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
···
 * If a migration entry exists for the page the migration path must hold
 * a valid reference to the page, and it must take the ptl to remove the
 * migration entry. So the page is valid until the ptl is dropped.
+ * Similarly any path attempting to drop the last reference to a
+ * device-private page needs to grab the ptl to remove the device-private
+ * entry.
 */
spin_unlock(ptl);

+2 -1
mm/memory.c
···
		unlock_page(vmf->page);
		put_page(vmf->page);
	} else {
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		pte_unmap(vmf->pte);
+		softleaf_entry_wait_on_locked(entry, vmf->ptl);
	}
} else if (softleaf_is_hwpoison(entry)) {
	ret = VM_FAULT_HWPOISON;
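pte_unmap_unlock() is pte_unmap() plus a spin_unlock() of the ptl; the fault path now splits the two so the still-held ptl can be handed to the wait helper, which queues the waiter before the lock is dropped, closing the window in which the device-private entry could be freed underneath it. The two halves, spelled out:

/* The split, spelled out (same calls as the hunk above): */
pte_unmap(vmf->pte);				/* unmap the pte only */
softleaf_entry_wait_on_locked(entry, vmf->ptl);	/* queues a waiter, drops
						 * the ptl, then sleeps */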
+4 -4
mm/migrate.c
···
	if (!softleaf_is_migration(entry))
		goto out;

-	migration_entry_wait_on_locked(entry, ptl);
+	softleaf_entry_wait_on_locked(entry, ptl);
	return;
out:
	spin_unlock(ptl);
···
	 * If migration entry existed, safe to release vma lock
	 * here because the pgtable page won't be freed without the
	 * pgtable lock released. See comment right above pgtable
-	 * lock release in migration_entry_wait_on_locked().
+	 * lock release in softleaf_entry_wait_on_locked().
	 */
	hugetlb_vma_unlock_read(vma);
-	migration_entry_wait_on_locked(entry, ptl);
+	softleaf_entry_wait_on_locked(entry, ptl);
	return;
}
···
	ptl = pmd_lock(mm, pmd);
	if (!pmd_is_migration_entry(*pmd))
		goto unlock;
-	migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
+	softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
	return;
unlock:
	spin_unlock(ptl);
+1 -1
mm/migrate_device.c
··· 176 176 } 177 177 178 178 if (softleaf_is_migration(entry)) { 179 - migration_entry_wait_on_locked(entry, ptl); 179 + softleaf_entry_wait_on_locked(entry, ptl); 180 180 spin_unlock(ptl); 181 181 return -EAGAIN; 182 182 }