Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 2677 lines 72 kB view raw
1// SPDX-License-Identifier: MIT 2/* 3 * Copyright © 2021 Intel Corporation 4 */ 5 6#include "xe_lrc.h" 7 8#include <generated/xe_wa_oob.h> 9 10#include <linux/ascii85.h> 11#include <linux/panic.h> 12 13#include "instructions/xe_mi_commands.h" 14#include "instructions/xe_gfxpipe_commands.h" 15#include "instructions/xe_gfx_state_commands.h" 16#include "regs/xe_engine_regs.h" 17#include "regs/xe_gt_regs.h" 18#include "regs/xe_lrc_layout.h" 19#include "xe_bb.h" 20#include "xe_bo.h" 21#include "xe_configfs.h" 22#include "xe_device.h" 23#include "xe_drm_client.h" 24#include "xe_exec_queue_types.h" 25#include "xe_gt.h" 26#include "xe_gt_printk.h" 27#include "xe_hw_fence.h" 28#include "xe_map.h" 29#include "xe_memirq.h" 30#include "xe_mmio.h" 31#include "xe_ring_ops.h" 32#include "xe_sriov.h" 33#include "xe_trace_lrc.h" 34#include "xe_vm.h" 35#include "xe_wa.h" 36 37#define LRC_VALID BIT_ULL(0) 38#define LRC_PRIVILEGE BIT_ULL(8) 39#define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3) 40#define LRC_LEGACY_64B_CONTEXT 3 41 42#define LRC_ENGINE_CLASS GENMASK_ULL(63, 61) 43#define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48) 44 45#define LRC_PPHWSP_SIZE SZ_4K 46#define LRC_INDIRECT_CTX_BO_SIZE SZ_4K 47#define LRC_INDIRECT_RING_STATE_SIZE SZ_4K 48 49#define LRC_PRIORITY GENMASK_ULL(10, 9) 50#define LRC_PRIORITY_LOW 0 51#define LRC_PRIORITY_NORMAL 1 52#define LRC_PRIORITY_HIGH 2 53 54/* 55 * Layout of the LRC and associated data allocated as 56 * lrc->bo: 57 * 58 * Region Size 59 * +============================+=================================+ <- __xe_lrc_ring_offset() 60 * | Ring | ring_size, see | 61 * | | xe_lrc_init() | 62 * +============================+=================================+ <- __xe_lrc_pphwsp_offset() 63 * | PPHWSP (includes SW state) | 4K | 64 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset() 65 * | Engine Context Image | n * 4K, see | 66 * | | xe_gt_lrc_size() | 67 * 
+----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset() 68 * | Indirect Ring State Page | 0 or 4k, see | 69 * | | XE_LRC_FLAG_INDIRECT_RING_STATE | 70 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset() 71 * | Indirect Context Page | 0 or 4k, see | 72 * | | XE_LRC_FLAG_INDIRECT_CTX | 73 * +============================+=================================+ <- __xe_lrc_wa_bb_offset() 74 * | WA BB Per Ctx | 4k | 75 * +============================+=================================+ <- xe_bo_size(lrc->bo) 76 */ 77 78static struct xe_device * 79lrc_to_xe(struct xe_lrc *lrc) 80{ 81 return gt_to_xe(lrc->fence_ctx.gt); 82} 83 84static bool 85gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class) 86{ 87 struct xe_device *xe = gt_to_xe(gt); 88 89 if (XE_GT_WA(gt, 16010904313) && 90 (class == XE_ENGINE_CLASS_RENDER || 91 class == XE_ENGINE_CLASS_COMPUTE)) 92 return true; 93 94 if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 95 class, NULL)) 96 return true; 97 98 if (gt->ring_ops[class]->emit_aux_table_inv) 99 return true; 100 101 return false; 102} 103 104/** 105 * xe_gt_lrc_hang_replay_size() - Hang replay size 106 * @gt: The GT 107 * @class: Hardware engine class 108 * 109 * Determine size of GPU hang replay state for a GT and hardware engine class. 
110 * 111 * Return: Size of GPU hang replay size 112 */ 113size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class) 114{ 115 struct xe_device *xe = gt_to_xe(gt); 116 size_t size = 0; 117 118 /* Engine context image */ 119 switch (class) { 120 case XE_ENGINE_CLASS_RENDER: 121 if (GRAPHICS_VERx100(xe) >= 3510) 122 size += 7 * SZ_4K; 123 else if (GRAPHICS_VER(xe) >= 20) 124 size += 3 * SZ_4K; 125 else 126 size += 13 * SZ_4K; 127 break; 128 case XE_ENGINE_CLASS_COMPUTE: 129 if (GRAPHICS_VERx100(xe) >= 3510) 130 size += 5 * SZ_4K; 131 else if (GRAPHICS_VER(xe) >= 20) 132 size += 2 * SZ_4K; 133 else 134 size += 13 * SZ_4K; 135 break; 136 default: 137 WARN(1, "Unknown engine class: %d", class); 138 fallthrough; 139 case XE_ENGINE_CLASS_COPY: 140 case XE_ENGINE_CLASS_VIDEO_DECODE: 141 case XE_ENGINE_CLASS_VIDEO_ENHANCE: 142 case XE_ENGINE_CLASS_OTHER: 143 size += 1 * SZ_4K; 144 } 145 146 return size; 147} 148 149size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class) 150{ 151 size_t size = xe_gt_lrc_hang_replay_size(gt, class); 152 153 /* Add indirect ring state page */ 154 if (xe_gt_has_indirect_ring_state(gt)) 155 size += LRC_INDIRECT_RING_STATE_SIZE; 156 157 return size + LRC_PPHWSP_SIZE; 158} 159 160/* 161 * The per-platform tables are u8-encoded in @data. Decode @data and set the 162 * addresses' offset and commands in @regs. The following encoding is used 163 * for each byte. There are 2 steps: decoding commands and decoding addresses. 164 * 165 * Commands: 166 * [7]: create NOPs - number of NOPs are set in lower bits 167 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set 168 * MI_LRI_FORCE_POSTED 169 * [5:0]: Number of NOPs or registers to set values to in case of 170 * MI_LOAD_REGISTER_IMM 171 * 172 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" 173 * number of registers. 
They are set by using the REG/REG16 macros: the former 174 * is used for offsets smaller than 0x200 while the latter is for values bigger 175 * than that. Those macros already set all the bits documented below correctly: 176 * 177 * [7]: When a register offset needs more than 6 bits, use additional bytes, to 178 * follow, for the lower bits 179 * [6:0]: Register offset, without considering the engine base. 180 * 181 * This function only tweaks the commands and register offsets. Values are not 182 * filled out. 183 */ 184static void set_offsets(u32 *regs, 185 const u8 *data, 186 const struct xe_hw_engine *hwe) 187#define NOP(x) (BIT(7) | (x)) 188#define LRI(count, flags) ((flags) << 6 | (count) | \ 189 BUILD_BUG_ON_ZERO(count >= BIT(6))) 190#define POSTED BIT(0) 191#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 192#define REG16(x) \ 193 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 194 (((x) >> 2) & 0x7f) 195{ 196 const u32 base = hwe->mmio_base; 197 198 while (*data) { 199 u8 count, flags; 200 201 if (*data & BIT(7)) { /* skip */ 202 count = *data++ & ~BIT(7); 203 regs += count; 204 continue; 205 } 206 207 count = *data & 0x3f; 208 flags = *data >> 6; 209 data++; 210 211 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); 212 if (flags & POSTED) 213 *regs |= MI_LRI_FORCE_POSTED; 214 *regs |= MI_LRI_LRM_CS_MMIO; 215 regs++; 216 217 xe_gt_assert(hwe->gt, count); 218 do { 219 u32 offset = 0; 220 u8 v; 221 222 do { 223 v = *data++; 224 offset <<= 7; 225 offset |= v & ~BIT(7); 226 } while (v & BIT(7)); 227 228 regs[0] = base + (offset << 2); 229 regs += 2; 230 } while (--count); 231 } 232 233 *regs = MI_BATCH_BUFFER_END | BIT(0); 234} 235 236static const u8 gen12_xcs_offsets[] = { 237 NOP(1), 238 LRI(13, POSTED), 239 REG16(0x244), 240 REG(0x034), 241 REG(0x030), 242 REG(0x038), 243 REG(0x03c), 244 REG(0x168), 245 REG(0x140), 246 REG(0x110), 247 REG(0x1c0), 248 REG(0x1c4), 249 REG(0x1c8), 250 REG(0x180), 251 REG16(0x2b4), 252 253 NOP(5), 254 
LRI(9, POSTED), 255 REG16(0x3a8), 256 REG16(0x28c), 257 REG16(0x288), 258 REG16(0x284), 259 REG16(0x280), 260 REG16(0x27c), 261 REG16(0x278), 262 REG16(0x274), 263 REG16(0x270), 264 265 0 266}; 267 268static const u8 dg2_xcs_offsets[] = { 269 NOP(1), 270 LRI(15, POSTED), 271 REG16(0x244), 272 REG(0x034), 273 REG(0x030), 274 REG(0x038), 275 REG(0x03c), 276 REG(0x168), 277 REG(0x140), 278 REG(0x110), 279 REG(0x1c0), 280 REG(0x1c4), 281 REG(0x1c8), 282 REG(0x180), 283 REG16(0x2b4), 284 REG(0x120), 285 REG(0x124), 286 287 NOP(1), 288 LRI(9, POSTED), 289 REG16(0x3a8), 290 REG16(0x28c), 291 REG16(0x288), 292 REG16(0x284), 293 REG16(0x280), 294 REG16(0x27c), 295 REG16(0x278), 296 REG16(0x274), 297 REG16(0x270), 298 299 0 300}; 301 302static const u8 gen12_rcs_offsets[] = { 303 NOP(1), 304 LRI(13, POSTED), 305 REG16(0x244), 306 REG(0x034), 307 REG(0x030), 308 REG(0x038), 309 REG(0x03c), 310 REG(0x168), 311 REG(0x140), 312 REG(0x110), 313 REG(0x1c0), 314 REG(0x1c4), 315 REG(0x1c8), 316 REG(0x180), 317 REG16(0x2b4), 318 319 NOP(5), 320 LRI(9, POSTED), 321 REG16(0x3a8), 322 REG16(0x28c), 323 REG16(0x288), 324 REG16(0x284), 325 REG16(0x280), 326 REG16(0x27c), 327 REG16(0x278), 328 REG16(0x274), 329 REG16(0x270), 330 331 LRI(3, POSTED), 332 REG(0x1b0), 333 REG16(0x5a8), 334 REG16(0x5ac), 335 336 NOP(6), 337 LRI(1, 0), 338 REG(0x0c8), 339 NOP(3 + 9 + 1), 340 341 LRI(51, POSTED), 342 REG16(0x588), 343 REG16(0x588), 344 REG16(0x588), 345 REG16(0x588), 346 REG16(0x588), 347 REG16(0x588), 348 REG(0x028), 349 REG(0x09c), 350 REG(0x0c0), 351 REG(0x178), 352 REG(0x17c), 353 REG16(0x358), 354 REG(0x170), 355 REG(0x150), 356 REG(0x154), 357 REG(0x158), 358 REG16(0x41c), 359 REG16(0x600), 360 REG16(0x604), 361 REG16(0x608), 362 REG16(0x60c), 363 REG16(0x610), 364 REG16(0x614), 365 REG16(0x618), 366 REG16(0x61c), 367 REG16(0x620), 368 REG16(0x624), 369 REG16(0x628), 370 REG16(0x62c), 371 REG16(0x630), 372 REG16(0x634), 373 REG16(0x638), 374 REG16(0x63c), 375 REG16(0x640), 376 REG16(0x644), 
377 REG16(0x648), 378 REG16(0x64c), 379 REG16(0x650), 380 REG16(0x654), 381 REG16(0x658), 382 REG16(0x65c), 383 REG16(0x660), 384 REG16(0x664), 385 REG16(0x668), 386 REG16(0x66c), 387 REG16(0x670), 388 REG16(0x674), 389 REG16(0x678), 390 REG16(0x67c), 391 REG(0x068), 392 REG(0x084), 393 NOP(1), 394 395 0 396}; 397 398static const u8 xehp_rcs_offsets[] = { 399 NOP(1), 400 LRI(13, POSTED), 401 REG16(0x244), 402 REG(0x034), 403 REG(0x030), 404 REG(0x038), 405 REG(0x03c), 406 REG(0x168), 407 REG(0x140), 408 REG(0x110), 409 REG(0x1c0), 410 REG(0x1c4), 411 REG(0x1c8), 412 REG(0x180), 413 REG16(0x2b4), 414 415 NOP(5), 416 LRI(9, POSTED), 417 REG16(0x3a8), 418 REG16(0x28c), 419 REG16(0x288), 420 REG16(0x284), 421 REG16(0x280), 422 REG16(0x27c), 423 REG16(0x278), 424 REG16(0x274), 425 REG16(0x270), 426 427 LRI(3, POSTED), 428 REG(0x1b0), 429 REG16(0x5a8), 430 REG16(0x5ac), 431 432 NOP(6), 433 LRI(1, 0), 434 REG(0x0c8), 435 436 0 437}; 438 439static const u8 dg2_rcs_offsets[] = { 440 NOP(1), 441 LRI(15, POSTED), 442 REG16(0x244), 443 REG(0x034), 444 REG(0x030), 445 REG(0x038), 446 REG(0x03c), 447 REG(0x168), 448 REG(0x140), 449 REG(0x110), 450 REG(0x1c0), 451 REG(0x1c4), 452 REG(0x1c8), 453 REG(0x180), 454 REG16(0x2b4), 455 REG(0x120), 456 REG(0x124), 457 458 NOP(1), 459 LRI(9, POSTED), 460 REG16(0x3a8), 461 REG16(0x28c), 462 REG16(0x288), 463 REG16(0x284), 464 REG16(0x280), 465 REG16(0x27c), 466 REG16(0x278), 467 REG16(0x274), 468 REG16(0x270), 469 470 LRI(3, POSTED), 471 REG(0x1b0), 472 REG16(0x5a8), 473 REG16(0x5ac), 474 475 NOP(6), 476 LRI(1, 0), 477 REG(0x0c8), 478 479 0 480}; 481 482static const u8 mtl_rcs_offsets[] = { 483 NOP(1), 484 LRI(15, POSTED), 485 REG16(0x244), 486 REG(0x034), 487 REG(0x030), 488 REG(0x038), 489 REG(0x03c), 490 REG(0x168), 491 REG(0x140), 492 REG(0x110), 493 REG(0x1c0), 494 REG(0x1c4), 495 REG(0x1c8), 496 REG(0x180), 497 REG16(0x2b4), 498 REG(0x120), 499 REG(0x124), 500 501 NOP(1), 502 LRI(9, POSTED), 503 REG16(0x3a8), 504 REG16(0x28c), 505 
REG16(0x288), 506 REG16(0x284), 507 REG16(0x280), 508 REG16(0x27c), 509 REG16(0x278), 510 REG16(0x274), 511 REG16(0x270), 512 513 NOP(2), 514 LRI(2, POSTED), 515 REG16(0x5a8), 516 REG16(0x5ac), 517 518 NOP(6), 519 LRI(1, 0), 520 REG(0x0c8), 521 522 0 523}; 524 525#define XE2_CTX_COMMON \ 526 NOP(1), /* [0x00] */ \ 527 LRI(15, POSTED), /* [0x01] */ \ 528 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \ 529 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \ 530 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \ 531 REG(0x038), /* [0x08] RING_BUFFER_START */ \ 532 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \ 533 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \ 534 REG(0x140), /* [0x0e] BB_ADDR */ \ 535 REG(0x110), /* [0x10] BB_STATE */ \ 536 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \ 537 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \ 538 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \ 539 REG(0x180), /* [0x18] CCID */ \ 540 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \ 541 REG(0x120), /* [0x1c] PRT_BB_STATE */ \ 542 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \ 543 \ 544 NOP(1), /* [0x20] */ \ 545 LRI(9, POSTED), /* [0x21] */ \ 546 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \ 547 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \ 548 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \ 549 REG16(0x284), /* [0x28] dummy reg */ \ 550 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \ 551 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \ 552 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \ 553 REG16(0x274), /* [0x30] PTBP_UDW */ \ 554 REG16(0x270) /* [0x32] PTBP_LDW */ 555 556static const u8 xe2_rcs_offsets[] = { 557 XE2_CTX_COMMON, 558 559 NOP(2), /* [0x34] */ 560 LRI(2, POSTED), /* [0x36] */ 561 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */ 562 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */ 563 564 NOP(6), /* [0x41] */ 565 LRI(1, 0), /* [0x47] */ 566 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */ 567 568 0 569}; 570 571static const u8 xe2_bcs_offsets[] = { 572 XE2_CTX_COMMON, 573 574 NOP(4 + 8 + 1), /* [0x34] */ 575 LRI(2, 
POSTED), /* [0x41] */ 576 REG16(0x200), /* [0x42] BCS_SWCTRL */ 577 REG16(0x204), /* [0x44] BLIT_CCTL */ 578 579 0 580}; 581 582static const u8 xe2_xcs_offsets[] = { 583 XE2_CTX_COMMON, 584 585 0 586}; 587 588static const u8 xe2_indirect_ring_state_offsets[] = { 589 NOP(1), /* [0x00] */ 590 LRI(5, POSTED), /* [0x01] */ 591 REG(0x034), /* [0x02] RING_BUFFER_HEAD */ 592 REG(0x030), /* [0x04] RING_BUFFER_TAIL */ 593 REG(0x038), /* [0x06] RING_BUFFER_START */ 594 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */ 595 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ 596 597 NOP(5), /* [0x0c] */ 598 LRI(9, POSTED), /* [0x11] */ 599 REG(0x168), /* [0x12] BB_ADDR_UDW */ 600 REG(0x140), /* [0x14] BB_ADDR */ 601 REG(0x110), /* [0x16] BB_STATE */ 602 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */ 603 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */ 604 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */ 605 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */ 606 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */ 607 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */ 608 609 NOP(12), /* [0x00] */ 610 611 0 612}; 613 614#undef REG16 615#undef REG 616#undef LRI 617#undef NOP 618 619static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) 620{ 621 if (class == XE_ENGINE_CLASS_RENDER) { 622 if (GRAPHICS_VER(xe) >= 20) 623 return xe2_rcs_offsets; 624 else if (GRAPHICS_VERx100(xe) >= 1270) 625 return mtl_rcs_offsets; 626 else if (GRAPHICS_VERx100(xe) >= 1255) 627 return dg2_rcs_offsets; 628 else if (GRAPHICS_VERx100(xe) >= 1250) 629 return xehp_rcs_offsets; 630 else 631 return gen12_rcs_offsets; 632 } else if (class == XE_ENGINE_CLASS_COPY) { 633 if (GRAPHICS_VER(xe) >= 20) 634 return xe2_bcs_offsets; 635 else 636 return gen12_xcs_offsets; 637 } else { 638 if (GRAPHICS_VER(xe) >= 20) 639 return xe2_xcs_offsets; 640 else if (GRAPHICS_VERx100(xe) >= 1255) 641 return dg2_xcs_offsets; 642 else 643 return gen12_xcs_offsets; 644 } 645} 646 647static void set_context_control(u32 *regs, struct 
xe_hw_engine *hwe) 648{ 649 regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | 650 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 651 652 if (xe_gt_has_indirect_ring_state(hwe->gt)) 653 regs[CTX_CONTEXT_CONTROL] |= 654 REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE); 655} 656 657static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe) 658{ 659 struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq; 660 struct xe_device *xe = gt_to_xe(hwe->gt); 661 u8 num_regs; 662 663 if (!xe_device_uses_memirq(xe)) 664 return; 665 666 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM | 667 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT; 668 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr; 669 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq); 670 671 num_regs = xe_device_has_msix(xe) ? 3 : 2; 672 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) | 673 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED; 674 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr; 675 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe); 676 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr; 677 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe); 678 679 if (xe_device_has_msix(xe)) { 680 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr; 681 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */ 682 } 683} 684 685static int lrc_ring_mi_mode(struct xe_hw_engine *hwe) 686{ 687 struct xe_device *xe = gt_to_xe(hwe->gt); 688 689 if (GRAPHICS_VERx100(xe) >= 1250) 690 return 0x70; 691 else 692 return 0x60; 693} 694 695static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe) 696{ 697 int x; 698 699 x = lrc_ring_mi_mode(hwe); 700 regs[x + 1] &= ~STOP_RING; 701 regs[x + 1] |= STOP_RING << 16; 702} 703 704static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc) 705{ 706 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE; 707} 708 709static inline u32 
__xe_lrc_ring_offset(struct xe_lrc *lrc) 710{ 711 return 0; 712} 713 714u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc) 715{ 716 return lrc->ring.size; 717} 718 719/* Make the magic macros work */ 720#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset 721#define __xe_lrc_regs_offset xe_lrc_regs_offset 722 723#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512 724#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024 725#define LRC_PARALLEL_PPHWSP_OFFSET 2048 726 727#define LRC_SEQNO_OFFSET 0 728#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8) 729 730u32 xe_lrc_regs_offset(struct xe_lrc *lrc) 731{ 732 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; 733} 734 735/** 736 * xe_lrc_reg_size() - Get size of the LRC registers area within queues 737 * @xe: the &xe_device struct instance 738 * 739 * Returns: Size of the LRC registers area for current platform 740 */ 741size_t xe_lrc_reg_size(struct xe_device *xe) 742{ 743 if (GRAPHICS_VERx100(xe) >= 1250) 744 return 96 * sizeof(u32); 745 else 746 return 80 * sizeof(u32); 747} 748 749/** 750 * xe_lrc_engine_state_size() - Get size of the engine state within LRC 751 * @gt: the &xe_gt struct instance 752 * @class: Hardware engine class 753 * 754 * Returns: Size of the engine state 755 */ 756size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class) 757{ 758 return xe_gt_lrc_hang_replay_size(gt, class) - xe_lrc_reg_size(gt_to_xe(gt)); 759} 760 761static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc) 762{ 763 return LRC_SEQNO_OFFSET; 764} 765 766static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) 767{ 768 return LRC_START_SEQNO_OFFSET; 769} 770 771static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) 772{ 773 /* This is stored in the driver-defined portion of PPHWSP */ 774 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; 775} 776 777static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) 778{ 779 /* The parallel is stored in the driver-defined portion of PPHWSP */ 780 return 
xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 781} 782 783static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) 784{ 785 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; 786} 787 788static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) 789{ 790 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); 791} 792 793static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) 794{ 795 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); 796} 797 798static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) 799{ 800 u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - 801 LRC_INDIRECT_RING_STATE_SIZE; 802 803 if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX) 804 offset -= LRC_INDIRECT_CTX_BO_SIZE; 805 806 return offset; 807} 808 809static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc) 810{ 811 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE; 812} 813 814static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc) 815{ 816 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE; 817} 818 819#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \ 820static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \ 821{ \ 822 struct xe_bo *bo = (bo_expr); \ 823 struct iosys_map map = bo->vmap; \ 824\ 825 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \ 826 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \ 827 return map; \ 828} \ 829static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \ 830{ \ 831 struct xe_bo *bo = (bo_expr); \ 832\ 833 return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \ 834} \ 835 836DECL_MAP_ADDR_HELPERS(ring, lrc->bo) 837DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo) 838DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo) 839DECL_MAP_ADDR_HELPERS(regs, lrc->bo) 840DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo) 841DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo) 842DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo) 
843DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo) 844DECL_MAP_ADDR_HELPERS(parallel, lrc->bo) 845DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo) 846DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo) 847 848#undef DECL_MAP_ADDR_HELPERS 849 850/** 851 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address 852 * @lrc: Pointer to the lrc. 853 * 854 * Returns: ctx timestamp GGTT address 855 */ 856u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc) 857{ 858 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 859} 860 861/** 862 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address 863 * @lrc: Pointer to the lrc. 864 * 865 * Returns: ctx timestamp udw GGTT address 866 */ 867u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) 868{ 869 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 870} 871 872/** 873 * xe_lrc_ctx_timestamp() - Read ctx timestamp value 874 * @lrc: Pointer to the lrc. 875 * 876 * Returns: ctx timestamp value 877 */ 878static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 879{ 880 struct xe_device *xe = lrc_to_xe(lrc); 881 struct iosys_map map; 882 u32 ldw, udw = 0; 883 884 map = __xe_lrc_ctx_timestamp_map(lrc); 885 ldw = xe_map_read32(xe, &map); 886 887 if (xe->info.has_64bit_timestamp) { 888 map = __xe_lrc_ctx_timestamp_udw_map(lrc); 889 udw = xe_map_read32(xe, &map); 890 } 891 892 return (u64)udw << 32 | ldw; 893} 894 895/** 896 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address 897 * @lrc: Pointer to the lrc. 898 * 899 * Returns: ctx timestamp job GGTT address 900 */ 901u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc) 902{ 903 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); 904} 905 906/** 907 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value 908 * @lrc: Pointer to the lrc. 
909 * 910 * Returns: ctx timestamp job value 911 */ 912u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc) 913{ 914 struct xe_device *xe = lrc_to_xe(lrc); 915 struct iosys_map map; 916 917 map = __xe_lrc_ctx_job_timestamp_map(lrc); 918 return xe_map_read32(xe, &map); 919} 920 921u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc) 922{ 923 return __xe_lrc_pphwsp_ggtt_addr(lrc); 924} 925 926u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc) 927{ 928 if (!xe_lrc_has_indirect_ring_state(lrc)) 929 return 0; 930 931 return __xe_lrc_indirect_ring_ggtt_addr(lrc); 932} 933 934static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr) 935{ 936 struct xe_device *xe = lrc_to_xe(lrc); 937 struct iosys_map map; 938 939 map = __xe_lrc_indirect_ring_map(lrc); 940 iosys_map_incr(&map, reg_nr * sizeof(u32)); 941 return xe_map_read32(xe, &map); 942} 943 944static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc, 945 int reg_nr, u32 val) 946{ 947 struct xe_device *xe = lrc_to_xe(lrc); 948 struct iosys_map map; 949 950 map = __xe_lrc_indirect_ring_map(lrc); 951 iosys_map_incr(&map, reg_nr * sizeof(u32)); 952 xe_map_write32(xe, &map, val); 953} 954 955u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr) 956{ 957 struct xe_device *xe = lrc_to_xe(lrc); 958 struct iosys_map map; 959 960 map = __xe_lrc_regs_map(lrc); 961 iosys_map_incr(&map, reg_nr * sizeof(u32)); 962 return xe_map_read32(xe, &map); 963} 964 965void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val) 966{ 967 struct xe_device *xe = lrc_to_xe(lrc); 968 struct iosys_map map; 969 970 map = __xe_lrc_regs_map(lrc); 971 iosys_map_incr(&map, reg_nr * sizeof(u32)); 972 xe_map_write32(xe, &map, val); 973} 974 975static void *empty_lrc_data(struct xe_hw_engine *hwe) 976{ 977 struct xe_gt *gt = hwe->gt; 978 void *data; 979 u32 *regs; 980 981 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL); 982 if (!data) 983 return NULL; 984 985 /* 1st page: Per-Process of HW status Page */ 986 regs = data + 
LRC_PPHWSP_SIZE; 987 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe); 988 set_context_control(regs, hwe); 989 set_memory_based_intr(regs, hwe); 990 reset_stop_ring(regs, hwe); 991 if (xe_gt_has_indirect_ring_state(gt)) { 992 regs = data + xe_gt_lrc_size(gt, hwe->class) - 993 LRC_INDIRECT_RING_STATE_SIZE; 994 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe); 995 } 996 997 return data; 998} 999 1000/** 1001 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC 1002 * of given engine. 1003 * @hwe: the &xe_hw_engine struct instance 1004 */ 1005void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe) 1006{ 1007 struct xe_gt *gt = hwe->gt; 1008 u32 *regs; 1009 1010 if (!gt->default_lrc[hwe->class]) 1011 return; 1012 1013 regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE; 1014 set_memory_based_intr(regs, hwe); 1015} 1016 1017/** 1018 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data 1019 * for given LRC. 
1020 * @lrc: the &xe_lrc struct instance 1021 * @hwe: the &xe_hw_engine struct instance 1022 * @regs: scratch buffer to be used as temporary storage 1023 */ 1024void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1025 u32 *regs) 1026{ 1027 struct xe_gt *gt = hwe->gt; 1028 struct iosys_map map; 1029 size_t regs_len; 1030 1031 if (!xe_device_uses_memirq(gt_to_xe(gt))) 1032 return; 1033 1034 map = __xe_lrc_regs_map(lrc); 1035 regs_len = xe_lrc_reg_size(gt_to_xe(gt)); 1036 xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len); 1037 set_memory_based_intr(regs, hwe); 1038 xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len); 1039} 1040 1041static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) 1042{ 1043 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt)); 1044 1045 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc)); 1046 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); 1047} 1048 1049static void xe_lrc_finish(struct xe_lrc *lrc) 1050{ 1051 xe_hw_fence_ctx_finish(&lrc->fence_ctx); 1052 xe_bo_unpin_map_no_vm(lrc->bo); 1053 xe_bo_unpin_map_no_vm(lrc->seqno_bo); 1054} 1055 1056/* 1057 * wa_bb_setup_utilization() - Write commands to wa bb to assist 1058 * in calculating active context run ticks. 1059 * 1060 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the 1061 * context, but only gets updated when the context switches out. In order to 1062 * check how long a context has been active before it switches out, two things 1063 * are required: 1064 * 1065 * (1) Determine if the context is running: 1066 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in 1067 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is 1068 * initialized. During a query, we just check for this value to determine if the 1069 * context is active. 
If the context switched out, it would overwrite this 1070 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as 1071 * the last part of context restore, so reusing this LRC location will not 1072 * clobber anything. 1073 * 1074 * (2) Calculate the time that the context has been active for: 1075 * The CTX_TIMESTAMP ticks only when the context is active. If a context is 1076 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. 1077 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific 1078 * engine instance. Since we do not know which instance the context is running 1079 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and 1080 * store it in the PPHSWP. 1081 */ 1082#define CONTEXT_ACTIVE 1ULL 1083static ssize_t setup_utilization_wa(struct xe_lrc *lrc, 1084 struct xe_hw_engine *hwe, 1085 u32 *batch, 1086 size_t max_len) 1087{ 1088 u32 *cmd = batch; 1089 1090 if (IS_SRIOV_VF(gt_to_xe(lrc->gt))) 1091 return 0; 1092 1093 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1094 return -ENOSPC; 1095 1096 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 1097 *cmd++ = ENGINE_ID(0).addr; 1098 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); 1099 *cmd++ = 0; 1100 1101 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1102 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1103 *cmd++ = 0; 1104 *cmd++ = lower_32_bits(CONTEXT_ACTIVE); 1105 1106 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { 1107 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1108 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 1109 *cmd++ = 0; 1110 *cmd++ = upper_32_bits(CONTEXT_ACTIVE); 1111 } 1112 1113 return cmd - batch; 1114} 1115 1116static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1117 u32 *batch, size_t max_len) 1118{ 1119 const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1120 u32 *cmd = batch; 1121 1122 if (!XE_GT_WA(lrc->gt, 16010904313) 
	    || !(hwe->class == XE_ENGINE_CLASS_RENDER ||
		 hwe->class == XE_ENGINE_CLASS_COMPUTE ||
		 hwe->class == XE_ENGINE_CLASS_COPY ||
		 hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
		 hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	/* Three 4-dword MI_LOAD_REGISTER_MEM commands below */
	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/* First two loads carry MI_LRM_ASYNC, the final one does not */
	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}

/*
 * Append a user-provided "post context restore" batch, configured via
 * configfs, to the workaround batch buffer.  Test/validation use only.
 *
 * Return: number of dwords written, 0 when no batch is configured, or
 * -ENOSPC when the user batch does not fit in @max_len dwords.
 */
static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
						  struct xe_hw_engine *hwe,
						  u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
						    hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

/*
 * Same as setup_configfs_post_ctx_restore_bb(), but for the "mid context
 * restore" (indirect context) batch configured via configfs.
 */
static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
						 struct xe_hw_engine *hwe,
						 u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
						   hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

/*
 * WA 18022495364: invalidate the instruction state cache on render
 * engines via a masked write to CS_DEBUG_MODE2.
 */
static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
					       struct xe_hw_engine *hwe,
					       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 18022495364) ||
	    hwe->class != XE_ENGINE_CLASS_RENDER)
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1);
	*cmd++ = CS_DEBUG_MODE2(0).addr;
	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);

	return cmd - batch;
}

/*
 * Emit an AuxCCS table invalidation through the per-class ring ops hook,
 * when the engine class provides one.
 */
static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
					  struct xe_hw_engine *hwe,
					  u32 *batch, size_t max_len)
{
	struct xe_gt *gt = lrc->gt;
	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
		gt->ring_ops[hwe->class]->emit_aux_table_inv;

	if (!emit)
		return 0;

	if (xe_gt_WARN_ON(gt, max_len < 8))
		return -ENOSPC;

	return emit(gt, batch) - batch;
}

/* One callback emitting commands into a wa-bb / indirect-ctx buffer */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;
	struct xe_hw_engine *hwe;
	size_t max_size;
	size_t reserve_dw;
	unsigned int offset;
	const struct bo_setup *funcs;
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;
	u32 *ptr;
	unsigned int written;
};

/*
 * Run every callback in @state->funcs, writing either directly into the
 * LRC BO's vmap or into the scratch @state->buffer when the BO is iomem
 * (flushed to the BO later by finish_bo()).
 *
 * Return: 0 on success, -ENOSPC when a callback fails or the reserved
 * tail space would be consumed.
 */
static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}

/* Copy the scratch buffer into the BO when the BO vmap is iomem */
static void finish_bo(struct bo_setup_state *state)
{
	if (!state->lrc->bo->vmap.is_iomem)
		return;

	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
			 state->offset, state->buffer,
			 state->written * sizeof(u32));
}

/**
 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		.reserve_dw = 1, /* room for the MI_BATCH_BUFFER_END below */
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	/*
	 * NOTE(review): the low bit added to the GGTT address presumably
	 * acts as the per-ctx BB enable/valid bit — confirm against Bspec.
	 */
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

/*
 * Build the workaround batch buffer, allocating a heap scratch buffer
 * when the LRC BO is iomem and cannot be written through directly.
 */
static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	kfree(buf);

	return ret;
}

/*
 * Populate the indirect context buffer (when the LRC has one) and enable
 * it in the context image.  Render/compute engines get an extra timestamp
 * workaround compared to the other classes.
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 * (0xf dwords == 16 * 4 bytes == one 64B cacheline.)
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}

/* Map a multi-queue priority onto the LRC descriptor priority field */
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

/*
 * Initialize the context image: PPHWSP, register state, ring pointers,
 * descriptor, and the workaround/indirect-ctx batch buffers.
 */
static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 state_cache_perf_fix[3];
	int err;

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	/*
	 * NOTE(review): this branch dereferences gt->default_lrc[class] even
	 * when only replay_state is set — presumably a default LRC is always
	 * recorded before any replay user exists; confirm with callers.
	 */
	if (gt->default_lrc[hwe->class] || replay_state) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		/* Replay state overrides the start of the copied image */
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			return -ENOMEM;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	/* Wire up memory-based interrupt reporting for MSI-X platforms */
	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP descriptors carry engine class/instance explicitly */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
	}

	/* Seed both seqno slots to "last completed" (next_seqno - 1) */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}

/*
 * Allocate and map the backing BOs, then initialize the context image.
 * On failure everything acquired so far is torn down via xe_lrc_finish().
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	/* Main BO: ring + context image + wa bb (+ optional indirect ctx) */
	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	/* Separate system-memory page for the seqno slots */
	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @replay_state: GPU hang replay state
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return pointer to created LRC upon success and an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc_obj(*lrc);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

/**
 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
 * @lrc: the &xe_lrc struct instance
 */
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	}
}

/* Write the ring tail into the (indirect) ring state */
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

/* Read the ring tail (masked to the address bits) */
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

/* Free dwords between tail and head of the (power-of-two sized) ring */
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

/* Copy @size bytes into the ring at the current tail and advance it */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

/*
 * Write @data into the ring, splitting the copy at the ring wrap point
 * and padding with a MI_NOOP to keep the tail 8-byte aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

/* Full context descriptor: flag bits ORed with the GGTT address */
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

/* Read the current seqno from the LRC's seqno slot */
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

/* Read the start seqno from the LRC's start-seqno slot */
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: value read from the LRC's engine-id slot
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

/* Decode the total dword length of an instruction from its header */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

/* Pretty-print one MI_* command; returns the number of dwords consumed */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *start,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse runs of consecutive NOOPs into a single line */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_NOOP (%d dwords)\n",
			   dw - num_noop - start, inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_TOPOLOGY_FILTER\n",
			   dw - start, inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_BATCH_BUFFER_END\n",
			   dw - start, inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   dw - start, inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, "LRC[%#5tx] = - %#6x = %#010x\n",
				   &dw[i] - start, dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   dw - start, inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		/* A well-formed LRM is exactly 4 dwords: header, reg, addr lo/hi */
		if (numdw == 4)
			drm_printf(p, "LRC[%#5tx] = - %#6x = %#010llx\n",
				   dw - start,
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, "LRC[%#5tx] = - %*ph (%s)\n",
				   dw - start, (int)sizeof(u32) * (numdw - 1),
				   dw + 1, numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "LRC[%#5tx] = [%#010x] MI_FORCE_WAKEUP\n",
			   dw - start, inst_header);
		return numdw;

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   dw - start, inst_header, opcode, numdw);
		return numdw;
	}
}

/* Pretty-print one GFXPIPE command; returns the number of dwords consumed */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *start,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);
	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_URB_MEMORY);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

/* Pretty-print one GFX_STATE command; returns the number of dwords consumed */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *start,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   dw - start, *dw, opcode, numdw);
		return numdw;
	}
}

/* Decode and print the recorded default LRC image for @hwe_class */
void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw, *start;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	start = dw;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	/* Dispatch on the command type field of each instruction header */
	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "LRC[%#5tx] = [%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   dw - start,
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

/*
 * Lookup the value of a register within the offset/value pairs of an
 * MI_LOAD_REGISTER_IMM instruction.
 *
 * Return -ENOENT if the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	for (int i = 0; i < num_regs; i++) {
		if (dword_pair[2 * i] == offset) {
			*value = dword_pair[2 * i + 1];
			return 0;
		}
	}

	return -ENOENT;
}

/*
 * Lookup the value of a register in a specific engine type's default LRC.
 *
 * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
 * cannot be found in the default LRC.
 */
int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
				    enum xe_engine_class hwe_class,
				    u32 offset,
				    u32 *value)
{
	u32 *dw;
	int remaining_dw, ret;

	if (!gt->default_lrc[hwe_class])
		return -EINVAL;

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		u32 num_dw = instr_dw(*dw);

		if (num_dw > remaining_dw)
			num_dw = remaining_dw;

		switch (*dw & XE_INSTR_CMD_TYPE) {
		case XE_INSTR_MI:
			switch (*dw & MI_OPCODE) {
			case MI_BATCH_BUFFER_END:
				/* End of LRC; register not found */
				return -ENOENT;

			case MI_NOOP:
			case MI_TOPOLOGY_FILTER:
				/*
				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
				 * a length field and are always 1-dword
				 * instructions.
				 */
				remaining_dw--;
				dw++;
				break;

			case MI_LOAD_REGISTER_IMM:
				ret = lookup_reg_in_mi_lri(offset, value,
							   dw + 1, (num_dw - 1) / 2);
				if (ret == 0)
					return 0;

				fallthrough;

			default:
				/*
				 * Jump to next instruction based on length
				 * field.
				 */
				remaining_dw -= num_dw;
				dw += num_dw;
				break;
			}
			break;

		default:
			/* Jump to next instruction based on length field.
*/ 2329 remaining_dw -= num_dw; 2330 dw += num_dw; 2331 } 2332 } 2333 2334 return -ENOENT; 2335} 2336 2337struct instr_state { 2338 u32 instr; 2339 u16 num_dw; 2340}; 2341 2342static const struct instr_state xe_hpg_svg_state[] = { 2343 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 }, 2344 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 }, 2345 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 }, 2346 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 }, 2347 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 }, 2348 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 }, 2349 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 }, 2350 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 }, 2351 { .instr = CMD_3DSTATE_VS, .num_dw = 9 }, 2352 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 }, 2353 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 }, 2354 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 }, 2355 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 }, 2356 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 }, 2357 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 }, 2358 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 }, 2359 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 }, 2360 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 }, 2361 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 }, 2362 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 }, 2363 { .instr = CMD_3DSTATE_SF, .num_dw = 4 }, 2364 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 }, 2365 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 }, 2366 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 }, 2367 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 }, 2368 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 }, 2369 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 }, 2370 { .instr = CMD_3DSTATE_HS, .num_dw = 9 }, 2371 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 }, 2372 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, 
.num_dw = 2 }, 2373 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 }, 2374 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 }, 2375 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 }, 2376 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 }, 2377 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 }, 2378 { .instr = CMD_3DSTATE_TE, .num_dw = 5 }, 2379 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 }, 2380 { .instr = CMD_3DSTATE_DS, .num_dw = 11 }, 2381 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 }, 2382 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 }, 2383 { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 }, 2384 { .instr = CMD_3DSTATE_GS, .num_dw = 10 }, 2385 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 }, 2386 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 }, 2387 { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 }, 2388 { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 }, 2389 { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 }, 2390 { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 }, 2391 { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 }, 2392 { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 }, 2393}; 2394 2395u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs) 2396{ 2397 struct xe_gt *gt = q->hwe->gt; 2398 struct xe_device *xe = gt_to_xe(gt); 2399 const struct instr_state *state_table = NULL; 2400 int state_table_size = 0; 2401 2402 /* 2403 * Wa_14019789679 2404 * 2405 * If the driver doesn't explicitly emit the SVG instructions while 2406 * setting up the default LRC, the context switch will write 0's 2407 * (noops) into the LRC memory rather than the expected instruction 2408 * headers. 
Application contexts start out as a copy of the default 2409 * LRC, and if they also do not emit specific settings for some SVG 2410 * state, then on context restore they'll unintentionally inherit 2411 * whatever state setting the previous context had programmed into the 2412 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will 2413 * prevent the hardware from resetting that state back to any specific 2414 * value). 2415 * 2416 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL 2417 * since that's a specific state setting that can easily cause GPU 2418 * hangs if unintentionally inherited. However to be safe we'll 2419 * continue to emit all of the SVG state since it's best not to leak 2420 * any of the state between contexts, even if that leakage is harmless. 2421 */ 2422 if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) { 2423 state_table = xe_hpg_svg_state; 2424 state_table_size = ARRAY_SIZE(xe_hpg_svg_state); 2425 } 2426 2427 if (!state_table) { 2428 xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n", 2429 GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100); 2430 return cs; 2431 } 2432 2433 for (int i = 0; i < state_table_size; i++) { 2434 u32 instr = state_table[i].instr; 2435 u16 num_dw = state_table[i].num_dw; 2436 bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW); 2437 2438 xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE); 2439 xe_gt_assert(gt, num_dw != 0); 2440 xe_gt_assert(gt, is_single_dw ^ (num_dw > 1)); 2441 2442 /* 2443 * Xe2's SVG context is the same as the one on DG2 / MTL 2444 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has 2445 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined). 2446 * Just make the replacement here rather than defining a 2447 * whole separate table for the single trivial change. 
2448 */ 2449 if (GRAPHICS_VER(xe) >= 20 && 2450 instr == CMD_3DSTATE_DRAWING_RECTANGLE) 2451 instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST; 2452 2453 *cs = instr; 2454 if (!is_single_dw) 2455 *cs |= (num_dw - 2); 2456 2457 cs += num_dw; 2458 } 2459 2460 return cs; 2461} 2462 2463struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) 2464{ 2465 struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT); 2466 2467 if (!snapshot) 2468 return NULL; 2469 2470 snapshot->context_desc = xe_lrc_ggtt_addr(lrc); 2471 snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc); 2472 snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc); 2473 snapshot->head = xe_lrc_ring_head(lrc); 2474 snapshot->tail.internal = lrc->ring.tail; 2475 snapshot->tail.memory = xe_lrc_ring_tail(lrc); 2476 snapshot->start = xe_lrc_ring_start(lrc); 2477 snapshot->start_seqno = xe_lrc_start_seqno(lrc); 2478 snapshot->seqno = xe_lrc_seqno(lrc); 2479 snapshot->lrc_bo = xe_bo_get(lrc->bo); 2480 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); 2481 snapshot->lrc_size = lrc->size; 2482 snapshot->replay_offset = 0; 2483 snapshot->replay_size = lrc->replay_size; 2484 snapshot->lrc_snapshot = NULL; 2485 snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); 2486 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); 2487 return snapshot; 2488} 2489 2490void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) 2491{ 2492 struct xe_bo *bo; 2493 struct iosys_map src; 2494 2495 if (!snapshot) 2496 return; 2497 2498 bo = snapshot->lrc_bo; 2499 snapshot->lrc_bo = NULL; 2500 2501 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL); 2502 if (!snapshot->lrc_snapshot) 2503 goto put_bo; 2504 2505 xe_bo_lock(bo, false); 2506 if (!ttm_bo_vmap(&bo->ttm, &src)) { 2507 xe_map_memcpy_from(xe_bo_device(bo), 2508 snapshot->lrc_snapshot, &src, snapshot->lrc_offset, 2509 snapshot->lrc_size); 2510 ttm_bo_vunmap(&bo->ttm, &src); 2511 } else { 2512 
kvfree(snapshot->lrc_snapshot); 2513 snapshot->lrc_snapshot = NULL; 2514 } 2515 xe_bo_unlock(bo); 2516put_bo: 2517 xe_bo_put(bo); 2518} 2519 2520void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p) 2521{ 2522 unsigned long i; 2523 2524 if (!snapshot) 2525 return; 2526 2527 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc); 2528 drm_printf(p, "\tHW Ring address: 0x%08x\n", 2529 snapshot->ring_addr); 2530 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n", 2531 snapshot->indirect_context_desc); 2532 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head); 2533 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n", 2534 snapshot->tail.internal, snapshot->tail.memory); 2535 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start); 2536 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno); 2537 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno); 2538 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp); 2539 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp); 2540 2541 if (!snapshot->lrc_snapshot) 2542 return; 2543 2544 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE); 2545 drm_puts(p, "\t[HWSP].data: "); 2546 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) { 2547 u32 *val = snapshot->lrc_snapshot + i; 2548 char dumped[ASCII85_BUFSZ]; 2549 2550 drm_puts(p, ascii85_encode(*val, dumped)); 2551 } 2552 2553 drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE); 2554 drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset); 2555 drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size); 2556 2557 drm_puts(p, "\t[HWCTX].data: "); 2558 for (; i < snapshot->lrc_size; i += sizeof(u32)) { 2559 u32 *val = snapshot->lrc_snapshot + i; 2560 char dumped[ASCII85_BUFSZ]; 2561 2562 drm_puts(p, ascii85_encode(*val, dumped)); 2563 } 2564 drm_puts(p, "\n"); 2565} 2566 2567void 
xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) 2568{ 2569 if (!snapshot) 2570 return; 2571 2572 kvfree(snapshot->lrc_snapshot); 2573 if (snapshot->lrc_bo) 2574 xe_bo_put(snapshot->lrc_bo); 2575 2576 kfree(snapshot); 2577} 2578 2579static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 2580{ 2581 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 2582 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 2583 struct xe_hw_engine *hwe; 2584 u64 val; 2585 2586 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 2587 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 2588 "Unexpected engine class:instance %d:%d for context utilization\n", 2589 class, instance)) 2590 return -1; 2591 2592 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 2593 val = xe_mmio_read64_2x32(&hwe->gt->mmio, 2594 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2595 else 2596 val = xe_mmio_read32(&hwe->gt->mmio, 2597 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2598 2599 *reg_ctx_ts = val; 2600 2601 return 0; 2602} 2603 2604/** 2605 * xe_lrc_timestamp() - Current ctx timestamp 2606 * @lrc: Pointer to the lrc. 2607 * 2608 * Return latest ctx timestamp. With support for active contexts, the 2609 * calculation may be slightly racy, so follow a read-again logic to ensure that 2610 * the context is still active before returning the right timestamp. 
2611 * 2612 * Returns: New ctx timestamp value 2613 */ 2614u64 xe_lrc_timestamp(struct xe_lrc *lrc) 2615{ 2616 u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp; 2617 u32 engine_id; 2618 2619 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2620 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 2621 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 2622 new_ts = lrc_ts; 2623 goto done; 2624 } 2625 2626 if (lrc_ts == CONTEXT_ACTIVE) { 2627 engine_id = xe_lrc_engine_id(lrc); 2628 if (!get_ctx_timestamp(lrc, engine_id, &reg_ts)) 2629 new_ts = reg_ts; 2630 2631 /* read lrc again to ensure context is still active */ 2632 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2633 } 2634 2635 /* 2636 * If context switched out, just use the lrc_ts. Note that this needs to 2637 * be a separate if condition. 2638 */ 2639 if (lrc_ts != CONTEXT_ACTIVE) 2640 new_ts = lrc_ts; 2641 2642done: 2643 return new_ts; 2644} 2645 2646/** 2647 * xe_lrc_update_timestamp() - Update ctx timestamp 2648 * @lrc: Pointer to the lrc. 2649 * @old_ts: Old timestamp value 2650 * 2651 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 2652 * update saved value. 2653 * 2654 * Returns: New ctx timestamp value 2655 */ 2656u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 2657{ 2658 *old_ts = lrc->ctx_timestamp; 2659 lrc->ctx_timestamp = xe_lrc_timestamp(lrc); 2660 2661 trace_xe_lrc_update_timestamp(lrc, *old_ts); 2662 2663 return lrc->ctx_timestamp; 2664} 2665 2666/** 2667 * xe_lrc_ring_is_idle() - LRC is idle 2668 * @lrc: Pointer to the lrc. 2669 * 2670 * Compare LRC ring head and tail to determine if idle. 2671 * 2672 * Return: True is ring is idle, False otherwise 2673 */ 2674bool xe_lrc_ring_is_idle(struct xe_lrc *lrc) 2675{ 2676 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc); 2677}