// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>
#include <linux/panic.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_ring_ops.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2

/*
 * Layout of the LRC and associated data allocated as
 * lrc->bo:
 *
 * Region                       Size
 * +============================+=================================+ <- __xe_lrc_ring_offset()
 * | Ring                       | ring_size, see                  |
 * |                            | xe_lrc_init()                   |
 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
 * | PPHWSP (includes SW state) | 4K                              |
 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
 * | Engine Context Image       | n * 4K, see                     |
 * |                            | xe_gt_lrc_size()                |
 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
 * | Indirect Ring State Page   | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
 * | Indirect Context Page      | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_CTX        |
 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
 * | WA BB Per Ctx              | 4k                              |
 * +============================+=================================+ <- xe_bo_size(lrc->bo)
 */
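
/*
 * Worked example of the layout above (illustrative sizes, assuming
 * LRC_WA_BB_SIZE is one 4K page as the diagram suggests): with a 16K ring and
 * a 32K xe_gt_lrc_size() that includes the 4K PPHWSP and a 4K indirect ring
 * state page, plus an indirect context page, the BO is 56K. The ring spans
 * [0, 16K), the PPHWSP starts at 16K, the context registers at 20K, the
 * indirect ring state page at 44K, the indirect context page at 48K and the
 * per-context WA BB at 52K.
 */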

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

static bool
gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);

	if (XE_GT_WA(gt, 16010904313) &&
	    (class == XE_ENGINE_CLASS_RENDER ||
	     class == XE_ENGINE_CLASS_COMPUTE))
		return true;

	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
					       class, NULL))
		return true;

	if (gt->ring_ops[class]->emit_aux_table_inv)
		return true;

	return false;
}

/**
 * xe_gt_lrc_hang_replay_size() - Hang replay size
 * @gt: The GT
 * @class: Hardware engine class
 *
 * Determine the size of the GPU hang replay state for a GT and hardware
 * engine class.
 *
 * Return: Size of the GPU hang replay state
 */
size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size = 0;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 7 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 5 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	return size;
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	size_t size = xe_gt_lrc_hang_replay_size(gt, class);

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size + LRC_PPHWSP_SIZE;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating a MI_LOAD_REGISTER_IMM command, allows setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or of registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: when a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
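
/*
 * For illustration, the head of gen12_xcs_offsets below encodes as these
 * bytes: NOP(1) = 0x81 (bit 7 set: skip one dword), LRI(13, POSTED) = 0x4d
 * (13 registers, force-posted), then REG16(0x244) = { 0x81, 0x11 } and
 * REG(0x034) = 0x0d. In address position, 0x81 contributes its low seven
 * bits and flags a continuation byte, so { 0x81, 0x11 } rebuilds
 * ((0x1 << 7) | 0x11) << 2 = 0x244, while the single byte 0x0d rebuilds
 * 0x0d << 2 = 0x034. The same 0x81 byte thus means "skip" in command
 * position but "continue" in address position.
 */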

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
							    CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_ctx_init() */
	}
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}
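
/*
 * MI_MODE is a masked register (assumed here from its use above): bits
 * [31:16] of a written value select which of bits [15:0] actually take
 * effect. Clearing STOP_RING in the low half while setting STOP_RING << 16
 * in the high half therefore encodes "clear the STOP_RING bit, leave every
 * other bit alone" when this saved value is restored to the register.
 */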

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

#define LRC_SEQNO_OFFSET 0
#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)

u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

/**
 * xe_lrc_engine_state_size() - Get size of the engine state within LRC
 * @gt: the &xe_gt struct instance
 * @class: Hardware engine class
 *
 * Returns: Size of the engine state
 */
size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class)
{
	return xe_gt_lrc_hang_replay_size(gt, class) - xe_lrc_reg_size(gt_to_xe(gt));
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_SEQNO_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_START_SEQNO_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* Parallel context data is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
	struct iosys_map map = bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
\
	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)

#undef DECL_MAP_ADDR_HELPERS

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}

/**
 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
 * for given LRC.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @regs: scratch buffer to be used as temporary storage
 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_gt *gt = hwe->gt;
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(gt_to_xe(gt)))
		return;

	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
}

/*
 * setup_utilization_wa() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
 */
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	return cmd - batch;
}
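
/*
 * Sketch of the query side implied by the comment above (illustrative, not a
 * function in this file): a reader first loads the CTX_TIMESTAMP slot from
 * the LRC; if it still holds CONTEXT_ACTIVE (1), the context has not switched
 * out since the WA BB ran, so the current RING_CTX_TIMESTAMP MMIO of the
 * engine instance recorded in the PPHWSP engine-id slot is used as the
 * utilization value. Any other value means the context already switched out
 * and the saved CTX_TIMESTAMP itself holds the accumulated run ticks.
 */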

static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}

static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
						  struct xe_hw_engine *hwe,
						  u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
						    hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches.
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
						 struct xe_hw_engine *hwe,
						 u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
						   hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches.
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
					       struct xe_hw_engine *hwe,
					       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 18022495364) ||
	    hwe->class != XE_ENGINE_CLASS_RENDER)
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1);
	*cmd++ = CS_DEBUG_MODE2(0).addr;
	*cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);

	return cmd - batch;
}

static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc,
					  struct xe_hw_engine *hwe,
					  u32 *batch, size_t max_len)
{
	struct xe_gt *gt = lrc->gt;
	u32 *(*emit)(struct xe_gt *gt, u32 *cmd) =
		gt->ring_ops[hwe->class]->emit_aux_table_inv;

	if (!emit)
		return 0;

	if (xe_gt_WARN_ON(gt, max_len < 8))
		return -ENOSPC;

	return emit(gt, batch) - batch;
}

struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;
	struct xe_hw_engine *hwe;
	size_t max_size;
	size_t reserve_dw;
	unsigned int offset;
	const struct bo_setup *funcs;
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;
	u32 *ptr;
	unsigned int written;
};

static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}
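
/*
 * Note the contract shared by every bo_setup callback used with setup_bo():
 * each appends at most max_len dwords at the cursor it is handed and returns
 * the number of dwords written (0 when the workaround does not apply,
 * negative errno on overflow). That is what lets setup_bo() chain an
 * arbitrary mix of workarounds into one buffer while still enforcing the
 * caller's reserve_dw tail reservation.
 */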

static void finish_bo(struct bo_setup_state *state)
{
	if (!state->lrc->bo->vmap.is_iomem)
		return;

	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
			 state->offset, state->buffer,
			 state->written * sizeof(u32));
}

/**
 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		.reserve_dw = 1,
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	kfree(buf);

	return ret;
}

static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough.
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}
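
/*
 * Worked example of the CTX_CS_INDIRECT_CTX value written above (sizes
 * illustrative; that the low bits carry the cacheline count is assumed from
 * the math in setup_indirect_ctx()): a batch padded to 320 dwords is 1280
 * bytes, so state.written * sizeof(u32) / 64 = 20 cachelines, and the
 * register is programmed with the page's GGTT address OR'ed with 20. The
 * 63-cacheline cap on max_size keeps that count within the low bits, which
 * are zero for the 4K-aligned buffer.
 */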

static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 state_cache_perf_fix[3];
	int err;

	/*
	 * Init Per-Process HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it;
	 * otherwise it's the early submission to record the lrc: build a new
	 * empty one from scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data)
			return -ENOMEM;

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
	}

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}

static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @replay_state: GPU hang replay state
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return: Pointer to the created LRC upon success and an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc_obj(*lrc);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

/**
 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
 * @lrc: the &xe_lrc struct instance
 */
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	}
}

void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}
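
/*
 * Worked example of the ring-space computation above (ring sizes are powers
 * of two): with size = 16K, head = 0x100 and tail = 0x3f00,
 * ((0x100 - 0x3f00 - 1) & 0x3fff) + 1 = 0x200 bytes are free. The
 * "- 1 ... + 1" wrapping makes the head == tail case report a completely
 * free ring (16K) rather than 0, while a tail one byte behind head reports
 * the minimum of 1.
 */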

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}
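
/*
 * Worked example (illustrative address) of the descriptor returned above for
 * a pre-Xe_HP PPGTT context on RCS instance 0: with the PPHWSP at GGTT
 * address 0x800000, desc = LRC_VALID | FIELD_PREP(LRC_ADDRESSING_MODE,
 * LRC_LEGACY_64B_CONTEXT) | LRC_PRIVILEGE = 0x1 | 0x18 | 0x100 = 0x119, so
 * the returned value is 0x800119; the engine class and instance fields both
 * pack to 0 for this engine.
 */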

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: engine id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
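
/*
 * For example: a MI_LOAD_REGISTER_IMM that loads two registers occupies five
 * dwords (header plus two offset/value pairs), so its header carries
 * 5 - 2 = 3 in XE_INSTR_LEN_MASK and instr_dw() returns 3 + 2 = 5. The
 * GFXPIPE "SINGLE_DW" opcodes and 3DSTATE_SO_DECL_LIST special cases above
 * are the only departures from that encoding handled here.
 */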
1932
1933static int dump_mi_command(struct drm_printer *p,
1934 struct xe_gt *gt,
1935 u32 *start,
1936 u32 *dw,
1937 int remaining_dw)
1938{
1939 u32 inst_header = *dw;
1940 u32 numdw = instr_dw(inst_header);
1941 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
1942 int num_noop;
1943
1944 /* First check for commands that don't have/use a '# DW' field */
1945 switch (inst_header & MI_OPCODE) {
1946 case MI_NOOP:
1947 num_noop = 1;
1948 while (num_noop < remaining_dw &&
1949 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
1950 num_noop++;
1951 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_NOOP (%d dwords)\n",
1952 dw - num_noop - start, inst_header, num_noop);
1953 return num_noop;
1954
1955 case MI_TOPOLOGY_FILTER:
1956 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_TOPOLOGY_FILTER\n",
1957 dw - start, inst_header);
1958 return 1;
1959
1960 case MI_BATCH_BUFFER_END:
1961 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_BATCH_BUFFER_END\n",
1962 dw - start, inst_header);
1963 /* Return 'remaining_dw' to consume the rest of the LRC */
1964 return remaining_dw;
1965 }
1966
1967 /*
1968 * Any remaining commands include a # of dwords. We should make sure
1969 * it doesn't exceed the remaining size of the LRC.
1970 */
1971 if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
1972 numdw = remaining_dw;
1973
1974 switch (inst_header & MI_OPCODE) {
1975 case MI_LOAD_REGISTER_IMM:
1976 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
1977 dw - start, inst_header, (numdw - 1) / 2);
1978 for (int i = 1; i < numdw; i += 2)
1979 drm_printf(p, "LRC[%#5tx] = - %#6x = %#010x\n",
1980 &dw[i] - start, dw[i], dw[i + 1]);
1981 return numdw;
1982
1983 case MI_LOAD_REGISTER_MEM & MI_OPCODE:
1984 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
1985 dw - start, inst_header,
1986 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
1987 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
1988 if (numdw == 4)
1989 drm_printf(p, "LRC[%#5tx] = - %#6x = %#010llx\n",
1990 dw - start,
1991 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
1992 else
1993 drm_printf(p, "LRC[%#5tx] = - %*ph (%s)\n",
1994 dw - start, (int)sizeof(u32) * (numdw - 1),
1995 dw + 1, numdw < 4 ? "truncated" : "malformed");
1996 return numdw;
1997
1998 case MI_FORCE_WAKEUP:
1999 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_FORCE_WAKEUP\n",
2000 dw - start, inst_header);
2001 return numdw;
2002
2003 default:
2004 drm_printf(p, "LRC[%#5tx] = [%#010x] unknown MI opcode %#x, likely %d dwords\n",
2005 dw - start, inst_header, opcode, numdw);
2006 return numdw;
2007 }
2008}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *start,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \
			   dw - start, *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);
	MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2);
	MATCH3D(3DSTATE_CC_STATE_POINTERS_2);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_URB_MEMORY);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   dw - start, *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *start,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   dw - start, *dw, opcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw, *start;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	start = dw;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "LRC[%#5tx] = [%#010x] Unknown instruction of type %#x, likely %d dwords\n",
				   dw - start,
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

/*
 * Lookup the value of a register within the offset/value pairs of an
 * MI_LOAD_REGISTER_IMM instruction.
 *
 * Return -ENOENT if the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	for (int i = 0; i < num_regs; i++) {
		if (dword_pair[2 * i] == offset) {
			*value = dword_pair[2 * i + 1];
			return 0;
		}
	}

	return -ENOENT;
}
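
/*
 * Payload layout assumed above (illustrative): the MI_LRI payload is a flat
 * dword array in which even indices hold register offsets and odd indices
 * hold the values to write, i.e.
 *
 *	dword_pair[] = { reg0_offset, reg0_value, reg1_offset, reg1_value, ... }
 */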

/*
 * Lookup the value of a register in a specific engine type's default LRC.
 *
 * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
 * cannot be found in the default LRC.
 */
int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
				    enum xe_engine_class hwe_class,
				    u32 offset,
				    u32 *value)
{
	u32 *dw;
	int remaining_dw, ret;

	if (!gt->default_lrc[hwe_class])
		return -EINVAL;

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		u32 num_dw = instr_dw(*dw);

		if (num_dw > remaining_dw)
			num_dw = remaining_dw;

		switch (*dw & XE_INSTR_CMD_TYPE) {
		case XE_INSTR_MI:
			switch (*dw & MI_OPCODE) {
			case MI_BATCH_BUFFER_END:
				/* End of LRC; register not found */
				return -ENOENT;

			case MI_NOOP:
			case MI_TOPOLOGY_FILTER:
				/*
				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
				 * a length field and are always 1-dword
				 * instructions.
				 */
				remaining_dw--;
				dw++;
				break;

			case MI_LOAD_REGISTER_IMM:
				ret = lookup_reg_in_mi_lri(offset, value,
							   dw + 1, (num_dw - 1) / 2);
				if (ret == 0)
					return 0;

				fallthrough;

			default:
				/*
				 * Jump to next instruction based on length
				 * field.
				 */
				remaining_dw -= num_dw;
				dw += num_dw;
				break;
			}
			break;

		default:
			/* Jump to next instruction based on length field. */
			remaining_dw -= num_dw;
			dw += num_dw;
		}
	}

	return -ENOENT;
}
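
/*
 * Usage sketch (illustrative, not taken from a real caller): code that needs
 * the hardware default of a context-saved register might do
 *
 *	u32 def;
 *
 *	if (!xe_lrc_lookup_default_reg_value(gt, XE_ENGINE_CLASS_RENDER,
 *					     reg.addr, &def))
 *		use_default(def);
 *
 * where 'reg' and 'use_default()' are hypothetical.
 */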

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
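
/*
 * Note: num_dw above counts the header dword plus payload. The emit loop
 * below stores num_dw - 2 in the header's length field, matching the
 * encoding that instr_dw() decodes; e.g. CMD_3DSTATE_VS with num_dw = 9 is
 * emitted as the instruction header OR'd with 7.
 */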

u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However, to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}
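
/*
 * Call pattern (a sketch, assuming 'cs' points into a sufficiently large
 * buffer being assembled by the caller):
 *
 *	cs = xe_lrc_emit_hwe_state_instructions(q, cs);
 *
 * Only the instruction headers are written; the payload dwords are skipped
 * over untouched, since the goal is merely to have valid headers present so
 * the hardware saves/restores the corresponding state.
 */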

struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}
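
/*
 * Design note: snapshot capture is split into two phases.
 * xe_lrc_snapshot_capture() runs in a context where sleeping is undesirable
 * (hence GFP_NOWAIT) and only records metadata plus a BO reference, while
 * the delayed step above may sleep (GFP_KERNEL, xe_bo_lock(),
 * ttm_bo_vmap()) to copy out the actual LRC contents.
 */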

void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);

	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}

static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}

/**
 * xe_lrc_timestamp() - Current ctx timestamp
 * @lrc: Pointer to the lrc.
 *
 * Return the latest ctx timestamp. With support for active contexts the
 * calculation is slightly racy, so a read-again check is used to make sure
 * the context is still active before trusting the register value.
 *
 * Returns: Latest ctx timestamp value
 */
u64 xe_lrc_timestamp(struct xe_lrc *lrc)
{
	u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp;
	u32 engine_id;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		new_ts = lrc_ts;
		goto done;
	}

	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			new_ts = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs to
	 * be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		new_ts = lrc_ts;

done:
	return new_ts;
}
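
/*
 * The read-again above closes the following window (timeline is
 * illustrative): the LRC reads CONTEXT_ACTIVE, the context then switches out
 * (hardware writes the final timestamp into the LRC), and only afterwards is
 * RING_CTX_TIMESTAMP sampled, by which point the register may already belong
 * to a different context. Re-reading the LRC value detects the switch-out,
 * in which case the saved LRC timestamp is used instead.
 */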

/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the current saved ctx timestamp, read the new ctx
 * timestamp and update the saved value.
 *
 * Returns: New ctx timestamp value
 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
	*old_ts = lrc->ctx_timestamp;
	lrc->ctx_timestamp = xe_lrc_timestamp(lrc);

	trace_xe_lrc_update_timestamp(lrc, *old_ts);

	return lrc->ctx_timestamp;
}

/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if the ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}