Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdkfd: Handle save/restore of lds allocated in 1280B blocks

The gfx-9 trap handler reads the LDS allocation size in 256-byte
granularity (from SQ_WAVE_LDS_ALLOC), but it assumes that
this value is always even (i.e. that the LDS allocation is really done in
multiples of 512 bytes). This was true so far, but gfx-950 allocates LDS
in chunks of 1280 bytes, making this assumption invalid. This can cause
the trap handler to try to save / restore past the end of LDS, and past
the LDS allocated slot in the save area, overwriting data from the
following wave.

This patch updates the trap handler to support LDS allocated in
1280-byte blocks:
- During restore, copy from main memory directly to LDS in batches of 1280
bytes.
- During save, continue to use 512-byte blocks (we only have 2 VGPRs we
can use to hold data), making sure to mask the upper half of the wave
when the LDS size is not a multiple of 512 bytes.

Signed-off-by: Lancelot SIX <lancelot.six@amd.com>
Co-authored-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: Jay Cornwall <jay.cornwall@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Lancelot SIX
Alex Sierra
and committed by
Alex Deucher
5690011a 549120ed

+49 -36
+27 -33
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
··· 4124 4124 }; 4125 4125 4126 4126 static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { 4127 - 0xbf820001, 0xbf8202ea, 4127 + 0xbf820001, 0xbf8202d8, 4128 4128 0xb8f8f802, 0x8978ff78, 4129 4129 0x00020006, 0xb8fbf803, 4130 4130 0x866eff78, 0x00002000, ··· 4321 4321 0xe0724300, 0x701d0300, 4322 4322 0xbefe00c1, 0xbeff00c1, 4323 4323 0xb8fb5306, 0x867bc17b, 4324 - 0xbf840064, 0xbf8a0000, 4324 + 0xbf840052, 0xbf8a0000, 4325 4325 0x867aff6f, 0x04000000, 4326 - 0xbf840060, 0x8e7b867b, 4326 + 0xbf84004e, 0x8e7b867b, 4327 4327 0x8e7b827b, 0xbef6007b, 4328 4328 0xb8f02985, 0x80708170, 4329 4329 0x8e708a70, 0x8e708170, ··· 4336 4336 0x000204c1, 0x867aff78, 4337 4337 0x00400000, 0xbf850003, 4338 4338 0xb8faf803, 0x897a7aff, 4339 - 0x10000000, 0xbf850030, 4340 - 0x24040682, 0xd86e4000, 4339 + 0x10000000, 0xbf85001d, 4340 + 0x24040682, 0xd86c0000, 4341 4341 0x00000002, 0xbf8cc07f, 4342 4342 0xbe840080, 0xd2890000, 4343 4343 0x00000900, 0x80048104, ··· 4348 4348 0x80048104, 0xc069003a, 4349 4349 0x00000070, 0xbf8cc07f, 4350 4350 0x80709070, 0xbf06c004, 4351 - 0xbf84ffee, 0xbe840080, 4352 - 0xd2890000, 0x00000901, 4353 - 0x80048104, 0xd2890001, 4354 - 0x00000901, 0x80048104, 4355 - 0xd2890002, 0x00000901, 4356 - 0x80048104, 0xd2890003, 4357 - 0x00000901, 0x80048104, 4358 - 0xc069003a, 0x00000070, 4359 - 0xbf8cc07f, 0x80709070, 4360 - 0xbf06c004, 0xbf84ffee, 4361 - 0x680404ff, 0x00000200, 4351 + 0xbf84ffee, 0x680404ff, 4352 + 0x00000100, 0xd0c9006a, 4353 + 0x0000f702, 0xbf87ffe5, 4354 + 0xbf820016, 0xd1060002, 4355 + 0x00011103, 0x7e0602ff, 4356 + 0x00000200, 0xbefc00ff, 4357 + 0x00010000, 0xbe800077, 4358 + 0x8677ff77, 0xff7fffff, 4359 + 0x8777ff77, 0x00058000, 4360 + 0xd8ec0000, 0x00000002, 4361 + 0xbf8cc07f, 0xe0765000, 4362 + 0x701d0002, 0x68040702, 4362 4363 0xd0c9006a, 0x0000f702, 4363 - 0xbf87ffd2, 0xbf820015, 4364 - 0xd1060002, 0x00011103, 4365 - 0x7e0602ff, 0x00000200, 4366 - 0xbefc00ff, 0x00010000, 4367 - 0xbe800077, 0x8677ff77, 4368 - 0xff7fffff, 0x8777ff77, 4369 - 
0x00058000, 0xd8ec0000, 4370 - 0x00000002, 0xbf8cc07f, 4371 - 0xe0765000, 0x701d0002, 4372 - 0x68040702, 0xd0c9006a, 4373 - 0x0000f702, 0xbf87fff7, 4364 + 0xbefe016a, 0xbf87fff6, 4374 4365 0xbef70000, 0xbef000ff, 4375 4366 0x00000400, 0xbefe00c1, 4376 4367 0xbeff00c1, 0xb8fb2b05, ··· 4488 4497 0x701d0300, 0x807c847c, 4489 4498 0x8070ff70, 0x00000400, 4490 4499 0xbf0a7b7c, 0xbf85ffeb, 4491 - 0xbf9c0000, 0xbf8200ee, 4500 + 0xbf9c0000, 0xbf8200f4, 4492 4501 0xbef4007e, 0x8675ff7f, 4493 4502 0x0000ffff, 0x8775ff75, 4494 4503 0x00040000, 0xbef60080, 4495 4504 0xbef700ff, 0x00807fac, 4496 4505 0x866eff7f, 0x04000000, 4497 - 0xbf84001f, 0xbefe00c1, 4506 + 0xbf840025, 0xbefe00c1, 4498 4507 0xbeff00c1, 0xb8ef5306, 4499 - 0x866fc16f, 0xbf84001a, 4508 + 0x866fc16f, 0xbf840020, 4500 4509 0x8e6f866f, 0x8e6f826f, 4501 4510 0xbef6006f, 0xb8f82985, 4502 4511 0x80788178, 0x8e788a78, ··· 4507 4516 0x01000000, 0xbefc0080, 4508 4517 0xe0510000, 0x781d0000, 4509 4518 0xe0510100, 0x781d0000, 4510 - 0x807cff7c, 0x00000200, 4511 - 0x8078ff78, 0x00000200, 4512 - 0xbf0a6f7c, 0xbf85fff6, 4519 + 0xe0510200, 0x781d0000, 4520 + 0xe0510300, 0x781d0000, 4521 + 0xe0510400, 0x781d0000, 4522 + 0x807cff7c, 0x00000500, 4523 + 0x8078ff78, 0x00000500, 4524 + 0xbf0a6f7c, 0xbf85fff0, 4513 4525 0xbefe00c1, 0xbeff00c1, 4514 4526 0xbef600ff, 0x01000000, 4515 4527 0xb8ef2b05, 0x806f816f,
+22 -3
drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
··· 75 75 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 76 76 #if ASIC_FAMILY >= CHIP_GC_9_5_0 77 77 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11 78 + var LDS_RESTORE_GRANULARITY_BYTES = 1280 78 79 #else 79 80 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 81 + var LDS_RESTORE_GRANULARITY_BYTES = 512 80 82 #endif 81 83 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 82 84 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits ··· 574 572 575 573 v_lshlrev_b32 v2, 2, v3 576 574 L_SAVE_LDS_LOOP_SQC: 575 + #if ASIC_FAMILY < CHIP_GC_9_5_0 577 576 ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 578 577 s_waitcnt lgkmcnt(0) 579 - 580 578 write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) 581 579 582 580 v_add_u32 v2, 0x200, v2 581 + #else 582 + // gfx950 needs to save in multiple of 256 bytes. 583 + ds_read_b32 v0, v2 584 + s_waitcnt lgkmcnt(0) 585 + write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset) 586 + 587 + v_add_u32 v2, 0x100, v2 588 + #endif 589 + 583 590 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size 584 591 s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC 585 592 ··· 612 601 // v_add_u32 v2, vcc[0:1], v2, v3 613 602 v_add_u32 v2, v2, v3 614 603 v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size 604 + #if ASIC_FAMILY >= CHIP_GC_9_5_0 605 + s_mov_b64 exec, vcc 606 + #endif 615 607 s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR 616 608 617 609 // restore rsrc3 ··· 777 763 L_RESTORE_LDS_LOOP: 778 764 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW 779 765 buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW 780 - s_add_u32 m0, m0, 256*2 // 128 DW 781 - s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW 766 + #if ASIC_FAMILY >= CHIP_GC_9_5_0 767 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW 768 + buffer_load_dword v0, v0, 
s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // forth 64DW 769 + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW 770 + #endif 771 + s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW 772 + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW 782 773 s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 783 774 s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 784 775