A modern GPGPU API and a work-in-progress Linux driver for RDNA2+ GPUs
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

amdgpu: implement memcpy for transfer queue

+87
+27
drivers/amdgpu/cmds.cpp
··· 28 28 } 29 29 } 30 30 31 + void memcpy_transfer(CommandListImpl *impl, kes_gpuptr_t dst, kes_gpuptr_t src, std::size_t size) { 32 + assert(impl->queue->type == KesQueueTypeTransfer, "memcpy_transfer: requires queue of Transfer type"); 33 + 34 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 35 + 36 + // @todo: tmz? 37 + while (size > 0) { 38 + uint64_t bytes_written = enc.copy_linear(src, dst, size, false); 39 + size -= bytes_written; 40 + src += bytes_written; 41 + dst += bytes_written; 42 + } 43 + } 44 + 45 + void amdgpu_cmd_memcpy(KesCommandList pcl, kes_gpuptr_t dst, kes_gpuptr_t src, size_t size) { 46 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 47 + assert(cl, "memcpy: command list handle invalid: {}", (void *)pcl); 48 + 49 + switch(cl->queue->type) { 50 + case KesQueueTypeTransfer: 51 + memcpy_transfer(cl, dst, src, size); 52 + break; 53 + default: 54 + not_implemented("memcpy: not implemented for queue type: {}", cl->queue->type); 55 + } 56 + } 57 + 31 58 SDMAAtomicOp sdma_atomic_op_map(KesSignal sig) { 32 59 switch(sig) { 33 60 case KesSignalAtomicSet:
+28
drivers/amdgpu/gpuinfo.h
#define SDMA_VERSION_VALUE(major, minor) (((major) << 8) | (minor))

/**
 * SDMA IP versions.
 *
 * Each enumerator is encoded as (major << 8) | minor, so the enumerators are
 * numerically ordered by (major, minor) and can be compared with the
 * relational operators.
 */
enum class SDMAVersion {
    SDMA_UNKNOWN = 0,

    SDMA_1_0 = SDMA_VERSION_VALUE(1, 0), // GFX6

    SDMA_2_0 = SDMA_VERSION_VALUE(2, 0), // GFX7

    SDMA_2_4 = SDMA_VERSION_VALUE(2, 4), // GFX8
    SDMA_3_0 = SDMA_VERSION_VALUE(3, 0), // GFX8
    SDMA_3_1 = SDMA_VERSION_VALUE(3, 1), // GFX8

    SDMA_4_0 = SDMA_VERSION_VALUE(4, 0), // GFX9
    SDMA_4_1 = SDMA_VERSION_VALUE(4, 1), // GFX9
    SDMA_4_2 = SDMA_VERSION_VALUE(4, 2), // GFX9
    SDMA_4_4 = SDMA_VERSION_VALUE(4, 4), // GFX9

    SDMA_5_0 = SDMA_VERSION_VALUE(5, 0), // GFX10

    SDMA_5_2 = SDMA_VERSION_VALUE(5, 2), // GFX10.3

    SDMA_6_0 = SDMA_VERSION_VALUE(6, 0), // GFX11

    SDMA_6_1 = SDMA_VERSION_VALUE(6, 1), // GFX11.5

    SDMA_7_0 = SDMA_VERSION_VALUE(7, 0), // GFX12
};
+1
drivers/amdgpu/impl.h
··· 47 47 void amdgpu_submit(KesQueue, KesCommandList); 48 48 49 49 void amdgpu_cmd_memset(KesCommandList, kes_gpuptr_t addr, size_t size, uint32_t value); 50 + void amdgpu_cmd_memcpy(KesCommandList, kes_gpuptr_t dst, kes_gpuptr_t src, size_t size); 50 51 void amdgpu_cmd_write_timestamp(KesCommandList, kes_gpuptr_t addr); 51 52 52 53 void amdgpu_cmd_signal_after(KesCommandList, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal);
+1
drivers/amdgpu/interface.cpp
··· 15 15 fns->fn_start_recording = amdgpu_start_recording; 16 16 fns->fn_submit = amdgpu_submit; 17 17 fns->fn_cmd_memset = amdgpu_cmd_memset; 18 + fns->fn_cmd_memcpy = amdgpu_cmd_memcpy; 18 19 fns->fn_cmd_write_timestamp = amdgpu_cmd_write_timestamp; 19 20 fns->fn_cmd_signal_after = amdgpu_cmd_signal_after; 20 21 fns->fn_cmd_wait_before = amdgpu_cmd_wait_before;
+29
drivers/amdgpu/sdma_encoder.cpp
··· 86 86 87 87 return bytes_written; 88 88 } 89 + 90 + uint64_t SDMAEncoder::copy_linear(uint64_t src_va, uint64_t dst_va, uint64_t size, bool tmz) { 91 + const unsigned max_size_per_packet = 92 + info.sdma_version >= SDMAVersion::SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES; 93 + uint32_t align = ~0u; 94 + 95 + /* SDMA FW automatically enables a faster dword copy mode when 96 + * source, destination and size are all dword-aligned. 97 + * 98 + * When source and destination are dword-aligned, round down the size to 99 + * take advantage of faster dword copy, and copy the remaining few bytes 100 + * with the last copy packet. 101 + */ 102 + if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) { 103 + align = ~0x3u; 104 + } 105 + 106 + const uint64_t bytes_written = size >= 4 ? MIN2(size & align, max_size_per_packet) : size; 107 + 108 + cs.emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, (tmz ? 4 : 0))); 109 + cs.emit(info.sdma_version >= SDMAVersion::SDMA_4_0 ? bytes_written - 1 : bytes_written); 110 + cs.emit(0); 111 + cs.emit(src_va); 112 + cs.emit(src_va >> 32); 113 + cs.emit(dst_va); 114 + cs.emit(dst_va >> 32); 115 + 116 + return bytes_written; 117 + }
+1
drivers/amdgpu/sdma_encoder.h
··· 30 30 31 31 // returns the number of bytes written; may need to be repeated. 32 32 uint64_t constant_fill(uint64_t va, uint64_t size, uint32_t value); 33 + uint64_t copy_linear(uint64_t src_va, uint64_t dst_va, uint64_t size, bool tmz); 33 34 private: 34 35 GpuInfo &info; 35 36 CommandStream &cs;