queue work: initial memset SDMA command · logdahl.net/kestrel@3520975

logdahl.net / kestrel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

A Modern GPGPU API & wip linux RDNA2+ Driver

rdna driver linux gpu

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

queue work: initial memset SDMA command

Olle Lögdahl 5 months ago 3520975f 55facfaf

+515 -204

16 changed files

expand all collapse all

libvektor

CMakeLists.txt

include

vektor

vektor.h

src

amdgpu

cmdstream.cpp

cmdstream.h

gpuinfo.h

pm4.h

pm4_encoder.cpp

pm4_encoder.h

sdma_encoder.cpp

sdma_encoder.h

beta.h

type_format.h

vektor_cmds.cpp

vektor_impl.h

vektor_queue.cpp

test

02_hello_queue

hello_queue.cpp

+22 -2

libvektor/CMakeLists.txt

reviewed

··· 24 24 fmt::fmt 25 25 ) 26 26 27 27 - add_executable(hello_compute test/01_hello_compute/hello_compute.cpp) 28 28 - target_link_libraries(hello_compute PRIVATE vektor) 27 27 + file(GLOB TEST_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/test ${CMAKE_CURRENT_SOURCE_DIR}/test/*) 28 28 + 29 29 + foreach(test_dir ${TEST_DIRS}) 30 30 + # Ensure we are looking at a directory, not a random file 31 31 + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test/${test_dir}) 32 32 + 33 33 + # 2. Define the executable name based on the folder name 34 34 + set(test_name "test_${test_dir}") 35 35 + 36 36 + # 3. Find all .cpp files within that specific test folder 37 37 + file(GLOB_RECURSE TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test/${test_dir}/*.cpp") 38 38 + 39 39 + if(TEST_SOURCES) 40 40 + add_executable(${test_name} ${TEST_SOURCES}) 41 41 + 42 42 + # 4. Link your library (libvektor) 43 43 + target_link_libraries(${test_name} PRIVATE vektor) 44 44 + 45 45 + message(STATUS "Added test: ${test_name}") 46 46 + endif() 47 47 + endif() 48 48 + endforeach()

+9 -1

libvektor/include/vektor/vektor.h

reviewed

··· 13 13 const char *commit_id; 14 14 }; 15 15 16 16 + enum class QueueType { 17 17 + Graphics, 18 18 + Compute, 19 19 + Transfer 20 20 + }; 21 21 + 16 22 typedef void *Device; 17 23 typedef void *Queue; 18 24 typedef void *CommandList; ··· 38 44 Allocation malloc(Device, std::size_t size, std::size_t align, Memory memory = Memory::Default); 39 45 void free(Device, Allocation &); 40 46 41 41 - Queue create_queue(Device); 47 47 + Queue create_queue(Device, QueueType); 42 48 CommandList start_recording(Queue); 43 49 44 50 void submit(Queue, CommandList); 51 51 + 52 52 + void memset(CommandList, gpuptr_t addr, std::size_t size, uint32_t value); 45 53 46 54 };

+66 -98

libvektor/src/amdgpu/cmdstream.cpp

reviewed

··· 1 1 #include "cmdstream.h" 2 2 + 2 3 #include "beta.h" 3 4 4 4 - 5 5 - #include "gpuinfo.h" 6 6 - #include "amdgfxregs.h" 7 7 - #include "sid.h" 8 8 - #include <amdgpu_drm.h> 9 9 - 10 10 - CommandStream::CommandStream(GpuInfo &info, uint8_t ip_type) : info(info), ip_type(ip_type) { 11 11 - 5 5 + void CommandStream::emit(uint32_t x) { 6 6 + assert(cursor < end, "commandstream emit out of bounds: {}-{} {}", (void *)start, (void *)end, (void *)cursor); 7 7 + *cursor++ = x; 12 8 } 13 9 14 14 - void CommandStream::emit(uint32_t value) { 15 15 - buf.push_back(value); 16 16 - } 10 10 + CommandRing::CommandRing(amdgpu_device_handle dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg) 11 11 + : m_dev(dev), m_ctx(ctx), m_ip_type(ip_type), m_cfg(cfg) { 17 12 18 18 - void CommandStream::emit(std::span<uint32_t> values) { 19 19 - buf.insert(buf.end(), values.begin(), values.end()); 20 20 - } 21 21 - 22 22 - void CommandStream::set_reg_seq(uint32_t reg, uint32_t num, uint32_t idx, uint32_t bank_offset, uint32_t bank_end, uint32_t packet, uint32_t reset_filter_cam) { 23 23 - assert(reg >= bank_offset && reg < bank_end, "register out of range: {}", reg); 24 24 - emit(PKT3(packet, num, 0) | PKT3_RESET_FILTER_CAM_S(reset_filter_cam)); 25 25 - emit(((reg - bank_offset) >> 2) | (idx << 28)); 26 26 - } 27 27 - 28 28 - void CommandStream::set_reg(uint32_t reg, uint32_t idx, uint32_t value, uint32_t bank_offset, uint32_t bank_end, uint32_t packet) { 29 29 - set_reg_seq(reg, 1, idx, bank_offset, bank_end, packet, 0); 30 30 - emit(value); 31 31 - } 32 32 - 33 33 - void CommandStream::set_config_reg_seq(uint32_t reg, uint32_t num) { 34 34 - set_reg_seq(reg, num, 0, SI_CONFIG_REG_OFFSET, SI_CONFIG_REG_END, PKT3_SET_CONFIG_REG, 0); 35 35 - } 13 13 + amdgpu_bo_alloc_request req = { 14 14 + .alloc_size = m_cfg.ring_size_bytes, 15 15 + .phys_alignment = 4096, 16 16 + .preferred_heap = AMDGPU_GEM_DOMAIN_GTT, 17 17 + .flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | AMDGPU_GEM_CREATE_UNCACHED // Or WC 18 18 + }; 36 19 37 37 - void CommandStream::set_config_reg(uint32_t reg, uint32_t value) { 38 38 - set_reg(reg, 0, value, SI_CONFIG_REG_OFFSET, SI_CONFIG_REG_END, PKT3_SET_CONFIG_REG); 39 39 - } 20 20 + amdgpu_bo_alloc(m_dev, &req, &m_bo_handle); 40 21 41 41 - void CommandStream::set_uconfig_reg_seq(uint32_t reg, uint32_t num) { 42 42 - set_reg_seq(reg, num, 0, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG, 0); 43 43 - } 22 22 + void* ptr; 23 23 + amdgpu_bo_cpu_map(m_bo_handle, &ptr); 24 24 + m_cpu_map = static_cast<uint32_t*>(ptr); 44 25 45 45 - void CommandStream::set_uconfig_reg(uint32_t reg, uint32_t value) { 46 46 - set_reg(reg, 0, value, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG); 26 26 + amdgpu_va_range_alloc(m_dev, amdgpu_gpu_va_range_general, m_cfg.ring_size_bytes, 1, 0, &m_gpu_va, nullptr, 0); 27 27 + amdgpu_bo_va_op(m_bo_handle, 0, m_cfg.ring_size_bytes, m_gpu_va, AMDGPU_VM_PAGE_READABLE, AMDGPU_VA_OP_MAP); 47 28 } 48 29 49 49 - void CommandStream::set_uconfig_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 50 50 - set_reg(reg, idx, value, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG_INDEX); 51 51 - } 30 30 + CommandStream CommandRing::begin_recording() { 31 31 + uint32_t stream_dw = m_cfg.stream_size_bytes / 4; 52 32 53 53 - /* 54 54 - * On GFX10, there is a bug with the ME implementation of its content 55 55 - * addressable memory (CAM), that means that it can skip register writes due 56 56 - * to not taking correctly into account the fields from the GRBM_GFX_INDEX. 57 57 - * With this __filter_cam_workaround bit we can force the write. 58 58 - */ 59 59 - void CommandStream::set_uconfig_perfctr_reg_seq(uint32_t reg, uint32_t num) { 60 60 - bool filter_cam_workaround = (info.gfx_level > GfxLevel::GFX10) && ip_type == AMDGPU_HW_IP_GFX; 61 61 - set_reg_seq(reg, num, 0, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG, filter_cam_workaround); 62 62 - } 33 33 + if (m_write_cursor_dw + stream_dw > (m_cfg.ring_size_bytes / 4)) { 34 34 + m_write_cursor_dw = 0; 35 35 + } 63 36 64 64 - void CommandStream::set_uconfig_perfctr_reg(uint32_t reg, uint32_t value) { 65 65 - set_uconfig_perfctr_reg_seq(reg, 1); 66 66 - emit(value); 67 67 - } 37 37 + wait_for_space(m_write_cursor_dw); 68 38 69 69 - void CommandStream::set_context_reg_seq(uint32_t reg, uint32_t num) { 70 70 - set_reg_seq(reg, num, 0, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG, 0); 71 71 - } 39 39 + CommandStream cs; 40 40 + cs.cursor = m_cpu_map + m_write_cursor_dw; 41 41 + cs.end = cs.cursor + stream_dw; 42 42 + cs.gpu_va_start = m_gpu_va + (m_write_cursor_dw * 4); 72 43 73 73 - void CommandStream::set_context_reg(uint32_t reg, uint32_t value) { 74 74 - set_reg(reg, 0, value, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG); 44 44 + return cs; 75 45 } 76 46 77 77 - void CommandStream::set_context_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 78 78 - set_reg(reg, idx, value, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG); 79 79 - } 47 47 + void CommandRing::submit(CommandStream& cs) { 48 48 + uint32_t start_dw = cs.end - (m_cfg.stream_size_bytes / 4) - m_cpu_map; 49 49 + start_dw = (reinterpret_cast<uint8_t*>(cs.end) - reinterpret_cast<uint8_t*>(m_cpu_map) - m_cfg.stream_size_bytes) / 4; 80 50 81 81 - void CommandStream::set_sh_reg_seq(uint32_t reg, uint32_t num) { 82 82 - set_reg_seq(reg, num, 0, SI_SH_REG_OFFSET, SI_SH_REG_END, PKT3_SET_SH_REG, 0); 83 83 - } 51 51 + uint32_t count_dw = cs.cursor - (cs.end - (m_cfg.stream_size_bytes / 4)); 84 52 85 85 - void CommandStream::set_sh_reg(uint32_t reg, uint32_t value) { 86 86 - set_reg(reg, 0, value, SI_SH_REG_OFFSET, SI_SH_REG_END, PKT3_SET_SH_REG); 87 87 - } 53 53 + amdgpu_cs_ib_info ib = {}; 54 54 + //ib.handle = m_bo_handle; 55 55 + ib.ib_mc_address = cs.gpu_va_start; 56 56 + ib.size = count_dw; 88 57 89 89 - void CommandStream::set_sh_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 90 90 - uint32_t opcode = PKT3_SET_SH_REG_INDEX; 91 91 - set_reg(reg, idx, value, SI_SH_REG_OFFSET, SI_SH_REG_END, opcode); 92 92 - } 58 58 + amdgpu_cs_request req = {}; 59 59 + req.ip_type = m_ip_type; 60 60 + req.number_of_ibs = 1; 61 61 + req.ibs = &ib; 93 62 94 94 - void CommandStream::emit_32bit_pointer(uint32_t sh_offset, uint64_t va) { 95 95 - assert(va == 0 || (va >> 32) == info.address32_hi, "va outside valid range: {}", va); 96 96 - set_sh_reg(sh_offset, va); 97 97 - } 63 63 + if (amdgpu_cs_submit(m_ctx, 0, &req, 1) == 0) { 64 64 + amdgpu_cs_fence fence = {}; 65 65 + fence.context = m_ctx; 66 66 + fence.ip_type = m_ip_type; 67 67 + // @todo: syncronization... 98 68 99 99 - void CommandStream::emit_64bit_pointer(uint32_t sh_offset, uint64_t va) { 100 100 - set_sh_reg(sh_offset, 2); 101 101 - emit(va); 102 102 - emit(va >> 32); 69 69 + m_history.push_back({start_dw, start_dw + (uint32_t)(m_cfg.stream_size_bytes/4), fence}); 70 70 + m_write_cursor_dw += (m_cfg.stream_size_bytes / 4); 71 71 + } 103 72 } 104 73 105 105 - void CommandStream::event_write_predicate(uint32_t event_type, bool predicate) { 106 106 - emit(PKT3(PKT3_EVENT_WRITE, 0, predicate)); 107 107 - auto ev_index = event_type == V_028A90_VS_PARTIAL_FLUSH || 108 108 - event_type == V_028A90_PS_PARTIAL_FLUSH || 109 109 - event_type == V_028A90_CS_PARTIAL_FLUSH ? 4 : 110 110 - event_type == V_028A90_PIXEL_PIPE_STAT_CONTROL ? 1 : 0; 111 111 - emit(EVENT_TYPE(event_type) | EVENT_INDEX(ev_index)); 112 112 - } 74 74 + void CommandRing::wait_for_space(uint32_t target_dw) { 75 75 + /* 76 76 + while (!m_history.empty()) { 77 77 + auto& oldest = m_history.front(); 113 78 114 114 - void CommandStream::event_write(uint32_t event_type) { 115 115 - event_write_predicate(event_type, false); 79 79 + // If target overlaps with the oldest pending submission 80 80 + if (target_dw >= oldest.start_dw && target_dw < oldest.end_dw) { 81 81 + uint32_t expired = 0; 82 82 + amdgpu_cs_wait_fences(&oldest.fence, 1, true, 1000000000, &expired, nullptr); 83 83 + m_history.pop_front(); 84 84 + } else { 85 85 + break; 86 86 + } 87 87 + } 88 88 + */ 116 89 } 117 90 118 118 - void CommandStream::set_privileged_config_reg(uint32_t reg, uint32_t value) { 119 119 - assert(reg < CIK_UCONFIG_REG_OFFSET, "reg outside valid range for privileged config: {}", reg); 120 120 - emit(PKT3(PKT3_COPY_DATA, 4, 0)); 121 121 - emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_PERF)); 122 122 - emit(value); 123 123 - emit(0); 124 124 - emit(reg >> 2); 125 125 - emit(0); 91 91 + CommandRing::~CommandRing() { 92 92 + amdgpu_bo_cpu_unmap(m_bo_handle); 93 93 + amdgpu_bo_free(m_bo_handle); 126 94 }

+41 -38

libvektor/src/amdgpu/cmdstream.h

reviewed

··· 1 1 #pragma once 2 2 3 3 - #include <span> 4 4 - #include <vector> 3 3 + #include <cstdint> 4 4 + #include <deque> 5 5 6 6 - #include "gpuinfo.h" 7 7 - // #include "pm4.h" 6 6 + #include <amdgpu.h> 7 7 + #include <amdgpu_drm.h> 8 8 9 9 class CommandStream { 10 10 public: 11 11 - CommandStream(GpuInfo &info, uint8_t ip_type); 11 11 + void emit(uint32_t); 12 12 + std::size_t size_dw() const { return cursor - start; } 13 13 + private: 14 14 + uint32_t *start; 15 15 + uint32_t *end; 16 16 + uint32_t *cursor; 17 17 + uint64_t gpu_va_start; 12 18 13 13 - void emit(uint32_t value); 14 14 - void emit(std::span<uint32_t> values); 19 19 + friend class CommandRing; 20 20 + }; 15 21 16 16 - void emit_32bit_pointer(uint32_t sh_offset, uint64_t va); 17 17 - void emit_64bit_pointer(uint32_t sh_offset, uint64_t va); 22 22 + // @todo: think about syncronization... 18 23 19 19 - // @todo: figure out better sizes for reg, num, value, ... 24 24 + class CommandRing { 25 25 + public: 26 26 + struct Config { 27 27 + std::size_t ring_size_bytes = 2 * 1024 * 1024; // 2MB 28 28 + std::size_t stream_size_bytes = 128 * 1024; // 128KB 29 29 + }; 20 30 21 21 - // Packet building helpers for CONFIG registers. 22 22 - void set_config_reg_seq(uint32_t reg, uint32_t num); 23 23 - void set_config_reg(uint32_t reg, uint32_t value); 31 31 + CommandRing(amdgpu_device_handle dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg); 32 32 + ~CommandRing(); 24 33 25 25 - // Packet building helpers for UCONFIG registers. 26 26 - void set_uconfig_reg_seq(uint32_t reg, uint32_t num); 27 27 - void set_uconfig_reg(uint32_t reg, uint32_t value); 28 28 - void set_uconfig_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 34 34 + CommandRing(const CommandRing&) = delete; 29 35 30 30 - void set_uconfig_perfctr_reg_seq(uint32_t reg, uint32_t num); 31 31 - void set_uconfig_perfctr_reg(uint32_t reg, uint32_t value); 36 36 + CommandStream begin_recording(); 37 37 + void submit(CommandStream& cs); 32 38 33 33 - // Packet building helpers for CONTEXT registers. 34 34 - void set_context_reg_seq(uint32_t reg, uint32_t num); 35 35 - void set_context_reg(uint32_t reg, uint32_t value); 36 36 - void set_context_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 39 39 + private: 40 40 + struct Submission { 41 41 + uint32_t start_dw; 42 42 + uint32_t end_dw; 43 43 + amdgpu_cs_fence fence; 44 44 + }; 37 45 38 38 - // Packet building helpers for SH registers. 39 39 - void set_sh_reg_seq(uint32_t reg, uint32_t num); 40 40 - void set_sh_reg(uint32_t reg, uint32_t value); 41 41 - void set_sh_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 46 46 + void wait_for_space(uint32_t target_dw_offset); 42 47 43 43 - void event_write_predicate(uint32_t event_type, bool predicate); 44 44 - void event_write(uint32_t event_type); 48 48 + amdgpu_device_handle m_dev; 49 49 + amdgpu_context_handle m_ctx; 50 50 + uint32_t m_ip_type; 51 51 + Config m_cfg; 45 52 46 46 - void set_privileged_config_reg(uint32_t reg, uint32_t value); 47 47 - private: 48 48 - void set_reg_seq(uint32_t reg, uint32_t num, uint32_t idx, uint32_t bank_offset, uint32_t bank_end, uint32_t packet, uint32_t reset_filter_cam); 49 49 - void set_reg(uint32_t reg, uint32_t idx, uint32_t value, uint32_t bank_offset, uint32_t bank_end, uint32_t packet); 53 53 + amdgpu_bo_handle m_bo_handle; 54 54 + uint64_t m_gpu_va; 55 55 + uint32_t* m_cpu_map; 50 56 51 51 - GpuInfo &info; 52 52 - uint8_t ip_type; 53 53 - 54 54 - std::vector<uint32_t> buf; 55 55 - bool context_roll; 57 57 + uint32_t m_write_cursor_dw = 0; 58 58 + std::deque<Submission> m_history; 56 59 };

+10

libvektor/src/amdgpu/gpuinfo.h

reviewed

··· 8 8 GFX10_3 9 9 }; 10 10 11 11 + #define SDMA_VERSION_VALUE(major, minor) (((major) << 8) | (minor)) 12 12 + 13 13 + enum class SDMAVersion { 14 14 + /* GFX10.3 */ 15 15 + SDMA_5_2 = SDMA_VERSION_VALUE(5, 2), 16 16 + /* GFX11 */ 17 17 + SDMA_6_0 = SDMA_VERSION_VALUE(6, 0), 18 18 + }; 19 19 + 11 20 struct GpuInfo { 12 21 GfxLevel gfx_level; 13 22 uint32_t address32_hi; 23 23 + SDMAVersion sdma_version; 14 24 };

-42

libvektor/src/amdgpu/pm4.h

reviewed

··· 1 1 - #pragma once 2 2 - 3 3 - #include <vector> 4 4 - #include <cstdint> 5 5 - #include <span> 6 6 - 7 7 - class Pm4State { 8 8 - public: 9 9 - static Pm4State create_sized(bool debug_sqtt, 10 10 - unsigned max_dw, bool is_compute_queue); 11 11 - 12 12 - void set_reg(unsigned reg, uint32_t val); 13 13 - void set_reg_custom(unsigned reg, uint32_t val, 14 14 - unsigned opcode, unsigned idx); 15 15 - void set_reg_idx3(unsigned reg, uint32_t val); 16 16 - void clear_state(/* const struct radeon_info *info @todo, */ 17 17 - bool debug_sqtt, bool is_compute_queue); 18 18 - void cmd_begin(unsigned opcode); 19 19 - void cmd_add(uint32_t dw); 20 20 - void cmd_end(bool predicate); 21 21 - 22 22 - void finalize(); 23 23 - void emit(CommandStream &cs); 24 24 - 25 25 - private: 26 26 - uint16_t last_reg; /* register offset in dwords */ 27 27 - uint16_t last_pm4; 28 28 - uint16_t ndw; /* number of dwords in pm4 */ 29 29 - uint8_t last_opcode; 30 30 - uint8_t last_idx; 31 31 - bool is_compute_queue; 32 32 - bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */ 33 33 - 34 34 - /* commands for the DE */ 35 35 - uint16_t max_dw; 36 36 - 37 37 - /* Used by SQTT to override the shader address */ 38 38 - bool debug_sqtt; 39 39 - uint32_t spi_shader_pgm_lo_reg; 40 40 - 41 41 - std::vector<uint32_t> pm4; 42 42 - };

+126

libvektor/src/amdgpu/pm4_encoder.cpp

reviewed

··· 1 1 + #include "pm4_encoder.h" 2 2 + #include "beta.h" 3 3 + 4 4 + 5 5 + #include "gpuinfo.h" 6 6 + #include "amdgfxregs.h" 7 7 + #include "sid.h" 8 8 + #include <amdgpu_drm.h> 9 9 + 10 10 + Pm4Encoder::Pm4Encoder(GpuInfo &info, uint8_t ip_type) : info(info), ip_type(ip_type) { 11 11 + 12 12 + } 13 13 + 14 14 + void Pm4Encoder::emit(uint32_t value) { 15 15 + buf.push_back(value); 16 16 + } 17 17 + 18 18 + void Pm4Encoder::emit(std::span<uint32_t> values) { 19 19 + buf.insert(buf.end(), values.begin(), values.end()); 20 20 + } 21 21 + 22 22 + void Pm4Encoder::set_reg_seq(uint32_t reg, uint32_t num, uint32_t idx, uint32_t bank_offset, uint32_t bank_end, uint32_t packet, uint32_t reset_filter_cam) { 23 23 + assert(reg >= bank_offset && reg < bank_end, "register out of range: {}", reg); 24 24 + emit(PKT3(packet, num, 0) | PKT3_RESET_FILTER_CAM_S(reset_filter_cam)); 25 25 + emit(((reg - bank_offset) >> 2) | (idx << 28)); 26 26 + } 27 27 + 28 28 + void Pm4Encoder::set_reg(uint32_t reg, uint32_t idx, uint32_t value, uint32_t bank_offset, uint32_t bank_end, uint32_t packet) { 29 29 + set_reg_seq(reg, 1, idx, bank_offset, bank_end, packet, 0); 30 30 + emit(value); 31 31 + } 32 32 + 33 33 + void Pm4Encoder::set_config_reg_seq(uint32_t reg, uint32_t num) { 34 34 + set_reg_seq(reg, num, 0, SI_CONFIG_REG_OFFSET, SI_CONFIG_REG_END, PKT3_SET_CONFIG_REG, 0); 35 35 + } 36 36 + 37 37 + void Pm4Encoder::set_config_reg(uint32_t reg, uint32_t value) { 38 38 + set_reg(reg, 0, value, SI_CONFIG_REG_OFFSET, SI_CONFIG_REG_END, PKT3_SET_CONFIG_REG); 39 39 + } 40 40 + 41 41 + void Pm4Encoder::set_uconfig_reg_seq(uint32_t reg, uint32_t num) { 42 42 + set_reg_seq(reg, num, 0, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG, 0); 43 43 + } 44 44 + 45 45 + void Pm4Encoder::set_uconfig_reg(uint32_t reg, uint32_t value) { 46 46 + set_reg(reg, 0, value, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG); 47 47 + } 48 48 + 49 49 + void Pm4Encoder::set_uconfig_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 50 50 + set_reg(reg, idx, value, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG_INDEX); 51 51 + } 52 52 + 53 53 + /* 54 54 + * On GFX10, there is a bug with the ME implementation of its content 55 55 + * addressable memory (CAM), that means that it can skip register writes due 56 56 + * to not taking correctly into account the fields from the GRBM_GFX_INDEX. 57 57 + * With this __filter_cam_workaround bit we can force the write. 58 58 + */ 59 59 + void Pm4Encoder::set_uconfig_perfctr_reg_seq(uint32_t reg, uint32_t num) { 60 60 + bool filter_cam_workaround = (info.gfx_level > GfxLevel::GFX10) && ip_type == AMDGPU_HW_IP_GFX; 61 61 + set_reg_seq(reg, num, 0, CIK_UCONFIG_REG_OFFSET, CIK_UCONFIG_REG_END, PKT3_SET_UCONFIG_REG, filter_cam_workaround); 62 62 + } 63 63 + 64 64 + void Pm4Encoder::set_uconfig_perfctr_reg(uint32_t reg, uint32_t value) { 65 65 + set_uconfig_perfctr_reg_seq(reg, 1); 66 66 + emit(value); 67 67 + } 68 68 + 69 69 + void Pm4Encoder::set_context_reg_seq(uint32_t reg, uint32_t num) { 70 70 + set_reg_seq(reg, num, 0, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG, 0); 71 71 + } 72 72 + 73 73 + void Pm4Encoder::set_context_reg(uint32_t reg, uint32_t value) { 74 74 + set_reg(reg, 0, value, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG); 75 75 + } 76 76 + 77 77 + void Pm4Encoder::set_context_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 78 78 + set_reg(reg, idx, value, SI_CONTEXT_REG_OFFSET, SI_CONTEXT_REG_END, PKT3_SET_CONTEXT_REG); 79 79 + } 80 80 + 81 81 + void Pm4Encoder::set_sh_reg_seq(uint32_t reg, uint32_t num) { 82 82 + set_reg_seq(reg, num, 0, SI_SH_REG_OFFSET, SI_SH_REG_END, PKT3_SET_SH_REG, 0); 83 83 + } 84 84 + 85 85 + void Pm4Encoder::set_sh_reg(uint32_t reg, uint32_t value) { 86 86 + set_reg(reg, 0, value, SI_SH_REG_OFFSET, SI_SH_REG_END, PKT3_SET_SH_REG); 87 87 + } 88 88 + 89 89 + void Pm4Encoder::set_sh_reg_idx(uint32_t reg, uint32_t idx, uint32_t value) { 90 90 + uint32_t opcode = PKT3_SET_SH_REG_INDEX; 91 91 + set_reg(reg, idx, value, SI_SH_REG_OFFSET, SI_SH_REG_END, opcode); 92 92 + } 93 93 + 94 94 + void Pm4Encoder::emit_32bit_pointer(uint32_t sh_offset, uint64_t va) { 95 95 + assert(va == 0 || (va >> 32) == info.address32_hi, "va outside valid range: {}", va); 96 96 + set_sh_reg(sh_offset, va); 97 97 + } 98 98 + 99 99 + void Pm4Encoder::emit_64bit_pointer(uint32_t sh_offset, uint64_t va) { 100 100 + set_sh_reg(sh_offset, 2); 101 101 + emit(va); 102 102 + emit(va >> 32); 103 103 + } 104 104 + 105 105 + void Pm4Encoder::event_write_predicate(uint32_t event_type, bool predicate) { 106 106 + emit(PKT3(PKT3_EVENT_WRITE, 0, predicate)); 107 107 + auto ev_index = event_type == V_028A90_VS_PARTIAL_FLUSH || 108 108 + event_type == V_028A90_PS_PARTIAL_FLUSH || 109 109 + event_type == V_028A90_CS_PARTIAL_FLUSH ? 4 : 110 110 + event_type == V_028A90_PIXEL_PIPE_STAT_CONTROL ? 1 : 0; 111 111 + emit(EVENT_TYPE(event_type) | EVENT_INDEX(ev_index)); 112 112 + } 113 113 + 114 114 + void Pm4Encoder::event_write(uint32_t event_type) { 115 115 + event_write_predicate(event_type, false); 116 116 + } 117 117 + 118 118 + void Pm4Encoder::set_privileged_config_reg(uint32_t reg, uint32_t value) { 119 119 + assert(reg < CIK_UCONFIG_REG_OFFSET, "reg outside valid range for privileged config: {}", reg); 120 120 + emit(PKT3(PKT3_COPY_DATA, 4, 0)); 121 121 + emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_PERF)); 122 122 + emit(value); 123 123 + emit(0); 124 124 + emit(reg >> 2); 125 125 + emit(0); 126 126 + }

+62

libvektor/src/amdgpu/pm4_encoder.h

reviewed

··· 1 1 + #pragma once 2 2 + 3 3 + #include <span> 4 4 + #include <vector> 5 5 + 6 6 + #include "gpuinfo.h" 7 7 + // #include "pm4.h" 8 8 + 9 9 + // @todo: consider redesigning this.. 10 10 + // instead of having multiple CmdStream types; let a commandstream 11 11 + // just be some kind of buffer (std::vector). We can rename this 12 12 + // to "PM4 Encoder" or something, directly adding to a provided 13 13 + // buffer. 14 14 + 15 15 + class Pm4Encoder { 16 16 + public: 17 17 + Pm4Encoder(GpuInfo &info, uint8_t ip_type); 18 18 + 19 19 + void emit(uint32_t value); 20 20 + void emit(std::span<uint32_t> values); 21 21 + 22 22 + void emit_32bit_pointer(uint32_t sh_offset, uint64_t va); 23 23 + void emit_64bit_pointer(uint32_t sh_offset, uint64_t va); 24 24 + 25 25 + // @todo: figure out better sizes for reg, num, value, ... 26 26 + 27 27 + // Packet building helpers for CONFIG registers. 28 28 + void set_config_reg_seq(uint32_t reg, uint32_t num); 29 29 + void set_config_reg(uint32_t reg, uint32_t value); 30 30 + 31 31 + // Packet building helpers for UCONFIG registers. 32 32 + void set_uconfig_reg_seq(uint32_t reg, uint32_t num); 33 33 + void set_uconfig_reg(uint32_t reg, uint32_t value); 34 34 + void set_uconfig_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 35 35 + 36 36 + void set_uconfig_perfctr_reg_seq(uint32_t reg, uint32_t num); 37 37 + void set_uconfig_perfctr_reg(uint32_t reg, uint32_t value); 38 38 + 39 39 + // Packet building helpers for CONTEXT registers. 40 40 + void set_context_reg_seq(uint32_t reg, uint32_t num); 41 41 + void set_context_reg(uint32_t reg, uint32_t value); 42 42 + void set_context_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 43 43 + 44 44 + // Packet building helpers for SH registers. 45 45 + void set_sh_reg_seq(uint32_t reg, uint32_t num); 46 46 + void set_sh_reg(uint32_t reg, uint32_t value); 47 47 + void set_sh_reg_idx(uint32_t reg, uint32_t idx, uint32_t value); 48 48 + 49 49 + void event_write_predicate(uint32_t event_type, bool predicate); 50 50 + void event_write(uint32_t event_type); 51 51 + 52 52 + void set_privileged_config_reg(uint32_t reg, uint32_t value); 53 53 + private: 54 54 + void set_reg_seq(uint32_t reg, uint32_t num, uint32_t idx, uint32_t bank_offset, uint32_t bank_end, uint32_t packet, uint32_t reset_filter_cam); 55 55 + void set_reg(uint32_t reg, uint32_t idx, uint32_t value, uint32_t bank_offset, uint32_t bank_end, uint32_t packet); 56 56 + 57 57 + GpuInfo &info; 58 58 + uint8_t ip_type; 59 59 + 60 60 + std::vector<uint32_t> buf; 61 61 + bool context_roll; 62 62 + };

+27

libvektor/src/amdgpu/sdma_encoder.cpp

reviewed

··· 1 1 + #include "sdma_encoder.h" 2 2 + 3 3 + #include "cmdstream.h" 4 4 + #include "sid.h" 5 5 + 6 6 + SDMAEncoder::SDMAEncoder(GpuInfo &info, CommandStream &cs) : info(info), cs(cs) {} 7 7 + 8 8 + #define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) 9 9 + 10 10 + #define BITFIELD64_BIT(b) (uint64_t(1) << (b)) 11 11 + 12 12 + #define BITFIELD64_MASK(b) \ 13 13 + ((b) == 64 ? (~uint64_t(0)) : BITFIELD64_BIT((b) & 63) - 1) 14 14 + 15 15 + uint64_t SDMAEncoder::constant_fill(uint64_t va, uint64_t size, uint32_t value) { 16 16 + const uint32_t fill_size = 2; 17 17 + const uint64_t max_fill_size = BITFIELD64_MASK(info.sdma_version >= SDMAVersion::SDMA_6_0 ? 30 : 22) & ~0x3; 18 18 + const uint64_t bytes_written = MIN2(size, max_fill_size); 19 19 + 20 20 + cs.emit(SDMA_PACKET(SDMA_OPCODE_CONSTANT_FILL, 0, 0) | (fill_size << 30)); 21 21 + cs.emit(va); 22 22 + cs.emit(va >> 32); 23 23 + cs.emit(value); 24 24 + cs.emit(bytes_written - 1); 25 25 + 26 26 + return bytes_written; 27 27 + }

+17

libvektor/src/amdgpu/sdma_encoder.h

reviewed

··· 1 1 + #pragma once 2 2 + 3 3 + #include "cmdstream.h" 4 4 + #include "gpuinfo.h" 5 5 + #include <vector> 6 6 + #include <span> 7 7 + 8 8 + class SDMAEncoder { 9 9 + public: 10 10 + SDMAEncoder(GpuInfo &info, CommandStream &cs); 11 11 + 12 12 + // returns the number of bytes written; may need to be repeated. 13 13 + uint64_t constant_fill(uint64_t va, uint64_t size, uint32_t value); 14 14 + private: 15 15 + GpuInfo &info; 16 16 + CommandStream &cs; 17 17 + };

+3 -2

libvektor/src/beta.h

reviewed

··· 2 2 3 3 #include <fmt/base.h> 4 4 #include <fmt/format.h> 5 5 + #include "type_format.h" 5 6 6 7 template <typename... T> 7 7 - void not_implemented(fmt::format_string<T...> fmt, T&&... args) { 8 8 + [[noreturn]] void not_implemented(fmt::format_string<T...> fmt, T&&... args) { 8 9 auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 9 10 printf("not implemented: %s\n", s); 10 11 exit(1); 11 12 } 12 13 13 14 template <typename... T> 14 14 - void panic(fmt::format_string<T...> fmt, T&&... args) { 15 15 + [[noreturn]] void panic(fmt::format_string<T...> fmt, T&&... args) { 15 16 auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 16 17 printf("panic: %s\n", s); 17 18 exit(1);

+18

libvektor/src/type_format.h

reviewed

··· 1 1 + #pragma once 2 2 + 3 3 + #include "vektor/vektor.h" 4 4 + 5 5 + #include <string> 6 6 + namespace vektor { 7 7 + 8 8 + inline std::string format_as(vektor::QueueType qt) { 9 9 + switch(qt) { 10 10 + case QueueType::Graphics: return "graphics"; 11 11 + case QueueType::Compute: return "compute"; 12 12 + case QueueType::Transfer: return "transfer"; 13 13 + default: 14 14 + return "unknown"; 15 15 + } 16 16 + } 17 17 + 18 18 + }

+34

libvektor/src/vektor_cmds.cpp

reviewed

··· 1 1 + #include "amdgpu/sdma_encoder.h" 2 2 + #include "vektor/vektor.h" 3 3 + #include "vektor_impl.h" 4 4 + 5 5 + #include "beta.h" 6 6 + #include <fmt/format.h> 7 7 + 8 8 + namespace vektor { 9 9 + 10 10 + void memset_transfer(CommandListImpl *impl, gpuptr_t addr, std::size_t size, uint32_t value) { 11 11 + assert(impl->queue->type == QueueType::Transfer, "memset_transfer requires queue of Transfer type"); 12 12 + 13 13 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 14 14 + 15 15 + while (size > 0) { 16 16 + uint64_t bytes_written = enc.constant_fill(addr, size, value); 17 17 + size -= bytes_written; 18 18 + addr += bytes_written; 19 19 + } 20 20 + } 21 21 + 22 22 + void memset(CommandList pcl, gpuptr_t addr, std::size_t size, uint32_t value) { 23 23 + auto *cl = (CommandListImpl *)pcl; 24 24 + 25 25 + switch(cl->queue->type) { 26 26 + case QueueType::Transfer: 27 27 + memset_transfer(cl, addr, size, value); 28 28 + break; 29 29 + default: 30 30 + not_implemented("vektor::memset not implemented for queue type: {}", fmt::underlying(cl->queue->type)); 31 31 + } 32 32 + } 33 33 + 34 34 + }

+22

libvektor/src/vektor_impl.h

reviewed

··· 1 1 #pragma once 2 2 3 3 + #include "vektor/vektor.h" 4 4 + 3 5 #include <amdgpu.h> 4 6 #include <amdgpu_drm.h> 5 7 8 8 + #include "amdgpu/cmdstream.h" 6 9 #include "amdgpu/gpuinfo.h" 7 10 11 11 + namespace vektor { 12 12 + 8 13 struct DeviceImpl { 9 14 int fd; 10 15 amdgpu_device_handle amd_handle; 11 16 12 17 GpuInfo info; 13 18 }; 19 19 + 20 20 + struct QueueImpl { 21 21 + amdgpu_context_handle ctx_handle; 22 22 + DeviceImpl *dev; 23 23 + QueueType type; 24 24 + 25 25 + uint32_t hw_ip_type; 26 26 + 27 27 + CommandRing *cmd_ring; 28 28 + }; 29 29 + 30 30 + struct CommandListImpl { 31 31 + QueueImpl *queue; 32 32 + CommandStream cs; 33 33 + }; 34 34 + 35 35 + }

+27 -21

libvektor/src/vektor_queue.cpp

reviewed

··· 1 1 #include "vektor/vektor.h" 2 2 #include "vektor_impl.h" 3 3 + #include "beta.h" 3 4 #include <cstdint> 4 5 #include <vector> 5 6 6 7 namespace vektor { 7 8 8 8 - struct QueueImpl { 9 9 - amdgpu_context_handle ctx_handle; 10 10 - uint32_t ctx_id; 11 11 - }; 12 12 - 13 13 - struct CommandListImpl { 14 14 - std::vector<uint32_t> commands; 15 15 - 16 16 - }; 9 9 + uint32_t hw_ip_type_from_queue_type(QueueType qt) { 10 10 + switch(qt) { 11 11 + case QueueType::Graphics: return AMDGPU_HW_IP_GFX; 12 12 + case QueueType::Compute: return AMDGPU_HW_IP_COMPUTE; 13 13 + case QueueType::Transfer: return AMDGPU_HW_IP_DMA; 14 14 + default: 15 15 + not_implemented("no HW_IP type picked for queue type: {}", qt); 16 16 + } 17 17 + } 17 18 18 18 - Queue create_queue(Device pd) { 19 19 + Queue create_queue(Device pd, QueueType qt) { 19 20 auto *dev = (DeviceImpl *)pd; 20 21 21 22 auto queue = new QueueImpl; 23 23 + queue->dev = dev; 24 24 + queue->type = qt; 25 25 + queue->hw_ip_type = hw_ip_type_from_queue_type(qt); 22 26 27 27 + // @todo: consider creating ctx at device initialization? 23 28 int r = amdgpu_cs_ctx_create(dev->amd_handle, &queue->ctx_handle); 24 29 if (r != 0) { 25 30 delete queue; 26 31 return nullptr; 27 32 } 28 33 29 29 - drm_amdgpu_ctx_in ctx_in = {}; 30 30 - ctx_in.op = AMDGPU_CTX_OP_ALLOC_CTX; 31 31 - 32 32 - drm_amdgpu_ctx ctx_args = {}; 33 33 - ctx_args.in = ctx_in; 34 34 - 35 35 - // r = drmCommandWriteRead(dev->fd, DRM_AMDGPU_CTX, &ctx_args, sizeof(ctx_args)); 36 36 - // if (r == 0) { 37 37 - // queue->ctx_id = ctx_args.out.alloc.ctx_id; 38 38 - // } 34 34 + // @todo: cleanup: remove this fkn pointer; shit stuff we don't need! 35 35 + auto conf = CommandRing::Config{}; 36 36 + queue->cmd_ring = new CommandRing(dev->amd_handle, queue->ctx_handle, queue->hw_ip_type, conf); 39 37 40 38 return queue; 41 39 } 42 40 43 41 CommandList start_recording(Queue pq) { 42 42 + auto *queue = (QueueImpl *)pq; 44 43 auto cl = new CommandListImpl; 45 44 46 46 - return nullptr; 45 45 + cl->queue = queue; 46 46 + cl->cs = queue->cmd_ring->begin_recording(); 47 47 + 48 48 + return cl; 47 49 } 48 50 49 51 void submit(Queue pq, CommandList pcl) { 52 52 + auto *queue = (QueueImpl *)pq; 53 53 + auto *cl = (CommandListImpl *)pcl; 54 54 + assert(cl->queue == queue, "submit: commandlist from foreign queue"); 50 55 56 56 + queue->cmd_ring->submit(cl->cs); 51 57 } 52 58 53 59 }

+31

libvektor/test/02_hello_queue/hello_queue.cpp

reviewed

··· 1 1 + #include <unistd.h> 2 2 + #include <vektor/vektor.h> 3 3 + 4 4 + #include <stdio.h> 5 5 + 6 6 + int main(void) { 7 7 + 8 8 + auto version = vektor::version(); 9 9 + printf("vektor %s (%s)\n", version.version, version.commit_id); 10 10 + 11 11 + auto dev = vektor::create(); 12 12 + 13 13 + std::size_t size = 10 * 1024 * 1024; 14 14 + auto x = vektor::malloc(dev, size); 15 15 + 16 16 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 17 17 + 18 18 + auto dma = vektor::create_queue(dev, vektor::QueueType::Transfer); 19 19 + auto l1 = vektor::start_recording(dma); 20 20 + 21 21 + vektor::memset(l1, x.gpu, size, 1); 22 22 + 23 23 + vektor::submit(dma, l1); 24 24 + 25 25 + // @todo: how to wait on cpu for DMA transfer? TODO? 26 26 + 27 27 + vektor::free(dev, x); 28 28 + vektor::destroy(dev); 29 29 + 30 30 + return 0; 31 31 + }