amdgpu: fix TLB flush synchronization with global residency list

+13 -4

drivers/amdgpu/cmdstream.cpp

··· 1 1 #include "cmdstream.h" 2 2 3 + #include "impl.h" 3 4 #include "beta.h" 4 5 5 6 void CommandStream::emit(uint32_t x) { ··· 7 8 *cursor++ = x; 8 9 } 9 10 10 - CommandRing::CommandRing(amdgpu_device_handle dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg) 11 + CommandRing::CommandRing(DeviceImpl *dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg) 11 12 : m_dev(dev), m_ctx(ctx), m_ip_type(ip_type), m_cfg(cfg) { 12 13 13 14 amdgpu_bo_alloc_request req = { ··· 17 18 .flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | AMDGPU_GEM_CREATE_UNCACHED // Or WC 18 19 }; 19 20 20 - amdgpu_bo_alloc(m_dev, &req, &m_bo_handle); 21 + amdgpu_bo_alloc(m_dev->amd_handle, &req, &m_bo_handle); 21 22 22 23 void* ptr; 23 24 amdgpu_bo_cpu_map(m_bo_handle, &ptr); 24 25 m_cpu_map = static_cast<uint32_t*>(ptr); 25 26 26 27 amdgpu_va_handle va_handle; 27 - amdgpu_va_range_alloc(m_dev, amdgpu_gpu_va_range_general, m_cfg.ring_size_bytes, 1, 0, &m_gpu_va, &va_handle, 0); 28 - amdgpu_bo_va_op(m_bo_handle, 0, m_cfg.ring_size_bytes, m_gpu_va, AMDGPU_VM_PAGE_READABLE, AMDGPU_VA_OP_MAP); 28 + amdgpu_va_range_alloc(m_dev->amd_handle, amdgpu_gpu_va_range_general, m_cfg.ring_size_bytes, 1, 0, &m_gpu_va, &va_handle, 0); 29 + amdgpu_bo_va_op(m_bo_handle, 0, m_cfg.ring_size_bytes, m_gpu_va, AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_MAP); 30 + 31 + log("command ring alloc: {} {}", (void *)m_cpu_map, (void *)m_gpu_va); 32 + 33 + device_register_allocation(m_dev, m_bo_handle); 29 34 } 30 35 31 36 CommandStream CommandRing::begin_recording() { ··· 62 67 req.ip_type = m_ip_type; 63 68 req.number_of_ibs = 1; 64 69 req.ibs = &ib; 70 + if (m_dev->residency_dirty) { 71 + req.resources = m_dev->global_residency_list; 72 + m_dev->residency_dirty = false; 73 + } 65 74 66 75 auto r = amdgpu_cs_submit(m_ctx, 0, &req, 1); 67 76 if (r != 0) {

+4 -2

drivers/amdgpu/cmdstream.h

··· 6 6 #include <amdgpu.h> 7 7 #include <amdgpu_drm.h> 8 8 9 + struct DeviceImpl; 10 + 9 11 class CommandStream { 10 12 public: 11 13 void emit(uint32_t); ··· 28 30 std::size_t stream_size_bytes = 128 * 1024; // 128KB 29 31 }; 30 32 31 - CommandRing(amdgpu_device_handle dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg); 33 + CommandRing(DeviceImpl *dev, amdgpu_context_handle ctx, uint32_t ip_type, Config cfg); 32 34 ~CommandRing(); 33 35 34 36 CommandRing(const CommandRing&) = delete; ··· 45 47 46 48 void wait_for_space(uint32_t target_dw_offset); 47 49 48 - amdgpu_device_handle m_dev; 50 + DeviceImpl *m_dev; 49 51 amdgpu_context_handle m_ctx; 50 52 uint32_t m_ip_type; 51 53 Config m_cfg;

+8

drivers/amdgpu/impl.h

··· 10 10 11 11 #include "common.h" 12 12 13 + #include <vector> 14 + 13 15 struct DeviceImpl { 14 16 int fd; 15 17 amdgpu_device_handle amd_handle; 18 + 19 + amdgpu_bo_list_handle global_residency_list; 20 + std::vector<amdgpu_bo_handle> all_bos; 21 + bool residency_dirty = true; 16 22 17 23 uint32_t num_queues[AMDGPU_HW_IP_NUM]; 18 24 GpuInfo info; ··· 55 61 void amdgpu_cmd_dispatch(KesCommandList pcl, uint32_t x, uint32_t y, uint32_t z); 56 62 void amdgpu_cmd_dispatch_indirect(KesCommandList pcl, uint64_t indirect_addr); 57 63 } 64 + 65 + void device_register_allocation(DeviceImpl *impl, amdgpu_bo_handle bo); 58 66 59 67 uint32_t hw_ip_type_from_queue_type(KesQueueType qt);

+8

drivers/amdgpu/init.cpp

··· 25 25 return "???"; 26 26 } 27 27 28 + /* Records `bo` in impl->all_bos, sets impl->residency_dirty, and rebuilds impl->global_residency_list from the full BO vector, destroying the previous list first when the handle is non-null. NOTE(review): the result of amdgpu_bo_list_create is ignored, so a failed rebuild goes unnoticed here. NOTE(review): the non-null check assumes global_residency_list starts out null -- confirm that device setup initializes it before the first registration. NOTE(review): the list is recreated on every registration; confirm no in-flight submission can still reference the handle being destroyed. */ void device_register_allocation(DeviceImpl *impl, amdgpu_bo_handle bo) { 29 + impl->all_bos.push_back(bo); 30 + impl->residency_dirty = true; 31 + 32 + if (impl->global_residency_list) amdgpu_bo_list_destroy(impl->global_residency_list); 33 + amdgpu_bo_list_create(impl->amd_handle, impl->all_bos.size(), impl->all_bos.data(), NULL, &impl->global_residency_list); 34 + } 35 + 28 36 KesDevice amdgpu_create(int drm_fd) { 29 37 auto dev = new DeviceImpl; 30 38 dev->fd = drm_fd;

+2 -2

drivers/amdgpu/mem.cpp

··· 43 43 // some systems (DCE) require contiguous addresses as they don't use the MMU Infinity Cache. 44 44 // req.flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; 45 45 46 - log("bo alloc: size: {} align: {}", aligned_size, alignment); 47 - 48 46 if (amdgpu_bo_alloc(dev->amd_handle, &req, &impl->bo) != 0) { 49 47 log("amdgpu_bo_alloc failed"); 50 48 return alloc; ··· 87 85 return alloc; 88 86 } 89 87 } 88 + 89 + device_register_allocation(dev, impl->bo); 90 90 91 91 return alloc; 92 92 }

+1 -1

drivers/amdgpu/queue.cpp

··· 39 39 40 40 // @todo: cleanup: remove this fkn pointer; shit stuff we don't need! 41 41 auto conf = CommandRing::Config{}; 42 - queue->cmd_ring = new CommandRing(dev->amd_handle, queue->ctx_handle, queue->hw_ip_type, conf); 42 + queue->cmd_ring = new CommandRing(dev, queue->ctx_handle, queue->hw_ip_type, conf); 43 43 44 44 return queue; 45 45 }

Configure Feed

Configure Feed