A modern GPGPU API and work-in-progress Linux RDNA2+ driver
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

amazing progress!

+328 -19
+5 -1
docs/conf.py
··· 28 28 29 29 # -- Options for HTML output ------------------------------------------------- 30 30 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 - html_theme = "alabaster" 31 + 32 + html_theme = "sphinx_rtd_theme" 32 33 html_static_path = ["_static"] 33 34 html_sidebars = { 34 35 "**": [ ··· 39 40 "donate.html", 40 41 ] 41 42 } 43 + html_theme_options = { 44 + "display_version": "true", 45 + }
+1 -1
libvektor/CMakeLists.txt
··· 26 26 27 27 add_sanitizers(vektor) 28 28 29 - file(GLOB TEST_DIRS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/test ${CMAKE_CURRENT_SOURCE_DIR}/test/*) 29 + file(GLOB TEST_DIRS CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}/test ${CMAKE_CURRENT_SOURCE_DIR}/test/*) 30 30 31 31 foreach(test_dir ${TEST_DIRS}) 32 32 # Ensure we are looking at a directory, not a random file
+4 -1
libvektor/include/vektor/vektor.h
··· 39 39 40 40 enum HazardFlags { 41 41 // @todo 42 + None = 0, 42 43 DrawArguments = 1 << 0, 43 44 Descriptors = 1 << 1 44 45 }; ··· 77 78 78 79 void memset(CommandList, gpuptr_t addr, std::size_t size, uint32_t value); 79 80 81 + void write_timestamp(CommandList, gpuptr_t addr); 82 + 80 83 void signal_after(CommandList, Stage before, gpuptr_t ptr, uint64_t value, Signal); 81 - void wait_before(CommandList, Stage after, gpuptr_t ptr, uint64_t value, Op, HazardFlags hazard = 0, uint64 mask = ~0); 84 + void wait_before(CommandList, Stage after, gpuptr_t ptr, uint64_t value, Op, HazardFlags hazard = HazardFlags::None, uint64_t mask = ~0); 82 85 void wait_semaphore(Semaphore, uint64_t value); 83 86 void signal_semaphore(Semaphore, uint64_t value); 84 87
+8 -2
libvektor/src/amdgpu/cmdstream.cpp
··· 56 56 ib.ib_mc_address = cs.gpu_va_start; 57 57 ib.size = count_dw; 58 58 59 + auto next_point = m_timeline_counter++; 60 + 59 61 amdgpu_cs_request req = {}; 60 62 req.ip_type = m_ip_type; 61 63 req.number_of_ibs = 1; 62 64 req.ibs = &ib; 63 65 64 - if (amdgpu_cs_submit(m_ctx, 0, &req, 1) == 0) { 66 + auto r = amdgpu_cs_submit(m_ctx, 0, &req, 1); 67 + if (r != 0) { 68 + warn("submit failed: (ctx: {}) {}", (void *)m_ctx, r); 69 + } 70 + if (r == 0) { 65 71 amdgpu_cs_fence fence = {}; 66 72 fence.context = m_ctx; 67 73 fence.ip_type = m_ip_type; 68 74 // @todo: syncronization... 69 75 70 - m_history.push_back({start_dw, start_dw + (uint32_t)(m_cfg.stream_size_bytes/4), fence}); 76 + m_history.push_back({start_dw, start_dw + (uint32_t)(m_cfg.stream_size_bytes/4), next_point}); 71 77 m_write_cursor_dw += (m_cfg.stream_size_bytes / 4); 72 78 } 73 79 }
+14
libvektor/src/amdgpu/gpuinfo.h
··· 2 2 3 3 #include <cstdint> 4 4 5 + #include "amdgpu_drm.h" 6 + 5 7 enum class GfxLevel { 6 8 GFX9, 7 9 GFX10, ··· 17 19 SDMA_6_0 = SDMA_VERSION_VALUE(6, 0), 18 20 }; 19 21 22 + struct IpInfo { 23 + uint8_t ver_major; 24 + uint8_t ver_minor; 25 + uint8_t ver_rev; 26 + uint8_t num_queues; 27 + uint8_t num_instances; 28 + uint32_t ib_alignment; 29 + uint32_t ib_pad_dw_mask; 30 + }; 31 + 20 32 struct GpuInfo { 21 33 GfxLevel gfx_level; 22 34 uint32_t address32_hi; 23 35 SDMAVersion sdma_version; 36 + 37 + IpInfo ip[AMDGPU_HW_IP_NUM]; 24 38 };
+34 -3
libvektor/src/amdgpu/sdma_encoder.cpp
··· 13 13 ((b) == 64 ? (~uint64_t(0)) : BITFIELD64_BIT((b) & 63) - 1) 14 14 15 15 void SDMAEncoder::write_timestamp(uint64_t va) { 16 - SDMA_OPCODE_SEMAPHORE 17 16 cs.emit(SDMA_PACKET(SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0)); 18 17 cs.emit(va); 19 18 cs.emit(va >> 32); ··· 33 32 cs.emit(fence); 34 33 } 35 34 36 - void SDMAEncoder::wait_mem(uint32_t op, uint64_t va, uint32_t ref, uint32_t mask) { 37 - cs.emit(SDMA_PACKET(SDMA_OPCODE_POLL_REGMEM, 0, 0) | op << 28 | SDMA_POLL_MEM); 35 + #define SDMA_OPCODE_ATOMIC 0xa 36 + 37 + #define SDMA_L2_POLICY_LRU 0 38 + #define SDMA_L2_POLICY_STREAM 1 39 + #define SDMA_L2_POLICY_UC 3 40 + #define SDMA_LLC_POLICY_MALL 0 41 + #define SDMA_LLC_POLICY_BYPASS 1 42 + #define SDMA_CACHE_POLICY(l2, llc) ((uint32_t)(llc) << 2 | (uint32_t)(l2)) 43 + 44 + #define SDMA_POLL_CACHE_POLICY(x) ((uint32_t)(x) << 20) 45 + #define SDMA_POLL_CPV (1u << 24) 46 + #define SDMA_POLL_HDP_FLUSH (1u << 26) 47 + 48 + #define SDMA_ATOMIC_OP(x) ((uint32_t)(x) << 25) 49 + #define SDMA_ATOMIC_CACHE_POLICY(x) ((uint32_t)(x) << 20) 50 + #define SDMA_ATOMIC_CPV (1u << 24) 51 + 52 + void SDMAEncoder::atomic(SDMAAtomicOp op, uint64_t va, uint64_t value) { 53 + uint32_t cache_policy = SDMA_CACHE_POLICY(SDMA_L2_POLICY_UC, SDMA_LLC_POLICY_BYPASS); 54 + cs.emit(SDMA_PACKET(SDMA_OPCODE_ATOMIC, 0, 0) | SDMA_ATOMIC_CPV | SDMA_ATOMIC_CACHE_POLICY(cache_policy) | SDMA_ATOMIC_OP(op)); 55 + cs.emit(va); 56 + cs.emit(va >> 32); 57 + cs.emit(value); 58 + cs.emit(value >> 32); 59 + cs.emit(0); 60 + cs.emit(0); 61 + cs.emit(0); 62 + } 63 + 64 + void SDMAEncoder::wait_mem(SDMAWaitMemOp op, uint64_t va, uint32_t ref, uint32_t mask) { 65 + uint32_t cache_policy = SDMA_CACHE_POLICY(SDMA_L2_POLICY_UC, SDMA_LLC_POLICY_BYPASS); 66 + cs.emit( 67 + SDMA_PACKET(SDMA_OPCODE_POLL_REGMEM, 0, 0) | (uint32_t)op << 28 | SDMA_POLL_MEM 68 + | SDMA_POLL_HDP_FLUSH | SDMA_POLL_CPV | SDMA_POLL_CACHE_POLICY(cache_policy)); 38 69 cs.emit(va); 39 70 cs.emit(va >> 32); 40 71 
cs.emit(ref);
+15 -1
libvektor/src/amdgpu/sdma_encoder.h
··· 5 5 #include <vector> 6 6 #include <span> 7 7 8 + enum class SDMAAtomicOp { 9 + Swap = 0x67, 10 + Add = 0x6f, 11 + Sub = 0x70, 12 + UMin = 0x72, 13 + UMax = 0x74, 14 + Or = 0x76, 15 + }; 16 + 17 + enum class SDMAWaitMemOp { 18 + Equal = 0x3 19 + }; 20 + 8 21 class SDMAEncoder { 9 22 public: 10 23 SDMAEncoder(GpuInfo &info, CommandStream &cs); ··· 12 25 void write_timestamp(uint64_t va); 13 26 void semaphore(uint64_t va); 14 27 void fence(uint64_t va, uint32_t fence); 15 - void wait_mem(uint32_t op, uint64_t va, uint32_t ref, uint32_t mask); 28 + void atomic(SDMAAtomicOp op, uint64_t va, uint64_t value); 29 + void wait_mem(SDMAWaitMemOp op, uint64_t va, uint32_t ref, uint32_t mask); 16 30 17 31 // returns the number of bytes written; may need to be repeated. 18 32 uint64_t constant_fill(uint64_t va, uint64_t size, uint32_t value);
+8 -2
libvektor/src/beta.h
··· 7 7 template <typename... T> 8 8 [[noreturn]] void not_implemented(fmt::format_string<T...> fmt, T&&... args) { 9 9 auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 10 - printf("not implemented: %s\n", s); 10 + printf("not implemented: %s\n", s.c_str()); 11 11 exit(1); 12 12 } 13 13 14 14 template <typename... T> 15 15 [[noreturn]] void panic(fmt::format_string<T...> fmt, T&&... args) { 16 16 auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 17 - printf("panic: %s\n", s); 17 + printf("panic: %s\n", s.c_str()); 18 18 exit(1); 19 19 } 20 20 ··· 22 22 void log(fmt::format_string<T...> fmt, T&&... args) { 23 23 auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 24 24 printf("%s\n", s.c_str()); 25 + } 26 + 27 + template <typename... T> 28 + void warn(fmt::format_string<T...> fmt, T&&... args) { 29 + auto s = fmt::vformat(fmt, fmt::make_format_args(args...)); 30 + printf("warn: %s\n", s.c_str()); 25 31 } 26 32 27 33 template <typename... T>
+20
libvektor/src/type_format.h
··· 15 15 } 16 16 } 17 17 18 + inline std::string format_as(vektor::Signal sig) { 19 + switch(sig) { 20 + case Signal::AtomicSet: return "atomic_set"; 21 + case Signal::AtomicMax: return "atomic_max"; 22 + case Signal::AtomicOr: return "atomic_or"; 23 + default: 24 + return "unknown"; 25 + } 26 + } 27 + 28 + inline std::string format_as(vektor::Op op) { 29 + switch(op) { 30 + case Op::Never: return "never"; 31 + case Op::Less: return "less"; 32 + case Op::Equal: return "equal"; 33 + default: 34 + return "unknown"; 35 + } 36 + } 37 + 18 38 }
+92 -2
libvektor/src/vektor_cmds.cpp
··· 8 8 namespace vektor { 9 9 10 10 void memset_transfer(CommandListImpl *impl, gpuptr_t addr, std::size_t size, uint32_t value) { 11 - assert(impl->queue->type == QueueType::Transfer, "memset_transfer requires queue of Transfer type"); 11 + assert(impl->queue->type == QueueType::Transfer, "memset_transfer: requires queue of Transfer type"); 12 12 13 13 SDMAEncoder enc(impl->queue->dev->info, impl->cs); 14 14 ··· 21 21 22 22 void memset(CommandList pcl, gpuptr_t addr, std::size_t size, uint32_t value) { 23 23 auto *cl = (CommandListImpl *)pcl; 24 + assert(cl, "memset: command list handle invalid: {}", (void *)pcl); 24 25 25 26 switch(cl->queue->type) { 26 27 case QueueType::Transfer: 27 28 memset_transfer(cl, addr, size, value); 28 29 break; 29 30 default: 30 - not_implemented("vektor::memset not implemented for queue type: {}", fmt::underlying(cl->queue->type)); 31 + not_implemented("memset: not implemented for queue type: {}", cl->queue->type); 32 + } 33 + } 34 + 35 + SDMAAtomicOp sdma_atomic_op_map(Signal sig) { 36 + switch(sig) { 37 + case Signal::AtomicSet: 38 + return SDMAAtomicOp::Swap; 39 + case Signal::AtomicMax: 40 + return SDMAAtomicOp::UMax; 41 + case Signal::AtomicOr: 42 + return SDMAAtomicOp::Or; 43 + default: 44 + not_implemented("sdma_atomic_op_map: no mapping for {}", sig); 45 + } 46 + } 47 + 48 + SDMAWaitMemOp sdma_waitmem_op_map(Op op) { 49 + switch(op) { 50 + case Op::Equal: 51 + return SDMAWaitMemOp::Equal; 52 + default: 53 + not_implemented("sdma_waitmem_op_map: no mapping for {}", op); 54 + } 55 + } 56 + 57 + void wait_before_transfer(CommandListImpl *impl, gpuptr_t ptr, uint64_t value, Op op, uint64_t mask) { 58 + assert(impl->queue->type == QueueType::Transfer, "wait_before_transfer: requires queue of Transfer type"); 59 + 60 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 61 + 62 + auto func = sdma_waitmem_op_map(op); 63 + 64 + // @todo: NOTE: this only writes the low 32-bits of value. 
I do not know how we should do this, it 65 + // seems the hardware doesn't support this. 66 + enc.wait_mem(func, ptr, value & 0xFFFFFFFF, mask & 0xFFFFFFFF); 67 + } 68 + 69 + void wait_before(CommandList pcl, Stage after, gpuptr_t ptr, uint64_t value, Op op, HazardFlags hazard, uint64_t mask) { 70 + auto *cl = (CommandListImpl *)pcl; 71 + assert(cl, "wait_before: command list handle invalid: {}", (void *)pcl); 72 + 73 + switch(cl->queue->type) { 74 + case QueueType::Transfer: 75 + wait_before_transfer(cl, ptr, value, op, mask); 76 + break; 77 + default: 78 + not_implemented("wait_before: not implemented for queue type: {}", cl->queue->type); 79 + } 80 + } 81 + 82 + void signal_after_transfer(CommandListImpl *impl, gpuptr_t ptr, uint64_t value, Signal sig) { 83 + assert(impl->queue->type == QueueType::Transfer, "signal_after_transfer: requires queue of Transfer type"); 84 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 85 + 86 + auto op = sdma_atomic_op_map(sig); 87 + 88 + enc.atomic(op, ptr, value); 89 + } 90 + 91 + void signal_after(CommandList pcl, Stage before, gpuptr_t ptr, uint64_t value, Signal sig) { 92 + auto *cl = (CommandListImpl *)pcl; 93 + assert(cl, "signal_after: command list handle invalid: {}", (void *)pcl); 94 + 95 + switch(cl->queue->type) { 96 + case QueueType::Transfer: 97 + signal_after_transfer(cl, ptr, value, sig); 98 + break; 99 + default: 100 + not_implemented("wait_before: not implemented for queue type: {}", cl->queue->type); 101 + } 102 + } 103 + 104 + void write_timestamp_transfer(CommandListImpl *impl, gpuptr_t ptr) { 105 + assert(impl->queue->type == QueueType::Transfer, "write_timestamp_transfer: requires queue of Transfer type"); 106 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 107 + 108 + enc.write_timestamp(ptr); 109 + } 110 + 111 + void write_timestamp(CommandList pcl, gpuptr_t ptr) { 112 + auto *cl = (CommandListImpl *)pcl; 113 + assert(cl, "write_timestamp: command list handle invalid: {}", (void *)pcl); 114 + 115 
+ switch(cl->queue->type) { 116 + case QueueType::Transfer: 117 + write_timestamp_transfer(cl, ptr); 118 + break; 119 + default: 120 + not_implemented("write_timestamp: not implemented for queue type: {}", cl->queue->type); 31 121 } 32 122 } 33 123
+17 -4
libvektor/src/vektor_init.cpp
··· 9 9 #include <cstdint> 10 10 #include <cstdio> 11 11 #include <string> 12 + #include <bit> 12 13 #include <unistd.h> 13 14 #include <fcntl.h> 14 15 ··· 63 64 } 64 65 65 66 log("amdgpu drm loaded: {}.{}", major, minor); 67 + auto &info = dev->info; 68 + info.gfx_level = GfxLevel::GFX10_3; 66 69 67 70 amdgpu_gpu_info gpu_info; 68 71 if(amdgpu_query_gpu_info(dev->amd_handle, &gpu_info) != 0) { 69 72 panic("amdgpu_query_gpu_info failed."); 70 73 } 74 + log("amdgpu family: {}", amdgpu_family_str(gpu_info.family_id).c_str()); 71 75 72 - log("amdgpu family: {}", amdgpu_family_str(gpu_info.family_id).c_str()); 76 + for (auto ip_type = 0; ip_type < AMDGPU_HW_IP_NUM; ++ip_type) { 77 + auto &ip = info.ip[ip_type]; 78 + drm_amdgpu_info_hw_ip ip_info; 79 + amdgpu_query_hw_ip_info(dev->amd_handle, ip_type, 0, &ip_info); 80 + 81 + ip.num_queues = std::popcount(ip_info.available_rings); 82 + 83 + uint32_t num_instances; 84 + if (amdgpu_query_hw_ip_count(dev->amd_handle, ip_type, &num_instances) != 0) { 85 + ip.num_instances = num_instances; 86 + } 87 + } 73 88 74 89 drm_amdgpu_info_hw_ip gfx_ip_info; 75 90 amdgpu_query_hw_ip_info(dev->amd_handle, AMDGPU_HW_IP_GFX, 0, &gfx_ip_info); ··· 78 93 gfx_ip_info.hw_ip_version_major, 79 94 gfx_ip_info.hw_ip_version_minor); 80 95 81 - 82 - auto &info = dev->info; 83 - info.gfx_level = GfxLevel::GFX10_3; 96 + log("num queues: sdma: {}, {}", info.ip[AMDGPU_HW_IP_DMA].num_queues, info.ip[AMDGPU_HW_IP_DMA].num_instances); 84 97 85 98 if(amdgpu_query_sw_info(dev->amd_handle, amdgpu_sw_info_address32_hi, &info.address32_hi) != 0) { 86 99 panic("andgpu_query_sw_info(amdgpu_sw_info_address32_hi) failed.");
+1 -1
libvektor/src/vektor_queue.cpp
··· 55 55 56 56 auto *semaphore = (SemaphoreImpl *)sem; 57 57 58 - queue->cmd_ring->submit(cl->cs, semaphore, value); 58 + queue->cmd_ring->submit(cl->cs); //, semaphore, value); 59 59 60 60 // @todo: to free commandlist, we want to be sure that it is no longer mapped and stuff. 61 61 // then, we can freely-free it. But i think this needs some deferred-cleanup, as
libvektor/test/01_hello_compute/hello_compute.cpp libvektor/test/01_hello_malloc/hello_malloc.cpp
+8 -1
libvektor/test/02_hello_queue/hello_queue.cpp
··· 12 12 13 13 std::size_t size = 10 * 1024 * 1024; 14 14 auto x = vektor::malloc(dev, size); 15 + auto y = vektor::malloc(dev, 8); 15 16 16 17 printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 18 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 17 19 18 20 auto dma = vektor::create_queue(dev, vektor::QueueType::Transfer); 21 + 19 22 auto l1 = vektor::start_recording(dma); 20 - 21 23 vektor::memset(l1, x.gpu, size, 1); 24 + vektor::wait_before(l1, vektor::Stage::Transfer, y.gpu, 1337, vektor::Op::Equal); 25 + vektor::memset(l1, x.gpu, size, 2); 22 26 23 27 vektor::submit(dma, l1); 24 28 25 29 // @todo: how to wait on cpu for DMA transfer? TODO? 26 30 printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 31 + sleep(1); 32 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 33 + *((uint32_t *)y.cpu) = 1337; 27 34 sleep(1); 28 35 printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 29 36
+60
libvektor/test/03_hello_2queue/hello_2queue.cpp
··· 1 + #include <unistd.h> 2 + #include <vektor/vektor.h> 3 + 4 + #include <stdio.h> 5 + 6 + int main(void) { 7 + 8 + auto version = vektor::version(); 9 + printf("vektor %s (%s)\n", version.version, version.commit_id); 10 + 11 + auto dev = vektor::create(); 12 + 13 + std::size_t size = 10 * 1024 * 1024; 14 + auto x = vektor::malloc(dev, size); 15 + auto y = vektor::malloc(dev, 8); 16 + auto ts = vektor::malloc(dev, 8 * 5); 17 + 18 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 19 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 20 + 21 + auto dma1 = vektor::create_queue(dev, vektor::QueueType::Transfer); 22 + auto dma2 = vektor::create_queue(dev, vektor::QueueType::Transfer); 23 + 24 + auto l1 = vektor::start_recording(dma1); 25 + { 26 + vektor::write_timestamp(l1, ts.gpu + 0); 27 + vektor::memset(l1, x.gpu, size, 1); 28 + vektor::write_timestamp(l1, ts.gpu + 8); 29 + vektor::signal_after(l1, vektor::Stage::Transfer, y.gpu, 1337, vektor::Signal::AtomicMax); 30 + } 31 + 32 + auto l2 = vektor::start_recording(dma2); 33 + { 34 + vektor::write_timestamp(l2, ts.gpu + 16); 35 + vektor::wait_before(l2, vektor::Stage::Transfer, y.gpu, 1337, vektor::Op::Equal); 36 + vektor::write_timestamp(l2, ts.gpu + 24); 37 + vektor::memset(l2, x.gpu, size, 2); 38 + vektor::write_timestamp(l2, ts.gpu + 32); 39 + } 40 + 41 + vektor::submit(dma2, l2); 42 + vektor::submit(dma1, l1); 43 + 44 + // @todo: how to wait on cpu for DMA transfer? TODO? 45 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 46 + sleep(1); 47 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 48 + 49 + printf("\n"); 50 + printf("ts0: %lu\n", ((uint64_t *)ts.cpu)[0]); 51 + printf("ts1: %lu\n", ((uint64_t *)ts.cpu)[1]); 52 + printf("ts2: %lu\n", ((uint64_t *)ts.cpu)[2]); 53 + printf("ts3: %lu\n", ((uint64_t *)ts.cpu)[3]); 54 + printf("ts4: %lu\n", ((uint64_t *)ts.cpu)[4]); 55 + 56 + vektor::free(dev, x); 57 + vektor::destroy(dev); 58 + 59 + return 0; 60 + }
+41
libvektor/test/04_hello_timestamp/hello_timestamp.cpp
··· 1 + #include <unistd.h> 2 + #include <vektor/vektor.h> 3 + 4 + #include <stdio.h> 5 + 6 + int main(void) { 7 + 8 + auto version = vektor::version(); 9 + printf("vektor %s (%s)\n", version.version, version.commit_id); 10 + 11 + auto dev = vektor::create(); 12 + 13 + std::size_t size = 10 * 1024 * 1024; 14 + auto x = vektor::malloc(dev, size); 15 + auto y = vektor::malloc(dev, 16); 16 + 17 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 18 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 19 + 20 + auto dma = vektor::create_queue(dev, vektor::QueueType::Transfer); 21 + 22 + auto l1 = vektor::start_recording(dma); 23 + vektor::write_timestamp(l1, y.gpu); 24 + vektor::memset(l1, x.gpu, size, 2); 25 + vektor::write_timestamp(l1, y.gpu + 8); 26 + 27 + vektor::submit(dma, l1); 28 + 29 + // @todo: how to wait on cpu for DMA transfer? TODO? 30 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 31 + sleep(1); 32 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 33 + 34 + printf("ts0: %ul\n", ((uint64_t *)y.cpu)[0]); 35 + printf("ts1: %ul\n", ((uint64_t *)y.cpu)[1]); 36 + 37 + vektor::free(dev, x); 38 + vektor::destroy(dev); 39 + 40 + return 0; 41 + }