full now driver migration · logdahl.net/kestrel@0516833

+1 -1

docs/CMakeLists.txt

··· 6 6 set(DOXYGEN_XML_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/xml") 7 7 8 8 doxygen_add_docs(doxygen_xml 9 - "${PROJECT_SOURCE_DIR}/libvektor/include" 9 + "${PROJECT_SOURCE_DIR}/kestrel/include" 10 10 COMMENT "Generating XML API data with Doxygen" 11 11 ) 12 12

+2 -2

docs/api/index.rst

··· 1 1 API Reference 2 - ========= 2 + ============= 3 3 4 4 Kestrel's API is organized into functional modules. Each module contains 5 5 related structures, enumerations, and functions. 6 6 7 7 .. note:: 8 - The API docs are generated from :file:`libvektor/include/vektor/vektor.h`. 8 + The API docs are generated from :file:`kestrel/include/kestrel/kestrel.h`. 9 9 To update this documentation, please edit the header file. 10 10 11 11 .. toctree::

-3

docs/api/init.rst

··· 1 1 Initialization 2 2 ============== 3 - 4 - .. doxygengroup:: init 5 - :content-only:

-3

docs/api/memory.rst

··· 1 1 Memory 2 2 ====== 3 - 4 - .. doxygengroup:: memory 5 - :content-only:

-3

docs/api/synchronization.rst

··· 1 1 Synchronization 2 2 =============== 3 - 4 - .. doxygengroup:: sync 5 - :content-only:

+10

docs/drivers/architecture.rst

··· 1 1 Architecture 2 2 ============ 3 + 4 + Each driver library (`libkestrel_<platform>.so`) exports only 5 + a single symbol. 6 + 7 + .. doxygenfile:: kestrel/interface.h 8 + 9 + When a new Device is created, the Kestrel runtime iterates over 10 + the available DRM devices on the system and tries to resolve 11 + a suitable driver library. The exported symbol is then resolved and called to bind 12 + all functions to that driver.

+1

drivers/CMakeLists.txt

··· 35 35 -static-libgcc 36 36 -Wl,--gc-sections 37 37 -Wl,--exclude-libs,ALL 38 + -Wl,--no-undefined 38 39 > 39 40 ) 40 41

+118

drivers/amdgpu/cmds.cpp

··· 1 + #include "kestrel/kestrel.h" 2 + #include "impl.h" 3 + 4 + #include "sdma_encoder.h" 5 + 6 + void memset_transfer(CommandListImpl *impl, kes_gpuptr_t addr, std::size_t size, uint32_t value) { 7 + assert(impl->queue->type == KesQueueTypeTransfer, "memset_transfer: requires queue of Transfer type"); 8 + 9 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 10 + 11 + while (size > 0) { 12 + uint64_t bytes_written = enc.constant_fill(addr, size, value); 13 + size -= bytes_written; 14 + addr += bytes_written; 15 + } 16 + } 17 + 18 + void amdgpu_cmd_memset(KesCommandList pcl, kes_gpuptr_t addr, std::size_t size, uint32_t value) { 19 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 20 + assert(cl, "memset: command list handle invalid: {}", (void *)pcl); 21 + 22 + switch(cl->queue->type) { 23 + case KesQueueTypeTransfer: 24 + memset_transfer(cl, addr, size, value); 25 + break; 26 + default: 27 + not_implemented("memset: not implemented for queue type: {}", cl->queue->type); 28 + } 29 + } 30 + 31 + SDMAAtomicOp sdma_atomic_op_map(KesSignal sig) { 32 + switch(sig) { 33 + case KesSignalAtomicSet: 34 + return SDMAAtomicOp::Swap; 35 + case KesSignalAtomicMax: 36 + return SDMAAtomicOp::UMax; 37 + case KesSignalAtomicOr: 38 + return SDMAAtomicOp::Or; 39 + default: 40 + not_implemented("sdma_atomic_op_map: no mapping for {}", sig); 41 + } 42 + } 43 + 44 + SDMAWaitMemOp sdma_waitmem_op_map(KesOp op) { 45 + switch(op) { 46 + case KesOpEqual: 47 + return SDMAWaitMemOp::Equal; 48 + default: 49 + not_implemented("sdma_waitmem_op_map: no mapping for {}", op); 50 + } 51 + } 52 + 53 + void wait_before_transfer(CommandListImpl *impl, kes_gpuptr_t ptr, uint64_t value, KesOp op, uint64_t mask) { 54 + assert(impl->queue->type == KesQueueTypeTransfer, "wait_before_transfer: requires queue of Transfer type"); 55 + 56 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 57 + 58 + auto func = sdma_waitmem_op_map(op); 59 + 60 + // @todo: NOTE: this only writes the low 32-bits of 
value. I do not know how we should do this, it 61 + // seems the hardware doesn't support this. 62 + enc.wait_mem(func, ptr, value & 0xFFFFFFFF, mask & 0xFFFFFFFF); 63 + } 64 + 65 + void amdgpu_cmd_wait_before(KesCommandList pcl, KesStage after, kes_gpuptr_t ptr, uint64_t value, KesOp op, KesHazardFlags hazard, uint64_t mask) { 66 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 67 + assert(cl, "wait_before: command list handle invalid: {}", (void *)pcl); 68 + 69 + switch(cl->queue->type) { 70 + case KesQueueTypeTransfer: 71 + wait_before_transfer(cl, ptr, value, op, mask); 72 + break; 73 + default: 74 + not_implemented("wait_before: not implemented for queue type: {}", cl->queue->type); 75 + } 76 + } 77 + 78 + void signal_after_transfer(CommandListImpl *impl, kes_gpuptr_t ptr, uint64_t value, KesSignal sig) { 79 + assert(impl->queue->type == KesQueueTypeTransfer, "signal_after_transfer: requires queue of Transfer type"); 80 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 81 + 82 + auto op = sdma_atomic_op_map(sig); 83 + 84 + enc.atomic(op, ptr, value); 85 + } 86 + 87 + void amdgpu_cmd_signal_after(KesCommandList pcl, KesStage before, kes_gpuptr_t ptr, uint64_t value, KesSignal sig) { 88 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 89 + assert(cl, "signal_after: command list handle invalid: {}", (void *)pcl); 90 + 91 + switch(cl->queue->type) { 92 + case KesQueueTypeTransfer: 93 + signal_after_transfer(cl, ptr, value, sig); 94 + break; 95 + default: 96 + not_implemented("wait_before: not implemented for queue type: {}", cl->queue->type); 97 + } 98 + } 99 + 100 + void write_timestamp_transfer(CommandListImpl *impl, kes_gpuptr_t ptr) { 101 + assert(impl->queue->type == KesQueueTypeTransfer, "write_timestamp_transfer: requires queue of Transfer type"); 102 + SDMAEncoder enc(impl->queue->dev->info, impl->cs); 103 + 104 + enc.write_timestamp(ptr); 105 + } 106 + 107 + void amdgpu_cmd_write_timestamp(KesCommandList pcl, kes_gpuptr_t ptr) { 108 + auto 
*cl = reinterpret_cast<CommandListImpl *>(pcl); 109 + assert(cl, "write_timestamp: command list handle invalid: {}", (void *)pcl); 110 + 111 + switch(cl->queue->type) { 112 + case KesQueueTypeTransfer: 113 + write_timestamp_transfer(cl, ptr); 114 + break; 115 + default: 116 + not_implemented("write_timestamp: not implemented for queue type: {}", cl->queue->type); 117 + } 118 + }

+2 -2

drivers/amdgpu/impl.h

··· 33 33 }; 34 34 35 35 extern "C" { 36 - KesDevice amdgpu_init(int drm_fd); 36 + KesDevice amdgpu_create(int drm_fd); 37 37 void amdgpu_destroy(KesDevice); 38 38 struct KesAllocation amdgpu_malloc(KesDevice, size_t size, size_t align, KesMemory); 39 39 void amdgpu_free(KesDevice, struct KesAllocation *); ··· 49 49 void amdgpu_cmd_write_timestamp(KesCommandList, kes_gpuptr_t addr); 50 50 51 51 void amdgpu_cmd_signal_after(KesCommandList, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal); 52 - void amdgpu_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags); 52 + void amdgpu_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags, uint64_t mask); 53 53 }

+10 -1

drivers/amdgpu/init.cpp

··· 25 25 return "???"; 26 26 } 27 27 28 - API_EXPORT KesDevice amdgpu_init(int drm_fd) { 28 + KesDevice amdgpu_create(int drm_fd) { 29 29 auto dev = new DeviceImpl; 30 30 dev->fd = drm_fd; 31 31 ··· 73 73 74 74 return (KesDevice)dev; 75 75 } 76 + 77 + void amdgpu_destroy(KesDevice pd) { 78 + auto *dev = reinterpret_cast<DeviceImpl *>(pd); 79 + if (dev) { 80 + amdgpu_device_deinitialize(dev->amd_handle); 81 + dev->amd_handle = nullptr; 82 + delete dev; 83 + } 84 + }

+1 -1

drivers/amdgpu/interface.cpp

··· 6 6 7 7 API_EXPORT void kes_drv_interface(struct KesDriverFuncs *fns) { 8 8 fns->version = KESDRV_AMDGPU_VERSION_NUM; 9 - fns->fn_create = amdgpu_init; 9 + fns->fn_create = amdgpu_create; 10 10 fns->fn_destroy = amdgpu_destroy; 11 11 fns->fn_malloc = amdgpu_malloc; 12 12 fns->fn_free = amdgpu_free;

+111

drivers/amdgpu/mem.cpp

··· 1 + #include "impl.h" 2 + 3 + struct AllocationImpl { 4 + amdgpu_bo_handle bo; 5 + amdgpu_va_handle va_handle; 6 + }; 7 + 8 + #define VEK_HUGE_PAGE_SIZE (2ULL * 1024 * 1024) 9 + 10 + KesAllocation amdgpu_malloc(KesDevice pd, size_t size, size_t align, KesMemory memory) { 11 + auto *dev = reinterpret_cast<DeviceImpl *>(pd); 12 + 13 + auto aligned_size = (size + VEK_HUGE_PAGE_SIZE - 1) & ~(VEK_HUGE_PAGE_SIZE - 1); 14 + auto alignment = VEK_HUGE_PAGE_SIZE; 15 + 16 + KesAllocation alloc = {}; 17 + auto *impl = reinterpret_cast<AllocationImpl*>(alloc._internal); 18 + 19 + alloc.size = aligned_size; 20 + impl->bo = nullptr; 21 + impl->va_handle = nullptr; 22 + 23 + amdgpu_bo_alloc_request req = {}; 24 + req.alloc_size = aligned_size; 25 + req.phys_alignment = alignment; 26 + req.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM; 27 + 28 + switch(memory) { 29 + case KesMemoryDefault: 30 + req.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 31 + AMDGPU_GEM_CREATE_VRAM_CLEARED; 32 + break; 33 + case KesMemoryGpu: 34 + req.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; 35 + break; 36 + case KesMemoryReadback: 37 + req.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | 38 + AMDGPU_GEM_CREATE_COHERENT; 39 + break; 40 + } 41 + 42 + // some systems (DCE) require contiguous addresses as they don't use the MMU Infinity Cache. 
43 + // req.flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; 44 + 45 + log("bo alloc: size: {} align: {}", aligned_size, alignment); 46 + 47 + if (amdgpu_bo_alloc(dev->amd_handle, &req, &impl->bo) != 0) { 48 + log("amdgpu_bo_alloc failed"); 49 + return alloc; 50 + } 51 + 52 + uint64_t va_base; 53 + int r = amdgpu_va_range_alloc(dev->amd_handle, 54 + amdgpu_gpu_va_range_general, 55 + aligned_size, alignment, 0, 56 + &va_base, &impl->va_handle, 0); 57 + if (r != 0) { 58 + log("amdgpu_va_range_alloc failed"); 59 + amdgpu_bo_free(impl->bo); 60 + impl->bo = nullptr; 61 + return alloc; 62 + } 63 + alloc.gpu = va_base; 64 + 65 + auto va_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE; 66 + r = amdgpu_bo_va_op(impl->bo, 0, aligned_size, va_base, va_flags, AMDGPU_VA_OP_MAP); 67 + if (r != 0) { 68 + log("amdgpu_bo_va_op failed"); 69 + amdgpu_va_range_free(impl->va_handle); 70 + amdgpu_bo_free(impl->bo); 71 + impl->bo = nullptr; 72 + impl->va_handle = nullptr; 73 + return alloc; 74 + } 75 + 76 + if (memory != KesMemoryGpu) { 77 + r = amdgpu_bo_cpu_map(impl->bo, &alloc.cpu); 78 + if (r != 0) { 79 + log("amdgpu_bo_cpu_map failed"); 80 + amdgpu_bo_va_op(impl->bo, 0, aligned_size, va_base, 0, AMDGPU_VA_OP_UNMAP); 81 + amdgpu_va_range_free(impl->va_handle); 82 + amdgpu_bo_free(impl->bo); 83 + impl->bo = nullptr; 84 + impl->va_handle = nullptr; 85 + alloc.gpu = 0; 86 + return alloc; 87 + } 88 + } 89 + 90 + return alloc; 91 + } 92 + 93 + void amdgpu_free(KesDevice pd, struct KesAllocation *alloc) { 94 + AllocationImpl *impl = reinterpret_cast<AllocationImpl*>(alloc->_internal); 95 + 96 + if (alloc->cpu) { 97 + amdgpu_bo_cpu_unmap(impl->bo); 98 + alloc->cpu = nullptr; 99 + } 100 + if (impl->bo) { 101 + amdgpu_bo_va_op(impl->bo, 0, alloc->size, alloc->gpu, 0, AMDGPU_VA_OP_UNMAP); 102 + amdgpu_bo_free(impl->bo); 103 + impl->bo = nullptr; 104 + } 105 + if (impl->va_handle) { 106 + amdgpu_va_range_free(impl->va_handle); 107 + impl->va_handle = nullptr; 
108 + } 109 + 110 + alloc->gpu = 0; 111 + }

-7

drivers/amdgpu/pm4_encoder.h

··· 4 4 #include <vector> 5 5 6 6 #include "gpuinfo.h" 7 - // #include "pm4.h" 8 - 9 - // @todo: consider redesigning this.. 10 - // instead of having multiple CmdStream types; let a commandstream 11 - // just be some kind of buffer (std::vector). We can rename this 12 - // to "PM4 Encoder" or something, directly adding to a provided 13 - // buffer. 14 7 15 8 class Pm4Encoder { 16 9 public:

+67

drivers/amdgpu/queue.cpp

··· 1 + #include "kestrel/kestrel.h" 2 + #include "impl.h" 3 + 4 + #include <cstdint> 5 + #include <vector> 6 + 7 + uint32_t hw_ip_type_from_queue_type(KesQueueType qt) { 8 + switch(qt) { 9 + case KesQueueTypeGraphics: return AMDGPU_HW_IP_GFX; 10 + case KesQueueTypeCompute: return AMDGPU_HW_IP_COMPUTE; 11 + case KesQueueTypeTransfer: return AMDGPU_HW_IP_DMA; 12 + default: 13 + not_implemented("no HW_IP type picked for queue type: {}", qt); 14 + } 15 + } 16 + 17 + KesQueue amdgpu_create_queue(KesDevice pd, KesQueueType qt) { 18 + auto *dev = reinterpret_cast<DeviceImpl *>(pd); 19 + 20 + auto queue = new QueueImpl; 21 + queue->dev = dev; 22 + queue->type = qt; 23 + queue->hw_ip_type = hw_ip_type_from_queue_type(qt); 24 + 25 + // @todo: consider creating ctx at device initialization? 26 + int r = amdgpu_cs_ctx_create(dev->amd_handle, &queue->ctx_handle); 27 + if (r != 0) { 28 + delete queue; 29 + return nullptr; 30 + } 31 + 32 + // @todo: cleanup: remove this fkn pointer; shit stuff we don't need! 33 + auto conf = CommandRing::Config{}; 34 + queue->cmd_ring = new CommandRing(dev->amd_handle, queue->ctx_handle, queue->hw_ip_type, conf); 35 + 36 + return queue; 37 + } 38 + 39 + void amdgpu_destroy_queue(KesQueue pq) { 40 + auto *queue = reinterpret_cast<QueueImpl *>(pq); 41 + // @todo: actually delete queue. 42 + } 43 + 44 + KesCommandList amdgpu_start_recording(KesQueue pq) { 45 + auto *queue = reinterpret_cast<QueueImpl *>(pq); 46 + auto cl = new CommandListImpl; 47 + 48 + cl->queue = queue; 49 + cl->cs = queue->cmd_ring->begin_recording(); 50 + 51 + return cl; 52 + } 53 + 54 + // @todo: add support for semaphore or other synchronization. 
55 + void amdgpu_submit(KesQueue pq, KesCommandList pcl) { 56 + auto *queue = reinterpret_cast<QueueImpl *>(pq); 57 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 58 + assert(cl->queue == queue, "submit: commandlist from foreign queue"); 59 + 60 + queue->cmd_ring->submit(cl->cs); //, semaphore, value); 61 + 62 + // @todo: to free commandlist, we want to be sure that it is no longer mapped and stuff. 63 + // then, we can freely-free it. But i think this needs some deferred-cleanup, as 64 + // the data is on GTT so we cannot just let the CPU start using the range again. 65 + // 66 + // think about this. 67 + }

+1 -1

drivers/i915/impl.h

··· 19 19 void i915_cmd_write_timestamp(KesCommandList, kes_gpuptr_t addr); 20 20 21 21 void i915_cmd_signal_after(KesCommandList, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal); 22 - void i915_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags); 22 + void i915_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags, uint64_t); 23 23 }

+2 -2

drivers/i915/mock.cpp

··· 1 1 #include "impl.h" 2 2 3 3 API_EXPORT KesDevice i915_init(int drm_fd) { 4 - return nullptr; 4 + return (KesDevice)0x1; 5 5 } 6 6 7 7 API_EXPORT void i915_destroy(KesDevice) { ··· 43 43 44 44 } 45 45 46 - API_EXPORT void i915_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags) { 46 + API_EXPORT void i915_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags, uint64_t) { 47 47 48 48 }

+1 -1

kestrel/include/kestrel/kestrel.h

··· 75 75 void kes_cmd_write_timestamp(KesCommandList, kes_gpuptr_t addr); 76 76 77 77 void kes_cmd_signal_after(KesCommandList, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal); 78 - void kes_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags); 78 + void kes_cmd_wait_before(KesCommandList, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags, uint64_t mask); 79 79 80 80 #ifdef __cplusplus 81 81 }

+4 -4

kestrel/rt/api.cpp

··· 179 179 dev->fns.fn_cmd_write_timestamp(clhandle->cmdlist, addr); 180 180 } 181 181 182 - API_EXPORT void kes_cmd_signal_after(KesCommandList pcl, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal) { 182 + API_EXPORT void kes_cmd_signal_after(KesCommandList pcl, KesStage before, kes_gpuptr_t addr, uint64_t value, KesSignal signal) { 183 183 auto *clhandle = reinterpret_cast<CommandListHandle *>(pcl); 184 184 auto *dev = reinterpret_cast<DeviceHandle *>(clhandle->drv_handle); 185 185 186 - dev->fns.fn_cmd_signal_after(clhandle->cmdlist, before, addr, value, KesSignalAtomicSet); 186 + dev->fns.fn_cmd_signal_after(clhandle->cmdlist, before, addr, value, signal); 187 187 } 188 188 189 - API_EXPORT void kes_cmd_wait_before(KesCommandList pcl, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp, KesHazardFlags) { 189 + API_EXPORT void kes_cmd_wait_before(KesCommandList pcl, KesStage after, kes_gpuptr_t addr, uint64_t value, KesOp op, KesHazardFlags hazard, uint64_t mask) { 190 190 auto *clhandle = reinterpret_cast<CommandListHandle *>(pcl); 191 191 auto *dev = reinterpret_cast<DeviceHandle *>(clhandle->drv_handle); 192 192 193 - dev->fns.fn_cmd_wait_before(clhandle->cmdlist, after, addr, value, KesOpNever, KesHazardFlagsNone); 193 + dev->fns.fn_cmd_wait_before(clhandle->cmdlist, after, addr, value, op, hazard, mask); 194 194 }

+1 -1

test/test/01_hello_malloc/hello_malloc.cpp

··· 7 7 8 8 auto dev = kes_create(); 9 9 10 - auto x = kes_malloc(dev, 100 * 1024 * 1024, 0, KesMemoryDefault); 10 + auto x = kes_malloc(dev, 100 * 1024 * 1024, 4, KesMemoryDefault); 11 11 12 12 printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 13 13

+42

test/test/02_hello_queue/hello_queue.cpp

··· 1 + #include <unistd.h> 2 + #include <kestrel/kestrel.h> 3 + 4 + #include <stdio.h> 5 + 6 + int main(void) { 7 + 8 + auto dev = kes_create(); 9 + 10 + std::size_t size = 10 * 1024 * 1024; 11 + auto x = kes_malloc(dev, size, 4, KesMemoryDefault); 12 + auto y = kes_malloc(dev, 8, 4, KesMemoryDefault); 13 + 14 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 15 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 16 + 17 + // @todo: wait for address ok? 18 + 19 + auto dma = kes_create_queue(dev, KesQueueTypeTransfer); 20 + 21 + auto l1 = kes_start_recording(dma); 22 + kes_cmd_memset(l1, x.gpu, size, 1); 23 + kes_cmd_wait_before(l1, KesStageTransfer, y.gpu, 1337, KesOpEqual, KesHazardFlagsNone, ~0); 24 + kes_cmd_memset(l1, x.gpu, size, 2); 25 + 26 + kes_submit(dma, l1); 27 + 28 + // @todo: hacky bussy-wait 29 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 30 + 31 + while(*((uint32_t *)x.cpu) == 0); 32 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 33 + *((uint32_t *)y.cpu) = 1337; 34 + 35 + while(*((uint32_t *)x.cpu) == 1); 36 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 37 + 38 + kes_free(dev, &x); 39 + kes_destroy(dev); 40 + 41 + return 0; 42 + }

+61

test/test/03_hello_2queue/hello_2queue.cpp

··· 1 + #include <unistd.h> 2 + #include <kestrel/kestrel.h> 3 + 4 + #include <stdio.h> 5 + 6 + int main(void) { 7 + 8 + auto dev = kes_create(); 9 + 10 + std::size_t size = 10 * 1024 * 1024; 11 + auto x = kes_malloc(dev, size, 4, KesMemoryDefault); 12 + auto y = kes_malloc(dev, 8, 4, KesMemoryDefault); 13 + auto ts = kes_malloc(dev, 8 * 5, 4, KesMemoryDefault); 14 + 15 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 16 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 17 + 18 + auto dma1 = kes_create_queue(dev, KesQueueTypeTransfer); 19 + auto dma2 = kes_create_queue(dev, KesQueueTypeTransfer); 20 + 21 + auto l1 = kes_start_recording(dma1); 22 + { 23 + kes_cmd_write_timestamp(l1, ts.gpu + 0); 24 + kes_cmd_memset(l1, x.gpu, size, 1); 25 + kes_cmd_write_timestamp(l1, ts.gpu + 8); 26 + kes_cmd_signal_after(l1, KesStageTransfer, y.gpu, 1337, KesSignalAtomicMax); 27 + } 28 + 29 + auto l2 = kes_start_recording(dma2); 30 + { 31 + kes_cmd_write_timestamp(l2, ts.gpu + 16); 32 + kes_cmd_wait_before(l2, KesStageTransfer, y.gpu, 1337, KesOpEqual, KesHazardFlagsNone, ~0); 33 + kes_cmd_write_timestamp(l2, ts.gpu + 24); 34 + kes_cmd_memset(l2, x.gpu, size, 2); 35 + kes_cmd_write_timestamp(l2, ts.gpu + 32); 36 + } 37 + 38 + kes_submit(dma2, l2); 39 + kes_submit(dma1, l1); 40 + 41 + // @todo: how to wait on cpu for DMA transfer? TODO? 42 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 43 + sleep(1); 44 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 45 + sleep(1); 46 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 47 + sleep(1); 48 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 49 + 50 + printf("\n"); 51 + printf("ts0: %lu\n", ((uint64_t *)ts.cpu)[0]); 52 + printf("ts1: %lu\n", ((uint64_t *)ts.cpu)[1]); 53 + printf("ts2: %lu\n", ((uint64_t *)ts.cpu)[2]); 54 + printf("ts3: %lu\n", ((uint64_t *)ts.cpu)[3]); 55 + printf("ts4: %lu\n", ((uint64_t *)ts.cpu)[4]); 56 + 57 + kes_free(dev, &x); 58 + kes_destroy(dev); 59 + 60 + return 0; 61 + }

+38

test/test/04_hello_timestamp/hello_timestamp.cpp

··· 1 + #include <unistd.h> 2 + #include <kestrel/kestrel.h> 3 + 4 + #include <stdio.h> 5 + 6 + int main(void) { 7 + 8 + auto dev = kes_create(); 9 + 10 + std::size_t size = 10 * 1024 * 1024; 11 + auto x = kes_malloc(dev, size, 4, KesMemoryDefault); 12 + auto y = kes_malloc(dev, 16, 4, KesMemoryDefault); 13 + 14 + printf("x: %p (%p) (%llu bytes)\n", x.cpu, x.gpu, x.size); 15 + printf("y: %p (%p) (%llu bytes)\n", y.cpu, y.gpu, y.size); 16 + 17 + auto dma = kes_create_queue(dev, KesQueueTypeTransfer); 18 + 19 + auto l1 = kes_start_recording(dma); 20 + kes_cmd_write_timestamp(l1, y.gpu); 21 + kes_cmd_memset(l1, x.gpu, size, 2); 22 + kes_cmd_write_timestamp(l1, y.gpu + 8); 23 + 24 + kes_submit(dma, l1); 25 + 26 + // @todo: how to wait on cpu for DMA transfer? TODO? 27 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 28 + sleep(1); 29 + printf("x[0]: %u\n", ((uint32_t *)x.cpu)[0]); 30 + 31 + printf("ts0: %ul\n", ((uint64_t *)y.cpu)[0]); 32 + printf("ts1: %ul\n", ((uint64_t *)y.cpu)[1]); 33 + 34 + kes_free(dev, &x); 35 + kes_destroy(dev); 36 + 37 + return 0; 38 + }

Configure Feed

Configure Feed