A Modern GPGPU API & wip linux RDNA2+ Driver
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

wip: moved gir to public header

This WILL be removed in the future. However, we need a way
to write shaders, and right now the IR is the only option.

+487 -411
+1
drivers/CMakeLists.txt
··· 3 3 ) 4 4 add_library(kes_driver_common STATIC ${COMMON_DRIVER_SOURCES}) 5 5 target_include_directories(kes_driver_common PUBLIC 6 + ${PROJECT_SOURCE_DIR}/kestrel/include 6 7 common 7 8 ) 8 9
+60 -87
drivers/amdgpu/cmds.cpp
··· 1 - #include "gir/gir.h" 1 + #include "kestrel/gir.h" 2 2 #include "compiler/compiler.h" 3 3 #include "cp_encoder.h" 4 4 #include "gpuinfo.h" ··· 258 258 uint64_t data_va; 259 259 }; 260 260 261 - struct ShaderRegs { 262 - uint32_t pgm_lo; 263 - uint32_t pgm_hi; 264 - uint32_t pgm_rsrc1; 265 - uint32_t pgm_rsrc2; 266 - uint32_t pgm_rsrc3; 261 + // @todo: should this really be part of a shader or the device? 262 + // i think device. 263 + void precompute_regs(ShaderInfo &info) { 264 + auto &regs = info.regs; 267 265 268 - uint32_t userdata_0; 269 - }; 266 + // @todo: setup that compute_resource_limits thingy. 270 267 271 - enum class HwStage { 272 - Compute 273 - }; 268 + switch(info.hw_stage) { 269 + case HwStage::Compute: 270 + regs.pgm_lo = R_00B830_COMPUTE_PGM_LO; 271 + regs.pgm_hi = R_00B834_COMPUTE_PGM_HI; 272 + regs.pgm_rsrc1 = R_00B848_COMPUTE_PGM_RSRC1; 273 + regs.pgm_rsrc2 = R_00B84C_COMPUTE_PGM_RSRC2; 274 + regs.pgm_rsrc3 = R_00B8A0_COMPUTE_PGM_RSRC3; 275 + regs.userdata_0 = R_00B900_COMPUTE_USER_DATA_0; 276 + break; 277 + } 278 + } 274 279 275 - struct ShaderInfo { 276 - uint32_t block_size[3]; 277 - HwStage hw_stage; 278 - ShaderRegs regs; 280 + KesShader amdgpu_create_shader(KesDevice pd, void *modptr) { 281 + auto *dev = reinterpret_cast<DeviceImpl *>(pd); 282 + gir::Module *module = reinterpret_cast<gir::Module *>(modptr); 279 283 280 - bool ordered; 281 - uint32_t wave_size; 282 - }; 284 + assert(dev, "amdgpu_create_shader: device handle invalid: {}", (void *)dev); 285 + assert(module, "amdgpu_create_shader: module handle invalid: {}", (void *)module); 283 286 284 - struct ShaderConfig { 285 - uint32_t pgm_rsrc1; 286 - uint32_t pgm_rsrc2; 287 - uint32_t pgm_rsrc3; 288 - uint32_t compute_resource_limits; 289 - 290 - uint32_t user_sgpr_count; 291 - }; 292 - 293 - struct Shader { 294 - ShaderInfo info; 295 - ShaderConfig config; 296 - uint64_t va; 297 - }; 298 - 299 - void init_compute_shader_config(DeviceImpl *dev, Shader &shader) { 287 + auto 
shader = new Shader; 300 288 301 289 // @todo: ultra temporary. 302 290 auto alloc = amdgpu_malloc(dev, 1024, 256, KesMemoryDefault); 303 - 304 - { 305 - gir::Module mod; 306 - gir::Builder gb(mod); 307 - auto rp = gb.get_root_ptr(); 308 - auto p = gb.add(rp, gb.mul(gb.get_local_invocation_id(), gb.i32(4))); 309 - auto x = gb.load(p); 310 - auto sum = gb.add(x, gb.i32(15)); 311 - gb.store(p, sum); 312 - 313 - rdna2_compile(mod, alloc.cpu, alloc.gpu); 314 - } 291 + rdna2_compile(*module, alloc.cpu, alloc.gpu); 292 + shader->allocation = alloc; 315 293 316 294 log("shader code: {} {}", (void *)alloc.cpu, (void *)alloc.gpu); 317 295 318 - // @todo: temporary 319 296 auto ordered = false; 320 297 auto wave_size = 32; 321 298 auto waves_per_threadgroup = 1; ··· 335 312 336 313 auto num_shared_vgpr_blocks = num_shared_vgprs / 8; 337 314 338 - shader.config.user_sgpr_count = num_user_sgprs; 339 - shader.info.ordered = ordered; 340 - shader.info.wave_size = wave_size; 341 - shader.info.block_size[0] = 32; 342 - shader.info.block_size[1] = 1; 343 - shader.info.block_size[2] = 1; 344 - shader.va = alloc.gpu; 345 - shader.info.hw_stage = HwStage::Compute; 315 + shader->config.user_sgpr_count = num_user_sgprs; 316 + shader->info.ordered = ordered; 317 + shader->info.wave_size = wave_size; 318 + shader->info.block_size[0] = 32; 319 + shader->info.block_size[1] = 1; 320 + shader->info.block_size[2] = 1; 321 + shader->va = alloc.gpu; 322 + shader->info.hw_stage = HwStage::Compute; 346 323 347 324 // use large limits. 348 - shader.config.compute_resource_limits = 325 + shader->config.compute_resource_limits = 349 326 S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0) 350 327 | S_00B854_WAVES_PER_SH(max_waves_per_sh) 351 328 | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); 352 329 353 - shader.config.pgm_rsrc1 = 330 + shader->config.pgm_rsrc1 = 354 331 S_00B848_VGPRS((num_vgprs - 1) / (wave_size == 32 ? 
8 : 4)) 355 332 | S_00B848_DX10_CLAMP(dx10_clamp) 356 333 | S_00B128_MEM_ORDERED(true); //always true for gfx10.3 357 334 358 - shader.config.pgm_rsrc2 = 359 - S_00B84C_USER_SGPR(shader.config.user_sgpr_count) 360 - | S_00B22C_USER_SGPR_MSB_GFX10(shader.config.user_sgpr_count >> 5) 335 + shader->config.pgm_rsrc2 = 336 + S_00B84C_USER_SGPR(shader->config.user_sgpr_count) 337 + | S_00B22C_USER_SGPR_MSB_GFX10(shader->config.user_sgpr_count >> 5) 361 338 | S_00B12C_SCRATCH_EN(scratch_enabled) 362 339 | S_00B12C_TRAP_PRESENT(trap_present) 363 340 | S_00B84C_TGID_X_EN(1) 364 341 | S_00B84C_TGID_Y_EN(1) 365 342 | S_00B84C_TGID_Z_EN(1); 366 343 367 - shader.config.pgm_rsrc3 = 344 + shader->config.pgm_rsrc3 = 368 345 S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks); 369 - } 370 346 371 - void precompute_regs(ShaderInfo &info) { 372 - auto &regs = info.regs; 373 - 374 - // @todo: setup that compute_resource_limits thingy. 347 + precompute_regs(shader->info); 375 348 376 - switch(info.hw_stage) { 377 - case HwStage::Compute: 378 - regs.pgm_lo = R_00B830_COMPUTE_PGM_LO; 379 - regs.pgm_hi = R_00B834_COMPUTE_PGM_HI; 380 - regs.pgm_rsrc1 = R_00B848_COMPUTE_PGM_RSRC1; 381 - regs.pgm_rsrc2 = R_00B84C_COMPUTE_PGM_RSRC2; 382 - regs.pgm_rsrc3 = R_00B8A0_COMPUTE_PGM_RSRC3; 383 - regs.userdata_0 = R_00B900_COMPUTE_USER_DATA_0; 384 - break; 385 - } 349 + return reinterpret_cast<KesShader>(shader); 386 350 } 387 351 388 352 void emit_compute_shader(Shader &shader, Pm4Encoder &enc) { ··· 401 365 enc.emit(shader.info.block_size[2] & 0xFFFF); 402 366 } 403 367 368 + void amdgpu_bind_shader(KesCommandList pcl, KesShader pshader) { 369 + auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 370 + auto *shader = reinterpret_cast<Shader *>(pshader); 371 + assert(cl, "amdgpu_bind_shader: command list handle invalid: {}", (void *)pcl); 372 + assert(shader, "amdgpu_bind_shader: shader handle invalid: {}", (void *)shader); 373 + 374 + cl->state.shader = shader; 375 + 376 + // @todo: setup registers 
here. 377 + auto hw_ip_type = hw_ip_type_from_queue_type(cl->queue->type); 378 + Pm4Encoder enc(cl->queue->dev->info, hw_ip_type, cl->cs); 379 + 380 + emit_compute_shader(*shader, enc); 381 + } 382 + 404 383 void amdgpu_emit_dispatch_packets(GpuInfo &ginfo, Pm4Encoder &enc, Shader &shader, DispatchInfo &dinfo) { 405 384 406 385 // @todo: get this from device settings ··· 417 396 if (shader.info.wave_size == 32) { 418 397 dispatch_initiator |= S_00B800_CS_W32_EN(1); 419 398 } 420 - 421 - emit_compute_shader(shader, enc); 422 399 423 400 uint32_t regs[2]; 424 401 regs[0] = dinfo.data_va; ··· 451 428 auto *cl = reinterpret_cast<CommandListImpl *>(pcl); 452 429 assert(cl, "dispatch: command list handle invalid: {}", (void *)pcl); 453 430 454 - auto hw_ip_type = hw_ip_type_from_queue_type(cl->queue->type); 455 - Pm4Encoder enc(cl->queue->dev->info, hw_ip_type, cl->cs); 431 + auto dev = cl->queue->dev; 456 432 457 - Shader tmp{}; 458 433 DispatchInfo dinfo{ 459 434 .x = x, 460 435 .y = y, ··· 463 438 .data_va = data, 464 439 }; 465 440 466 - // @todo: do this earlier. 467 - init_compute_shader_config(cl->queue->dev, tmp); 468 - precompute_regs(tmp.info); 469 - 470 - amdgpu_emit_dispatch_packets(cl->queue->dev->info, enc, tmp, dinfo); 441 + auto hw_ip_type = hw_ip_type_from_queue_type(cl->queue->type); 442 + Pm4Encoder enc(dev->info, hw_ip_type, cl->cs); 443 + amdgpu_emit_dispatch_packets(cl->queue->dev->info, enc, *cl->state.shader, dinfo); 471 444 } 472 445 473 446 void amdgpu_cmd_dispatch_indirect(KesCommandList pcl, kes_gpuptr_t data, kes_gpuptr_t indirect_addr) {
-1
drivers/amdgpu/compiler/compiler.cpp
··· 1 1 #include "compiler.h" 2 2 #include "rdna2_asm.h" 3 - #include "gir/gir.h" 4 3 5 4 #include <sstream> 6 5 #include <iomanip>
+1 -1
drivers/amdgpu/compiler/compiler.h
··· 1 1 #pragma once 2 2 3 - #include "gir/gir.h" 3 + #include "kestrel/gir.h" 4 4 5 5 void rdna2_compile(gir::Module &mod, void *write_ptr, uint64_t base_addr);
+8
drivers/amdgpu/impl.h
··· 7 7 8 8 #include "cmdstream.h" 9 9 #include "gpuinfo.h" 10 + #include "shader.h" 10 11 11 12 #include "common.h" 12 13 ··· 37 38 struct CommandListImpl { 38 39 QueueImpl *queue; 39 40 CommandStream cs; 41 + 42 + struct { 43 + Shader *shader; 44 + } state; 40 45 }; 41 46 42 47 struct SemaphoreImpl { ··· 68 73 69 74 KesSemaphore amdgpu_create_semaphore(KesDevice, uint64_t); 70 75 int amdgpu_wait_semaphore(KesSemaphore, uint64_t); 76 + 77 + KesShader amdgpu_create_shader(KesDevice device, void *module); 78 + void amdgpu_bind_shader(KesCommandList command_list, KesShader shader); 71 79 } 72 80 73 81 void device_register_allocation(DeviceImpl *impl, amdgpu_bo_handle bo);
+2
drivers/amdgpu/interface.cpp
··· 23 23 fns->fn_cmd_dispatch_indirect = amdgpu_cmd_dispatch_indirect; 24 24 fns->fn_create_semaphore = amdgpu_create_semaphore; 25 25 fns->fn_wait_semaphore = amdgpu_wait_semaphore; 26 + fns->fn_create_shader = amdgpu_create_shader; 27 + fns->fn_bind_shader = amdgpu_bind_shader; 26 28 }
+43
drivers/amdgpu/shader.h
··· 1 + #pragma once 2 + 3 + #include <cstdint> 4 + 5 + struct ShaderRegs { 6 + uint32_t pgm_lo; 7 + uint32_t pgm_hi; 8 + uint32_t pgm_rsrc1; 9 + uint32_t pgm_rsrc2; 10 + uint32_t pgm_rsrc3; 11 + 12 + uint32_t userdata_0; 13 + }; 14 + 15 + enum class HwStage { 16 + Compute 17 + }; 18 + 19 + struct ShaderInfo { 20 + uint32_t block_size[3]; 21 + HwStage hw_stage; 22 + ShaderRegs regs; 23 + 24 + bool ordered; 25 + uint32_t wave_size; 26 + }; 27 + 28 + struct ShaderConfig { 29 + uint32_t pgm_rsrc1; 30 + uint32_t pgm_rsrc2; 31 + uint32_t pgm_rsrc3; 32 + uint32_t compute_resource_limits; 33 + 34 + uint32_t user_sgpr_count; 35 + }; 36 + 37 + struct Shader { 38 + ShaderInfo info; 39 + ShaderConfig config; 40 + uint64_t va; 41 + 42 + KesAllocation allocation; 43 + };
-138
drivers/common/gir/gir.h
··· 1 - #pragma once 2 - 3 - #include <cstdint> 4 - #include <vector> 5 - #include <functional> 6 - #include <string_view> 7 - 8 - #define GIR_VERSION "v1.1" 9 - 10 - namespace gir { 11 - 12 - enum class Type { 13 - Void, 14 - I32, 15 - F32, 16 - Ptr, 17 - }; 18 - 19 - struct Value { 20 - uint32_t id; 21 - 22 - bool is_inst() const { return id != ~0u; } 23 - }; 24 - 25 - enum class Op { 26 - Add, 27 - Sub, 28 - Mul, 29 - Div, 30 - Mod, 31 - FAdd, 32 - FSub, 33 - FMul, 34 - FDiv, 35 - And, 36 - Or, 37 - Xor, 38 - Shl, 39 - Shr, 40 - Eq, 41 - Ne, 42 - Lt, 43 - Le, 44 - Gt, 45 - Ge, 46 - Load, 47 - LoadShared, 48 - Store, 49 - StoreShared, 50 - Const, 51 - GetRootPtr, 52 - GetLocalInvocationId, 53 - GetThreadIdX, 54 - GetThreadIdY, 55 - GetThreadIdZ, 56 - GetWorkgroupIdX, 57 - GetWorkgroupIdY, 58 - GetWorkgroupIdZ, 59 - BackendIntrinsic, 60 - }; 61 - 62 - struct Inst { 63 - Op op; 64 - Type type; 65 - std::vector<Value> operands; 66 - 67 - // only for BackendIntrinsic 68 - uint32_t intrinsic_id; 69 - 70 - union { 71 - int64_t imm_i64; 72 - } data; 73 - 74 - struct { 75 - bool is_uniform = false; 76 - uint32_t phys_reg = ~0u; 77 - uint32_t last_use = 0xFFFFFFFF; 78 - } meta; 79 - }; 80 - 81 - class Module { 82 - public: 83 - std::vector<Inst> insts; 84 - 85 - Value emit(Inst inst) { 86 - uint32_t id = insts.size(); 87 - insts.push_back(inst); 88 - return Value{id}; 89 - } 90 - 91 - Inst &deref(Value v) { 92 - return insts[v.id]; 93 - } 94 - }; 95 - 96 - class Builder { 97 - public: 98 - Builder(Module& m) : mod(m) {} 99 - 100 - Value i32(int32_t imm); 101 - Value f32(float f); 102 - 103 - Value add(Value a, Value b); 104 - Value sub(Value a, Value b); 105 - Value mul(Value a, Value b); 106 - 107 - Value fadd(Value a, Value b); 108 - Value fmul(Value a, Value b); 109 - 110 - Value eq(Value a, Value b); 111 - Value lt(Value a, Value b); 112 - 113 - Value load(Value addr); 114 - Value load_shared(Value addr); 115 - void store(Value addr, Value data); 116 - void 
store_shared(Value addr, Value data); 117 - 118 - Value get_root_ptr(); 119 - 120 - Value get_local_invocation_id(); 121 - Value get_thread_id_x(); 122 - Value get_thread_id_y(); 123 - Value get_thread_id_z(); 124 - 125 - Value get_workgroup_id_x(); 126 - Value get_workgroup_id_y(); 127 - Value get_workgroup_id_z(); 128 - 129 - protected: 130 - Module& mod; 131 - }; 132 - 133 - std::string dump_module(Module &mod, std::function<std::string_view(uint32_t)> backend_intrinsic_to_string); 134 - 135 - void pass_normalize(Module& mod); 136 - void pass_eliminate_dead_code(Module &mod); 137 - 138 - };
-180
drivers/common/gir/gir_builder.cpp
··· 1 - #include "gir.h" 2 - 3 - #include <cstring> 4 - 5 - namespace gir { 6 - 7 - Value Builder::i32(int32_t imm) { 8 - return mod.emit(Inst{ 9 - .op = Op::Const, 10 - .type = Type::I32, 11 - .operands = {}, 12 - .data = {.imm_i64 = imm} 13 - }); 14 - } 15 - 16 - Value Builder::f32(float f) { 17 - uint32_t bits; 18 - memcpy(&bits, &f, sizeof(float)); 19 - return mod.emit(Inst{ 20 - .op = Op::Const, 21 - .type = Type::F32, 22 - .operands = {}, 23 - .data = {.imm_i64 = (int64_t)bits} 24 - }); 25 - } 26 - 27 - Value Builder::add(Value a, Value b) { 28 - return mod.emit(Inst{ 29 - .op = Op::Add, 30 - .type = Type::I32, 31 - .operands = {a, b} 32 - }); 33 - } 34 - 35 - Value Builder::sub(Value a, Value b) { 36 - return mod.emit(Inst{ 37 - .op = Op::Sub, 38 - .type = Type::I32, 39 - .operands = {a, b} 40 - }); 41 - } 42 - 43 - Value Builder::mul(Value a, Value b) { 44 - return mod.emit(Inst{ 45 - .op = Op::Mul, 46 - .type = Type::I32, 47 - .operands = {a, b} 48 - }); 49 - } 50 - 51 - Value Builder::fadd(Value a, Value b) { 52 - return mod.emit(Inst{ 53 - .op = Op::FAdd, 54 - .type = Type::F32, 55 - .operands = {a, b} 56 - }); 57 - } 58 - 59 - Value Builder::fmul(Value a, Value b) { 60 - return mod.emit(Inst{ 61 - .op = Op::FMul, 62 - .type = Type::F32, 63 - .operands = {a, b} 64 - }); 65 - } 66 - 67 - Value Builder::eq(Value a, Value b) { 68 - return mod.emit(Inst{ 69 - .op = Op::Eq, 70 - .type = Type::I32, 71 - .operands = {a, b} 72 - }); 73 - } 74 - 75 - Value Builder::lt(Value a, Value b) { 76 - return mod.emit(Inst{ 77 - .op = Op::Lt, 78 - .type = Type::I32, 79 - .operands = {a, b} 80 - }); 81 - } 82 - 83 - Value Builder::load(Value addr) { 84 - return mod.emit(Inst{ 85 - .op = Op::Load, 86 - .type = Type::I32, 87 - .operands = {addr}, 88 - }); 89 - } 90 - 91 - Value Builder::load_shared(Value addr) { 92 - return mod.emit(Inst{ 93 - .op = Op::LoadShared, 94 - .type = Type::I32, 95 - .operands = {addr}, 96 - }); 97 - } 98 - 99 - void Builder::store(Value addr, Value 
data) { 100 - mod.emit(Inst{ 101 - .op = Op::Store, 102 - .type = Type::Void, 103 - .operands = {addr, data}, 104 - }); 105 - } 106 - 107 - void Builder::store_shared(Value addr, Value data) { 108 - mod.emit(Inst{ 109 - .op = Op::StoreShared, 110 - .type = Type::Void, 111 - .operands = {addr, data}, 112 - }); 113 - } 114 - 115 - Value Builder::get_root_ptr() { 116 - return mod.emit(Inst{ 117 - .op = Op::GetRootPtr, 118 - .type = Type::Ptr, 119 - .operands = {} 120 - }); 121 - } 122 - 123 - Value Builder::get_local_invocation_id() { 124 - return mod.emit(Inst{ 125 - .op = Op::GetLocalInvocationId, 126 - .type = Type::I32, 127 - .operands = {} 128 - }); 129 - } 130 - 131 - Value Builder::get_thread_id_x() { 132 - return mod.emit(Inst{ 133 - .op = Op::GetThreadIdX, 134 - .type = Type::I32, 135 - .operands = {} 136 - }); 137 - } 138 - 139 - Value Builder::get_thread_id_y() { 140 - return mod.emit(Inst{ 141 - .op = Op::GetThreadIdY, 142 - .type = Type::I32, 143 - .operands = {} 144 - }); 145 - } 146 - 147 - Value Builder::get_thread_id_z() { 148 - return mod.emit(Inst{ 149 - .op = Op::GetThreadIdZ, 150 - .type = Type::I32, 151 - .operands = {} 152 - }); 153 - } 154 - 155 - 156 - Value Builder::get_workgroup_id_x() { 157 - return mod.emit(Inst{ 158 - .op = Op::GetWorkgroupIdX, 159 - .type = Type::I32, 160 - .operands = {} 161 - }); 162 - } 163 - 164 - Value Builder::get_workgroup_id_y() { 165 - return mod.emit(Inst{ 166 - .op = Op::GetWorkgroupIdY, 167 - .type = Type::I32, 168 - .operands = {} 169 - }); 170 - } 171 - 172 - Value Builder::get_workgroup_id_z() { 173 - return mod.emit(Inst{ 174 - .op = Op::GetWorkgroupIdZ, 175 - .type = Type::I32, 176 - .operands = {} 177 - }); 178 - } 179 - 180 - }
+1 -1
drivers/common/gir/gir_dump.cpp
··· 1 - #include "gir.h" 1 + #include "kestrel/gir.h" 2 2 #include <string> 3 3 #include <sstream> 4 4
+1 -1
drivers/common/gir/gir_normalize.cpp
··· 1 - #include "gir.h" 1 + #include "kestrel/gir.h" 2 2 3 3 namespace gir { 4 4
+321
kestrel/include/kestrel/gir.h
··· 1 + #pragma once 2 + 3 + /* 4 + * GIR is the IR language used. 5 + * 6 + * @NOTE: 7 + * This will actually NOT be exposed in the final API. I think! I want some kind of 8 + * spir-v compilation or otherwise. Not completely sure yet. 9 + */ 10 + 11 + #include <cstdint> 12 + #include <vector> 13 + #include <functional> 14 + #include <string_view> 15 + #include <cstring> 16 + 17 + #define GIR_VERSION "v1.1" 18 + 19 + namespace gir { 20 + 21 + enum class Type { 22 + Void, 23 + I32, 24 + F32, 25 + Ptr, 26 + }; 27 + 28 + struct Value { 29 + uint32_t id; 30 + 31 + bool is_inst() const { return id != ~0u; } 32 + }; 33 + 34 + enum class Op { 35 + Add, 36 + Sub, 37 + Mul, 38 + Div, 39 + Mod, 40 + FAdd, 41 + FSub, 42 + FMul, 43 + FDiv, 44 + And, 45 + Or, 46 + Xor, 47 + Shl, 48 + Shr, 49 + Eq, 50 + Ne, 51 + Lt, 52 + Le, 53 + Gt, 54 + Ge, 55 + Load, 56 + LoadShared, 57 + Store, 58 + StoreShared, 59 + Const, 60 + GetRootPtr, 61 + GetLocalInvocationId, 62 + GetThreadIdX, 63 + GetThreadIdY, 64 + GetThreadIdZ, 65 + GetWorkgroupIdX, 66 + GetWorkgroupIdY, 67 + GetWorkgroupIdZ, 68 + BackendIntrinsic, 69 + }; 70 + 71 + struct Inst { 72 + Op op; 73 + Type type; 74 + std::vector<Value> operands; 75 + 76 + // only for BackendIntrinsic 77 + uint32_t intrinsic_id; 78 + 79 + union { 80 + int64_t imm_i64; 81 + } data; 82 + 83 + struct { 84 + bool is_uniform = false; 85 + uint32_t phys_reg = ~0u; 86 + uint32_t last_use = 0xFFFFFFFF; 87 + } meta; 88 + }; 89 + 90 + class Module { 91 + public: 92 + std::vector<Inst> insts; 93 + 94 + Value emit(Inst inst) { 95 + uint32_t id = insts.size(); 96 + insts.push_back(inst); 97 + return Value{id}; 98 + } 99 + 100 + Inst &deref(Value v) { 101 + return insts[v.id]; 102 + } 103 + }; 104 + 105 + class Builder { 106 + public: 107 + Builder(Module& m) : mod(m) {} 108 + 109 + Value i32(int32_t imm); 110 + Value f32(float f); 111 + 112 + Value add(Value a, Value b); 113 + Value sub(Value a, Value b); 114 + Value mul(Value a, Value b); 115 + 116 + Value 
fadd(Value a, Value b); 117 + Value fmul(Value a, Value b); 118 + 119 + Value eq(Value a, Value b); 120 + Value lt(Value a, Value b); 121 + 122 + Value load(Value addr); 123 + Value load_shared(Value addr); 124 + void store(Value addr, Value data); 125 + void store_shared(Value addr, Value data); 126 + 127 + Value get_root_ptr(); 128 + 129 + Value get_local_invocation_id(); 130 + Value get_thread_id_x(); 131 + Value get_thread_id_y(); 132 + Value get_thread_id_z(); 133 + 134 + Value get_workgroup_id_x(); 135 + Value get_workgroup_id_y(); 136 + Value get_workgroup_id_z(); 137 + 138 + protected: 139 + Module& mod; 140 + }; 141 + 142 + std::string dump_module(Module &mod, std::function<std::string_view(uint32_t)> backend_intrinsic_to_string); 143 + 144 + void pass_normalize(Module& mod); 145 + void pass_eliminate_dead_code(Module &mod); 146 + 147 + // builder impl 148 + inline Value Builder::i32(int32_t imm) { 149 + return mod.emit(Inst{ 150 + .op = Op::Const, 151 + .type = Type::I32, 152 + .operands = {}, 153 + .data = {.imm_i64 = imm} 154 + }); 155 + } 156 + 157 + inline Value Builder::f32(float f) { 158 + uint32_t bits; 159 + memcpy(&bits, &f, sizeof(float)); 160 + return mod.emit(Inst{ 161 + .op = Op::Const, 162 + .type = Type::F32, 163 + .operands = {}, 164 + .data = {.imm_i64 = (int64_t)bits} 165 + }); 166 + } 167 + 168 + inline Value Builder::add(Value a, Value b) { 169 + return mod.emit(Inst{ 170 + .op = Op::Add, 171 + .type = Type::I32, 172 + .operands = {a, b} 173 + }); 174 + } 175 + 176 + inline Value Builder::sub(Value a, Value b) { 177 + return mod.emit(Inst{ 178 + .op = Op::Sub, 179 + .type = Type::I32, 180 + .operands = {a, b} 181 + }); 182 + } 183 + 184 + inline Value Builder::mul(Value a, Value b) { 185 + return mod.emit(Inst{ 186 + .op = Op::Mul, 187 + .type = Type::I32, 188 + .operands = {a, b} 189 + }); 190 + } 191 + 192 + inline Value Builder::fadd(Value a, Value b) { 193 + return mod.emit(Inst{ 194 + .op = Op::FAdd, 195 + .type = Type::F32, 196 + 
.operands = {a, b} 197 + }); 198 + } 199 + 200 + inline Value Builder::fmul(Value a, Value b) { 201 + return mod.emit(Inst{ 202 + .op = Op::FMul, 203 + .type = Type::F32, 204 + .operands = {a, b} 205 + }); 206 + } 207 + 208 + inline Value Builder::eq(Value a, Value b) { 209 + return mod.emit(Inst{ 210 + .op = Op::Eq, 211 + .type = Type::I32, 212 + .operands = {a, b} 213 + }); 214 + } 215 + 216 + inline Value Builder::lt(Value a, Value b) { 217 + return mod.emit(Inst{ 218 + .op = Op::Lt, 219 + .type = Type::I32, 220 + .operands = {a, b} 221 + }); 222 + } 223 + 224 + inline Value Builder::load(Value addr) { 225 + return mod.emit(Inst{ 226 + .op = Op::Load, 227 + .type = Type::I32, 228 + .operands = {addr}, 229 + }); 230 + } 231 + 232 + inline Value Builder::load_shared(Value addr) { 233 + return mod.emit(Inst{ 234 + .op = Op::LoadShared, 235 + .type = Type::I32, 236 + .operands = {addr}, 237 + }); 238 + } 239 + 240 + inline void Builder::store(Value addr, Value data) { 241 + mod.emit(Inst{ 242 + .op = Op::Store, 243 + .type = Type::Void, 244 + .operands = {addr, data}, 245 + }); 246 + } 247 + 248 + inline void Builder::store_shared(Value addr, Value data) { 249 + mod.emit(Inst{ 250 + .op = Op::StoreShared, 251 + .type = Type::Void, 252 + .operands = {addr, data}, 253 + }); 254 + } 255 + 256 + inline Value Builder::get_root_ptr() { 257 + return mod.emit(Inst{ 258 + .op = Op::GetRootPtr, 259 + .type = Type::Ptr, 260 + .operands = {} 261 + }); 262 + } 263 + 264 + inline Value Builder::get_local_invocation_id() { 265 + return mod.emit(Inst{ 266 + .op = Op::GetLocalInvocationId, 267 + .type = Type::I32, 268 + .operands = {} 269 + }); 270 + } 271 + 272 + inline Value Builder::get_thread_id_x() { 273 + return mod.emit(Inst{ 274 + .op = Op::GetThreadIdX, 275 + .type = Type::I32, 276 + .operands = {} 277 + }); 278 + } 279 + 280 + inline Value Builder::get_thread_id_y() { 281 + return mod.emit(Inst{ 282 + .op = Op::GetThreadIdY, 283 + .type = Type::I32, 284 + .operands = {} 
285 + }); 286 + } 287 + 288 + inline Value Builder::get_thread_id_z() { 289 + return mod.emit(Inst{ 290 + .op = Op::GetThreadIdZ, 291 + .type = Type::I32, 292 + .operands = {} 293 + }); 294 + } 295 + 296 + 297 + inline Value Builder::get_workgroup_id_x() { 298 + return mod.emit(Inst{ 299 + .op = Op::GetWorkgroupIdX, 300 + .type = Type::I32, 301 + .operands = {} 302 + }); 303 + } 304 + 305 + inline Value Builder::get_workgroup_id_y() { 306 + return mod.emit(Inst{ 307 + .op = Op::GetWorkgroupIdY, 308 + .type = Type::I32, 309 + .operands = {} 310 + }); 311 + } 312 + 313 + inline Value Builder::get_workgroup_id_z() { 314 + return mod.emit(Inst{ 315 + .op = Op::GetWorkgroupIdZ, 316 + .type = Type::I32, 317 + .operands = {} 318 + }); 319 + } 320 + 321 + }
+3 -1
kestrel/include/kestrel/interface.h
··· 32 32 void (*fn_cmd_dispatch)(KesCommandList command_list, kes_gpuptr_t data, uint32_t x, uint32_t y, uint32_t z); 33 33 void (*fn_cmd_dispatch_indirect)(KesCommandList command_list, kes_gpuptr_t data, kes_gpuptr_t command_addr); 34 34 KesSemaphore (*fn_create_semaphore)(KesDevice device, uint64_t value); 35 - int (*fn_wait_semaphore)(KesSemaphore semaphore, uint64_t value); 35 + int (*fn_wait_semaphore)(KesSemaphore semaphore, uint64_t value); 36 + KesShader (*fn_create_shader)(KesDevice device, void *module); 37 + void (*fn_bind_shader)(KesCommandList command_list, KesShader shader); 36 38 }; 37 39 38 40 /**
+17
kestrel/include/kestrel/kestrel.h
··· 36 36 typedef void *KesSemaphore; 37 37 38 38 /** 39 + * Opaque handle to a Shader. 40 + */ 41 + typedef void *KesShader; 42 + 43 + /** 39 44 * Structure describing a memory allocation. 40 45 * @sa kes_malloc 41 46 */ ··· 325 330 KesSemaphore kes_create_semaphore(KesDevice device, uint64_t value); 326 331 327 332 int kes_wait_semaphore(KesSemaphore semaphore, uint64_t value); 333 + 334 + /** 335 + * Create a Shader 336 + * @param device The device to create the shader on. 337 + * @param module An opaque handle to a gir::Module. 338 + * 339 + * @todo: TO BE REMOVED for proper spir-v or otherwise! 340 + * @note: Bad leaky C++ :^/ 341 + */ 342 + KesShader kes_create_shader(KesDevice device, void *module); 343 + 344 + void kes_bind_shader(KesCommandList command_list, KesShader shader); 328 345 329 346 #ifdef __cplusplus 330 347 }
+15 -1
kestrel/rt/api.cpp
··· 74 74 75 75 std::string lib_name = "libkes_" + gpu.driver_name + ".so"; 76 76 77 - std::string temp_path = std::string("/home/olle/hack/kestrel/build-dev/drivers/") + lib_name; 77 + std::string temp_path = std::string("/home/olle/hack/kestrel/build/drivers/") + lib_name; 78 78 79 79 printf("trying path: %s\n", temp_path.c_str()); 80 80 ··· 238 238 239 239 return dev->fns.fn_wait_semaphore(handle->sem, value); 240 240 } 241 + 242 + KesShader kes_create_shader(KesDevice pd, void *module) { 243 + auto *dev = reinterpret_cast<DeviceHandle *>(pd); 244 + auto shader = dev->fns.fn_create_shader(dev->drv_handle, module); 245 + 246 + return shader; 247 + } 248 + 249 + void kes_bind_shader(KesCommandList pcl, KesShader shader) { 250 + auto *clhandle = reinterpret_cast<CommandListHandle *>(pcl); 251 + auto *dev = clhandle->dev; 252 + 253 + dev->fns.fn_bind_shader(clhandle->cmdlist, shader); 254 + }
+14
test/examples/07_hello_dispatch/hello_dispatch.cpp
··· 1 1 #include <unistd.h> 2 2 #include <kestrel/kestrel.h> 3 + #include <kestrel/gir.h> 3 4 4 5 #include <stdio.h> 5 6 ··· 17 18 18 19 auto compute = kes_create_queue(dev, KesQueueTypeCompute); 19 20 21 + gir::Module mod; 22 + { 23 + gir::Builder gb(mod); 24 + auto rp = gb.get_root_ptr(); 25 + auto p = gb.add(rp, gb.mul(gb.get_local_invocation_id(), gb.i32(4))); 26 + auto x = gb.load(p); 27 + auto sum = gb.add(x, gb.i32(15)); 28 + gb.store(p, sum); 29 + } 30 + 31 + auto shader = kes_create_shader(dev, (void *)&mod); 32 + 20 33 auto cl = kes_start_recording(compute); 21 34 { 35 + kes_bind_shader(cl, shader); 22 36 kes_cmd_dispatch(cl, x.gpu, 32, 1, 1); 23 37 } 24 38