gir+amdgpu: add support for workgroup sizes · logdahl.net/kestrel@b70b100

+14 -6

drivers/amdgpu/cmds.cpp

··· 285 285 assert(module, "amdgpu_create_shader: module handle invalid: {}", (void *)module); 286 286 287 287 // Fixed for the Root Pointer ABI 288 - auto num_user_sgprs = 2; 288 + uint32_t num_user_sgprs = 2; 289 289 290 290 auto shader = new Shader; 291 291 ··· 302 302 303 303 auto ordered = false; 304 304 auto wave_size = 32; 305 - auto waves_per_threadgroup = 1; 305 + 306 + auto workgroup_size_x = module->workgroup_size_x; 307 + auto workgroup_size_y = module->workgroup_size_y; 308 + auto workgroup_size_z = module->workgroup_size_z; 309 + 310 + uint32_t total_threads = workgroup_size_x * workgroup_size_y * workgroup_size_z; 311 + auto waves_per_threadgroup = (total_threads + wave_size - 1) / wave_size; 312 + 306 313 auto max_waves_per_sh = 0x3FF; 307 314 auto threadgroups_per_cu = 1; 308 315 ··· 319 326 shader->config.user_sgpr_count = num_user_sgprs; 320 327 shader->info.ordered = ordered; 321 328 shader->info.wave_size = wave_size; 322 - shader->info.block_size[0] = 32; 323 - shader->info.block_size[1] = 1; 324 - shader->info.block_size[2] = 1; 329 + shader->info.block_size[0] = workgroup_size_x; 330 + shader->info.block_size[1] = workgroup_size_y; 331 + shader->info.block_size[2] = workgroup_size_z; 325 332 shader->va = alloc.gpu; 326 333 shader->info.hw_stage = HwStage::Compute; 327 334 ··· 343 350 | S_00B12C_TRAP_PRESENT(trap_present) 344 351 | S_00B84C_TGID_X_EN(1) 345 352 | S_00B84C_TGID_Y_EN(1) 346 - | S_00B84C_TGID_Z_EN(1); 353 + | S_00B84C_TGID_Z_EN(1) 354 + | S_00B84C_TG_SIZE_EN(1); 347 355 348 356 shader->config.pgm_rsrc3 = 349 357 S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);

+1 -7

drivers/amdgpu/compiler/compiler.cpp

··· 35 35 36 36 void lower_simple(Compiler &); 37 37 void lower_memory_loads(Compiler &); 38 + 38 39 void analyze_uniformity(Compiler &); 39 40 void codegen(Compiler &); 40 41 ··· 121 122 inst.meta.phys_reg = cc.shdr.num_user_sgprs + 2; 122 123 inst.meta.is_uniform = true; 123 124 } 124 - 125 - // @todo: handle local_invocation_id. 126 - // There are many ways to do this, but I believe we need to lower it 127 - // into a pack operation of vgpr0,1,2. But I'm not entirely sure. 128 - 129 - // @todo: handle global invocation ids. 130 125 } 131 126 } 132 127 ··· 456 451 case gir::Op::LocalInvocationIdX: 457 452 case gir::Op::LocalInvocationIdY: 458 453 case gir::Op::LocalInvocationIdZ: 459 - case gir::Op::LocalInvocationIndex: 460 454 case gir::Op::WorkgroupIdX: 461 455 case gir::Op::WorkgroupIdY: 462 456 case gir::Op::WorkgroupIdZ:

+4

kestrel/include/kestrel/gir.h

··· 106 106 public: 107 107 std::vector<Inst> insts; 108 108 109 + uint32_t workgroup_size_x = 1; 110 + uint32_t workgroup_size_y = 1; 111 + uint32_t workgroup_size_z = 1; 112 + 109 113 Value emit(Inst inst) { 110 114 uint32_t id = insts.size(); 111 115 insts.push_back(inst);

+2 -1

test/examples/07_hello_dispatch/hello_dispatch.cpp

··· 19 19 auto compute = kes_create_queue(dev, KesQueueTypeCompute); 20 20 21 21 gir::Module mod; 22 + mod.workgroup_size_x = 32; 22 23 { 23 24 gir::Builder gb(mod); 24 25 auto rp = gb.root_ptr(); ··· 33 34 auto cl = kes_start_recording(compute); 34 35 { 35 36 kes_bind_shader(cl, shader); 36 - kes_cmd_dispatch(cl, x.gpu, 32, 1, 1); 37 + kes_cmd_dispatch(cl, x.gpu, 1, 1, 1); 37 38 } 38 39 39 40 kes_submit(compute, cl, sem, 1);

Configure Feed

Configure Feed