A Modern GPGPU API & wip linux RDNA2+ Driver
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

gir,amdgpu: progress

+319 -103
+3 -2
drivers/amdgpu/cmds.cpp
··· 305 305 gir::Module mod; 306 306 gir::Builder gb(mod); 307 307 auto rp = gb.get_root_ptr(); 308 - auto x = gb.load(gb.add(rp, gb.mul(gb.get_local_invocation_id(), gb.i32(4)))); 308 + auto p = gb.add(rp, gb.mul(gb.get_local_invocation_id(), gb.i32(4))); 309 + auto x = gb.load(p); 309 310 auto sum = gb.add(x, gb.i32(15)); 310 - gb.store(x, sum); 311 + gb.store(p, sum); 311 312 312 313 rdna2_compile(mod, alloc.cpu, alloc.gpu); 313 314 }
+230 -82
drivers/amdgpu/compiler/compiler.cpp
··· 6 6 #include <iomanip> 7 7 #include <string> 8 8 #include <fstream> 9 + #include <optional> 9 10 10 11 using namespace gir; 11 12 ··· 29 30 }; 30 31 31 32 void lower_simple(Compiler &); 33 + void lower_memory_loads(Compiler &); 32 34 void analyze_uniformity(Compiler &); 33 35 void allocate_registers(Compiler &); 34 36 void codegen(Compiler &); 35 37 36 38 enum class AmdIntrinsics : uint32_t { 37 - GlobalLoadDword, 38 - GlobalLoadDwordAddTI, // 12-bit imm offset, saddr, addr 39 + GlobalLoadDwordAddTID_Scale4, 40 + GlobalStoreDwordAddTID_Scale4, 39 41 }; 40 42 43 + void tmp_dump_shader(uint32_t *data, size_t code_size_bytes) { 44 + std::stringstream ss; 45 + ss << "shader_tmp.bin"; 46 + std::string filename = ss.str(); 47 + 48 + std::ofstream outfile(filename, std::ios::out | std::ios::binary); 49 + if (outfile.is_open()) { 50 + outfile.write(reinterpret_cast<const char*>(data), code_size_bytes); 51 + outfile.close(); 52 + } 53 + log("shader written to {}", filename); 54 + } 55 + 41 56 void rdna2_compile(gir::Module &mod, void *write_ptr, uint64_t base_addr) { 42 57 Compiler compiler(mod); 43 58 59 + gir::pass_normalize(mod); 60 + 44 61 lower_simple(compiler); 62 + lower_memory_loads(compiler); 45 63 analyze_uniformity(compiler); 64 + 65 + gir::pass_eliminate_dead_code(mod); 66 + 46 67 allocate_registers(compiler); 68 + 47 69 codegen(compiler); 48 70 49 71 auto code = compiler.as.values(); 50 72 auto code_size_bytes = code.size() * sizeof(uint32_t); 51 73 memcpy(write_ptr, code.data(), code_size_bytes); 52 74 53 - // dump the shader code to a file. 
54 - // @todo: wip 55 - { 56 - std::stringstream ss; 57 - ss << "shader_tmp.bin"; 58 - std::string filename = ss.str(); 59 - 60 - std::ofstream outfile(filename, std::ios::out | std::ios::binary); 61 - if (outfile.is_open()) { 62 - outfile.write(reinterpret_cast<const char*>(code.data()), code_size_bytes); 63 - outfile.close(); 64 - } 65 - log("shader written to {}", filename); 66 - exit(0); 67 - } 75 + tmp_dump_shader(code.data(), code_size_bytes); 68 76 } 69 77 70 78 void lower_simple(Compiler &cc) { ··· 80 88 // @todo: handle local_invocation_id. 81 89 // There are many ways to do this, but I believe we need to lower it 82 90 // into a pack operation of vgpr0,1,2. But I'm not entirely sure. 91 + if (inst.op == gir::Op::GetLocalInvocationId) { 92 + // @todo: stop assuming 1d dispatch. 93 + inst.meta.phys_reg = 0; // vgpr0 94 + inst.meta.is_uniform = false; 95 + } 83 96 } 84 97 } 85 98 99 + struct AddressPattern { 100 + Value base_ptr; 101 + bool is_tid_scaled_by_4 = false; 102 + }; 103 + 104 + std::optional<AddressPattern> match_address_pattern(Compiler& cc, const Inst& addr) { 105 + AddressPattern pat; 106 + 107 + // Match: ptr 108 + if (addr.type == Type::Ptr && addr.op != Op::Add) { 109 + pat.base_ptr = addr.operands[0]; 110 + return pat; 111 + } 112 + 113 + // Match: ptr + offset 114 + if (addr.op == Op::Add) { 115 + auto& lhs = cc.mod.deref(addr.operands[0]); 116 + auto& rhs = cc.mod.deref(addr.operands[1]); 117 + 118 + // After normalization, ptr should be on left 119 + if (lhs.type != Type::Ptr) { 120 + log("not normalized?"); 121 + return std::nullopt; // Shouldn't happen after normalization 122 + } 123 + 124 + pat.base_ptr = addr.operands[0]; 125 + 126 + // Check if offset is tid * 4 127 + if (rhs.op == Op::Mul) { 128 + auto& mul_lhs = cc.mod.deref(rhs.operands[0]); 129 + auto& mul_rhs = cc.mod.deref(rhs.operands[1]); 130 + 131 + if (mul_lhs.op == Op::GetLocalInvocationId && 132 + mul_rhs.op == Op::Const && 133 + mul_rhs.data.imm_i64 == 4) { 134 + 
pat.is_tid_scaled_by_4 = true; 135 + return pat; 136 + } 137 + } 138 + 139 + // Other offset patterns not yet supported 140 + log("other offset pattern?"); 141 + return std::nullopt; 142 + } 143 + 144 + log("what? op: {}", (int)addr.op); 145 + return std::nullopt; 146 + } 147 + 86 148 void lower_memory_loads(Compiler &cc) { 87 149 // device memory loads should become global_load_dword or similar. 88 150 // these kinds of instructions support a base ptr + imm offset or 89 151 // base sgpr ptr + vgpr offset. 90 152 91 - // if we detect such a pattern we can replace with these opcodes. 92 153 // global_load_dword: saddr + voff (+ imm offset) 93 154 // global_load_dword: vaddr (+ imm offset) 94 155 // global_load_dword_addtid: saddr (+ imm offset) + 4 * local_invocation_id 95 156 for (uint32_t i = 0; i < cc.mod.insts.size(); ++i) { 96 157 auto &inst = cc.mod.insts[i]; 97 - if (inst.op == gir::Op::Load) { 98 - auto addr = cc.mod.deref(inst.operands[0]); 99 - 100 - if (addr.meta.is_uniform) { 101 - not_implemented("lower_memory_loads: cannot handle Op::Load with uniform address"); 158 + if (inst.op == Op::Load) { 159 + auto& addr = cc.mod.deref(inst.operands[0]); 160 + auto pat = match_address_pattern(cc, addr); 161 + if (!pat) { 162 + not_implemented("lower_memory_loads: unsupported load address pattern"); 102 163 } 103 164 104 - if (addr.op == gir::Op::Add) { 105 - // we have detected an offset! 106 - // @todo: I think we need some form of canonicalization 107 - // here so the check can be more trivial. 
165 + auto& base = cc.mod.deref(pat->base_ptr); 108 166 109 - auto lhs = cc.mod.deref(addr.operands[0]); 110 - auto rhs = cc.mod.deref(addr.operands[1]); 167 + if (!base.meta.is_uniform) { 168 + not_implemented("lower_memory_loads: non-uniform base pointer in Load not yet supported"); 169 + } 111 170 112 - assert(lhs.type == gir::Type::Ptr, "lower_memory_loads: invalid operand in load(x + y)"); 171 + if (pat->is_tid_scaled_by_4) { 172 + inst.op = Op::BackendIntrinsic; 173 + inst.intrinsic_id = (uint32_t)AmdIntrinsics::GlobalLoadDwordAddTID_Scale4; 174 + inst.operands = {pat->base_ptr}; 175 + } else { 176 + not_implemented("lower_memory_loads: simple loads not yet implemented"); 177 + } 178 + } else if (inst.op == Op::Store) { 179 + auto& addr = cc.mod.deref(inst.operands[0]); 180 + auto pat = match_address_pattern(cc, addr); 181 + if (!pat) { 182 + not_implemented("lower_memory_loads: unsupported Store address pattern"); 183 + } 113 184 114 - if (rhs.op == gir::Op::Mul) { 115 - auto lhs2 = cc.mod.deref(rhs.operands[0]); 116 - auto rhs2 = cc.mod.deref(rhs.operands[1]); 185 + auto& base = cc.mod.deref(pat->base_ptr); 117 186 118 - if (lhs2.op == gir::Op::GetLocalInvocationId && rhs2.op == gir::Op::Const && rhs2.data.imm_i64 == 4) { 119 - // replace instruction 120 - auto args = std::vector<Value>{lhs, rhs2}; 121 - inst = gir::Inst{ 122 - .op = gir::Op::BackendIntrinsic, 123 - .type = gir::Type::I32, 124 - .operands = args, 125 - .intrinsic_id = AmdIntrinsics::GlobalLoadDwordAddTI 126 - } 127 - } 128 - } 187 + if (!base.meta.is_uniform) { 188 + not_implemented("lower_memory_loads: non-uniform base pointer in Store not yet supported"); 189 + } 129 190 191 + if (pat->is_tid_scaled_by_4) { 192 + inst.op = Op::BackendIntrinsic; 193 + inst.intrinsic_id = (uint32_t)AmdIntrinsics::GlobalStoreDwordAddTID_Scale4; 194 + inst.operands = {pat->base_ptr, inst.operands[1]}; 130 195 } else { 131 - 196 + not_implemented("lower_memory_loads: simple stores not yet implemented"); 132 
197 } 133 198 } 134 199 } ··· 211 276 } 212 277 213 278 void codegen(Compiler &cc) { 214 - 215 279 for (auto &inst : cc.mod.insts) { 216 280 switch (inst.op) { 217 281 case gir::Op::BackendIntrinsic: { 218 282 switch(inst.intrinsic_id) { 219 - case (uint32_t)AmdIntrinsics::GlobalLoadDwordAddTI: { 220 - // @todo: support offset constants 221 - //assert(cc.mod.deref(inst.operands[0]).op == gir::Op::Const, "offset must be const"); 222 - //auto offset = mod.deref(inst.operands[0]).data.imm_i64; 223 - auto offset = 0; 283 + case (uint32_t)AmdIntrinsics::GlobalLoadDwordAddTID_Scale4: { 284 + auto saddr = get_ssrc(cc, inst.operands[0]); 285 + 286 + // @todo: how do we know what to do about the cache flags? 287 + cc.as.global( 288 + RDNA2Assembler::global_opcode::global_load_dword_addtid, 289 + false, false, false, false, 290 + 0, // 12-bit immediate offset (0 for now) 291 + 0, // vaddr (0 = use addtid mode) 292 + (uint8_t)saddr, // saddr base pointer 293 + inst.meta.phys_reg, // vdst destination register 294 + 0 // unused in addtid mode 295 + ); 296 + 297 + // @todo: wait for load to complete. 
this is very conservative 298 + cc.as.sopp(RDNA2Assembler::sopp_opcode::s_waitcnt, 0x3F70); 299 + } break; 300 + case (uint32_t)AmdIntrinsics::GlobalStoreDwordAddTID_Scale4: { 301 + auto saddr = get_ssrc(cc, inst.operands[0]); 302 + auto& data = cc.mod.deref(inst.operands[1]); 224 303 225 - auto saddr = get_ssrc(cc, inst.operands[1]); 226 - auto addr = get_vsrc(cc, inst.operands[2]); 227 - cc.as.global(RDNA2Assembler::global_opcode::global_load_dword_addtid, false, false, false, false, 228 - offset, 0, (uint8_t)saddr, inst.meta.phys_reg, (uint8_t)addr 304 + if (data.meta.is_uniform) { 305 + not_implemented("codegen: GlobalStoreDwordAddTI with uniform data (need v_mov)"); 306 + } 307 + 308 + cc.as.global( 309 + RDNA2Assembler::global_opcode::global_store_dword_addtid, 310 + false, false, false, false, 311 + 0, 0, (uint8_t)saddr, data.meta.phys_reg, 0 229 312 ); 230 313 } break; 314 + default: 315 + not_implemented("codegen: unknown backend intrinsic: {}", inst.intrinsic_id); 231 316 } 232 317 } break; 233 - } 234 - } 235 318 236 - /* 237 - for (auto& inst : mod.insts) { 238 - switch (inst.op) { 239 - case ADD: 240 - if (mod.values[inst.dest.id].is_uniform) 241 - as.sop2(sop2_opcode::s_add_u32, mod.values[inst.dest.id].phys_reg, 242 - mod.values[inst.args[0].id].phys_reg, mod.values[inst.args[1].id].phys_reg); 243 - else 244 - as.vop2(vop2_opcode::v_add_nc_u32, mod.values[inst.dest.id].phys_reg, 245 - mod.values[inst.args[0].id].phys_reg, mod.values[inst.args[1].id].phys_reg); 246 - break; 247 - case LOAD_GLOBAL: 248 - as.global(global_opcode::global_load_dword, inst.imm, 249 - mod.values[inst.dest.id].phys_reg, mod.values[inst.args[0].id].phys_reg, 0); 250 - break; 251 - case STORE_GLOBAL: 252 - as.global(global_opcode::global_store_dword, inst.imm, 253 - 0, mod.values[inst.args[0].id].phys_reg, mod.values[inst.args[2].id].phys_reg); 254 - break; 255 - case V_MOV_S2V: 256 - as.vop2(vop2_opcode::v_mov_b32, mod.values[inst.dest.id].phys_reg, 257 - 
mod.values[inst.args[0].id].phys_reg, 0); 258 - break; 319 + case gir::Op::Store: { 320 + // @todo: we currently assume all stores are global, but this may not be the case. 321 + // I am not sure how NIR handles this, nor how other backends have local caches (LDS & GDS). 322 + auto& addr = cc.mod.deref(inst.operands[0]); 323 + auto& data = cc.mod.deref(inst.operands[1]); 324 + 325 + if (!addr.meta.is_uniform) { 326 + not_implemented("codegen: Store with non-uniform address not yet supported"); 327 + } 328 + 329 + if (data.meta.is_uniform) { 330 + not_implemented("codegen: Store with uniform data not yet supported (need v_mov to copy sgpr to vgpr)"); 331 + } 332 + 333 + if (data.type != gir::Type::I32 && data.type != gir::Type::F32) { 334 + not_implemented("codegen: Store only supports I32/F32 for now"); 335 + } 336 + 337 + // global_store_dword: saddr + vdata 338 + auto saddr = get_ssrc(cc, inst.operands[0]); 339 + 340 + cc.as.global( 341 + RDNA2Assembler::global_opcode::global_store_dword, 342 + true, true, false, true, 343 + 0, // 12-bit immediate offset 344 + 0, // vdst (unused for stores) 345 + (uint8_t)saddr, // saddr base pointer 346 + data.meta.phys_reg, // vdata - data to store 347 + 0 // vaddr (0 = use saddr only) 348 + ); 349 + } break; 350 + 351 + case gir::Op::Add: { 352 + if (inst.type == gir::Type::I32) { 353 + if (inst.meta.is_uniform) { 354 + // Scalar add: s_add_u32 355 + auto src0 = get_ssrc(cc, inst.operands[0]); 356 + auto src1 = get_ssrc(cc, inst.operands[1]); 357 + cc.as.sop2( 358 + RDNA2Assembler::sop2_opcode::s_add_u32, 359 + (RDNA2Assembler::ssrc)inst.meta.phys_reg, 360 + src0, 361 + src1 362 + ); 363 + } else { 364 + // Vector add: v_add_nc_u32 (non-carry version) 365 + // vsrc1 MUST be a VGPR, src0 can be anything (SGPR, VGPR, const) 366 + auto& op0 = cc.mod.deref(inst.operands[0]); 367 + auto& op1 = cc.mod.deref(inst.operands[1]); 368 + 369 + // Ensure VGPR is in vsrc1 position by swapping if needed 370 + bool op0_is_vgpr = 
!op0.meta.is_uniform && op0.op != gir::Op::Const; 371 + bool op1_is_vgpr = !op1.meta.is_uniform && op1.op != gir::Op::Const; 372 + 373 + if (!op0_is_vgpr && !op1_is_vgpr) { 374 + not_implemented("codegen: v_add_nc_u32 requires at least one VGPR operand"); 375 + } 376 + 377 + // Swap so VGPR is always in vsrc1 position 378 + if (op0_is_vgpr && !op1_is_vgpr) { 379 + cc.as.vop2( 380 + RDNA2Assembler::vop2_opcode::v_add_nc_u32, 381 + inst.meta.phys_reg, 382 + get_vsrc(cc, inst.operands[1]), // src0: can be const/sgpr 383 + op0.meta.phys_reg // vsrc1: VGPR 384 + ); 385 + } else { 386 + cc.as.vop2( 387 + RDNA2Assembler::vop2_opcode::v_add_nc_u32, 388 + inst.meta.phys_reg, 389 + get_vsrc(cc, inst.operands[0]), // src0: can be const/sgpr/vgpr 390 + op1.meta.phys_reg // vsrc1: VGPR 391 + ); 392 + } 393 + } 394 + } else if (inst.type == gir::Type::Ptr) { 395 + not_implemented("codegen: pointer addition (64-bit) not yet implemented"); 396 + } else { 397 + not_implemented("codegen: Add not implemented for type: {}", (int)inst.type); 398 + } 399 + } break; 400 + 401 + case gir::Op::Const: 402 + case gir::Op::GetRootPtr: 403 + case gir::Op::GetLocalInvocationId: 404 + // Skip metadata operations and constants 405 + break; 406 + default: 407 + not_implemented("codegen: operation not yet implemented: {}", (int)inst.op); 408 + break; 259 409 } 260 410 } 261 - */ 262 - 263 411 264 412 cc.as.sopp(RDNA2Assembler::sopp_opcode::s_endpgm, 0); 265 413
+4
drivers/amdgpu/compiler/rdna2_asm.h
··· 548 548 flat_impl((uint8_t)op, slc, glc, 2, lds, dlc, offset, vdst, saddr, data, addr); 549 549 } 550 550 551 + /* Emit s_waitcnt with the RDNA2 (gfx10) simm16 layout: vmcnt[3:0] -> bits 3:0, expcnt[2:0] -> bits 6:4, lgkmcnt[5:0] -> bits 13:8, vmcnt[5:4] -> bits 15:14. waitcnt(0, 7, 63) therefore encodes 0x3F70, i.e. "wait for all vector memory ops only". */ inline void waitcnt(uint8_t vmcnt, uint8_t expcnt, uint8_t lgkmcnt) { 552 + sopp(sopp_opcode::s_waitcnt, ((vmcnt & 0x30) << 10) | ((lgkmcnt & 0x3F) << 8) | ((expcnt & 0x7) << 4) | (vmcnt & 0xF)); 553 + } 554 + 551 555 std::vector<uint32_t> &values() { 552 556 return m_values; 553 557 }
+3 -8
drivers/common/gir/gir.h
··· 24 24 Mul, 25 25 Div, 26 26 Mod, 27 - 28 27 FAdd, 29 28 FSub, 30 29 FMul, 31 30 FDiv, 32 - 33 31 And, 34 32 Or, 35 33 Xor, 36 34 Shl, 37 35 Shr, 38 - 39 36 Eq, 40 37 Ne, 41 38 Lt, 42 39 Le, 43 40 Gt, 44 41 Ge, 45 - 46 42 Load, 47 43 Store, 48 - 49 44 Const, 50 - 51 45 GetRootPtr, 52 - 53 46 GetLocalInvocationId, 54 47 GetThreadIdX, 55 48 GetThreadIdY, ··· 57 50 GetWorkgroupIdX, 58 51 GetWorkgroupIdY, 59 52 GetWorkgroupIdZ, 60 - 61 53 BackendIntrinsic, 62 54 }; 63 55 ··· 129 121 protected: 130 122 Module& mod; 131 123 }; 124 + 125 + void pass_normalize(Module& mod); 126 + void pass_eliminate_dead_code(Module &mod); 132 127 133 128 };
+77
drivers/common/gir/gir_normalize.cpp
··· 1 + #include "gir.h" 2 + 3 + namespace gir { 4 + 5 + // Canonicalize address computations so pointer is always on the left side of Add 6 + void normalize_address_computation(Module& mod) { 7 + for (auto& inst : mod.insts) { 8 + if (inst.op != Op::Add || inst.type != Type::Ptr) continue; 9 + 10 + auto& lhs = mod.deref(inst.operands[0]); 11 + auto& rhs = mod.deref(inst.operands[1]); 12 + 13 + // Swap if pointer is on the right 14 + if (lhs.type != Type::Ptr && rhs.type == Type::Ptr) { 15 + std::swap(inst.operands[0], inst.operands[1]); 16 + } 17 + } 18 + } 19 + 20 + void pass_normalize(Module &mod) { 21 + normalize_address_computation(mod); 22 + } 23 + 24 + void pass_eliminate_dead_code(Module& mod) { 25 + std::vector<bool> is_live(mod.insts.size(), false); 26 + 27 + // Mark all instructions with side effects as live (roots) 28 + for (size_t i = 0; i < mod.insts.size(); ++i) { 29 + auto& inst = mod.insts[i]; 30 + // Instructions with side effects are roots 31 + if (inst.op == Op::Store || inst.op == Op::BackendIntrinsic) { 32 + is_live[i] = true; 33 + } 34 + } 35 + 36 + // Propagate liveness backwards through dependencies 37 + // Keep iterating until no new instructions are marked live 38 + bool changed = true; 39 + while (changed) { 40 + changed = false; 41 + for (size_t i = 0; i < mod.insts.size(); ++i) { 42 + if (!is_live[i]) continue; 43 + 44 + // Mark all operands of live instructions as live 45 + for (auto& op : mod.insts[i].operands) { 46 + if (op.is_inst() && !is_live[op.id]) { 47 + is_live[op.id] = true; 48 + changed = true; 49 + } 50 + } 51 + } 52 + } 53 + 54 + // Build new instruction list with value remapping 55 + std::vector<Inst> new_insts; 56 + std::vector<uint32_t> value_map(mod.insts.size()); 57 + 58 + for (size_t i = 0; i < mod.insts.size(); ++i) { 59 + if (is_live[i]) { 60 + auto inst = mod.insts[i]; 61 + 62 + // Remap operands to new instruction indices 63 + for (auto& op : inst.operands) { 64 + if (op.is_inst()) { 65 + op.id = 
value_map[op.id]; 66 + } 67 + } 68 + 69 + value_map[i] = new_insts.size(); 70 + new_insts.push_back(inst); 71 + } 72 + } 73 + 74 + mod.insts = std::move(new_insts); 75 + } 76 + 77 + }
+2 -11
test/examples/07_hello_dispatch/hello_dispatch.cpp
··· 3 3 4 4 #include <stdio.h> 5 5 6 - struct DispatchArguments { 7 - uint64_t buffer; 8 - }; 9 - 10 6 int main(void) { 11 7 auto dev = kes_create(); 12 8 13 - auto x = kes_malloc(dev, 1024, 4, KesMemoryDefault); 14 - auto y = kes_malloc(dev, sizeof(DispatchArguments), 8, KesMemoryDefault); 9 + auto x = kes_malloc(dev, sizeof(uint32_t) * 128, 8, KesMemoryReadback); 15 10 16 11 auto sem = kes_create_semaphore(dev, 0); 17 12 18 13 printf("x: %p %p\n", (void *)x.cpu, (void *)x.gpu); 19 - printf("y: %p %p\n", (void *)y.cpu, (void *)y.gpu); 20 - 21 - DispatchArguments *args = (DispatchArguments *)y.cpu; 22 - args->buffer = x.gpu; 23 14 24 15 auto compute = kes_create_queue(dev, KesQueueTypeCompute); 25 16 26 17 auto cl = kes_start_recording(compute); 27 18 { 28 - kes_cmd_dispatch(cl, y.gpu, 32, 1, 1); 19 + kes_cmd_dispatch(cl, x.gpu, 32, 1, 1); 29 20 } 30 21 31 22 kes_submit(compute, cl, sem, 1);