A Modern GPGPU API & WIP Linux RDNA2+ Driver
rdna driver linux gpu
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

gir: wip

+191 -31
+2 -2
drivers/amdgpu/cmds.cpp
··· 305 305 gir::Module mod; 306 306 gir::Builder gb(mod); 307 307 auto rp = gb.get_root_ptr(); 308 - auto x = gb.load(rp, gb.mul(gb.i32(4), gb.get_thread_id_x())); 308 + auto x = gb.load(gb.add(rp, gb.mul(gb.get_local_invocation_id(), gb.i32(4)))); 309 309 auto sum = gb.add(x, gb.i32(15)); 310 - gb.store(rp, sum, x); 310 + gb.store(x, sum); 311 311 312 312 rdna2_compile(mod, alloc.cpu, alloc.gpu); 313 313 }
+158 -20
drivers/amdgpu/compiler/compiler.cpp
··· 28 28 RDNA2Assembler as; 29 29 }; 30 30 31 + void lower_simple(Compiler &); 31 32 void analyze_uniformity(Compiler &); 32 - void analyze_liveness(Compiler &); 33 33 void allocate_registers(Compiler &); 34 34 void codegen(Compiler &); 35 35 36 + enum class AmdIntrinsics : uint32_t { 37 + GlobalLoadDword, 38 + GlobalLoadDwordAddTI, // 12-bit imm offset, saddr, addr 39 + }; 40 + 36 41 void rdna2_compile(gir::Module &mod, void *write_ptr, uint64_t base_addr) { 37 42 Compiler compiler(mod); 38 43 39 - analyze_liveness(compiler); 44 + lower_simple(compiler); 45 + analyze_uniformity(compiler); 40 46 allocate_registers(compiler); 41 - analyze_uniformity(compiler); 42 47 codegen(compiler); 43 48 44 49 auto code = compiler.as.values(); ··· 49 54 // @todo: wip 50 55 { 51 56 std::stringstream ss; 52 - ss << "shader_" << std::hex << reinterpret_cast<uintptr_t>(write_ptr) << ".bin"; 57 + ss << "shader_tmp.bin"; 53 58 std::string filename = ss.str(); 54 59 55 60 std::ofstream outfile(filename, std::ios::out | std::ios::binary); ··· 62 67 } 63 68 } 64 69 65 - // @todo: stuff like this is pretty general. 66 - void analyze_liveness(Compiler &cc) { 70 + void lower_simple(Compiler &cc) { 67 71 for (uint32_t i = 0; i < cc.mod.insts.size(); ++i) { 68 - for (auto arg : cc.mod.insts[i].operands) { 69 - if (arg.id != 0xFFFFFFFF) cc.mod.insts[arg.id].meta.last_use = i; 72 + auto &inst = cc.mod.insts[i]; 73 + if (inst.op == gir::Op::GetRootPtr) { 74 + // root pointer is passed as the user sgprs. 75 + // we don't actually have to do anything. 76 + inst.meta.phys_reg = 0; 77 + inst.meta.is_uniform = true; 70 78 } 79 + 80 + // @todo: handle local_invocation_id. 81 + // There are many ways to do this, but I believe we need to lower it 82 + // into a pack operation of vgpr0,1,2. But I'm not entirely sure. 71 83 } 72 84 } 73 85 74 - void analyze_uniformity(Compiler &cc) { 75 - // Simple propagation: Root ptr is uniform. 
76 - /* 77 - for (auto& inst : cc.mod.insts) { 78 - bool divergent = false; 79 - for (auto arg : inst.args) { 80 - if (arg.id != 0xFFFFFFFF && !cc.mod.values[arg.id].is_uniform) divergent = true; 86 + void lower_memory_loads(Compiler &cc) { 87 + // device memory loads should become global_load_dword or similar. 88 + // these kinds of instructions support a base ptr + imm offset or 89 + // base sgpr ptr + vgpr offset. 90 + 91 + // if we detect such a pattern we can replace with these opcodes. 92 + for (uint32_t i = 0; i < cc.mod.insts.size(); ++i) { 93 + auto &inst = cc.mod.insts[i]; 94 + if (inst.op == gir::Op::Load) { 95 + auto addr = cc.mod.deref(inst.operands[0]); 96 + 97 + if (addr.meta.is_uniform) { 98 + not_implemented("lower_memory_loads: cannot handle Op::Load with uniform address"); 99 + } 100 + 101 + if (addr.op == gir::Op::Add) { 102 + // we have detected an offset! 103 + // @todo: I think we need some form of canonicalization 104 + // here so the check can be more trivial. 105 + 106 + auto lhs = cc.mod.deref(addr.operands[0]); 107 + auto rhs = cc.mod.deref(addr.operands[1]); 108 + 109 + assert(lhs.type == gir::Type::Ptr, "lower_memory_loads: invalid operand in load(x + y)"); 110 + 111 + if (rhs.op == gir::Op::Mul) { 112 + auto lhs2 = cc.mod.deref(rhs.operands[0]); 113 + auto rhs2 = cc.mod.deref(rhs.operands[1]); 114 + 115 + if (lhs2.op == gir::Op::GetLocalInvocationId && rhs2.op == gir::Op::Const && rhs2.data.imm_i64 == 4) { 116 + // replace instruction 117 + auto args = std::vector<Value>{lhs, rhs2}; 118 + inst = gir::Inst{ 119 + .op = gir::Op::BackendIntrinsic, 120 + .type = gir::Type::I32, 121 + .operands = args, 122 + .intrinsic_id = AmdIntrinsics::GlobalLoadDwordAddTI 123 + } 124 + } 125 + } 126 + 127 + } else { 128 + 129 + } 81 130 } 82 - if (inst.op == LOAD_GLOBAL) divergent = true; // Memory reads are divergent 83 - if (inst.dest.id != 0xFFFFFFFF) cc.mod.values[inst.dest.id].is_uniform = !divergent; 84 131 } 85 - */ 132 + } 133 + 134 + void 
analyze_uniformity(Compiler &cc) { 135 + 136 + } 137 + 138 + uint32_t required_regs_for_type(gir::Type t) { 139 + switch(t) { 140 + case gir::Type::I32: 141 + case gir::Type::F32: 142 + return 1; 143 + case gir::Type::Ptr: 144 + return 2; 145 + } 146 + 147 + return 0; 86 148 } 87 149 88 150 void allocate_registers(Compiler &cc) { 89 - // linear register allocation. We need to note the DS and determine how many 90 - // contiguous sgpr/vgprs are needed for that. 151 + uint32_t sgpr_start = 6; 152 + uint32_t vgpr_start = 3; 153 + // @todo: improve register allocation and also reuse non-live 154 + // registers. 155 + for (auto &inst : cc.mod.insts) { 156 + if (inst.meta.phys_reg != ~0u) continue; 157 + 158 + // @todo: additionally, some types require a kind of align/size difference 159 + // (flat_load_dword16 needs 4 align, 16 size). 160 + // find next one / seq of regs of this kind 161 + auto count = required_regs_for_type(inst.type); 162 + 163 + if (inst.meta.is_uniform) { 164 + inst.meta.phys_reg = sgpr_start; 165 + sgpr_start += count; 166 + } else { 167 + inst.meta.phys_reg = vgpr_start; 168 + vgpr_start += count; 169 + } 170 + } 171 + } 172 + 173 + inline RDNA2Assembler::vsrc get_vsrc(Compiler& c, gir::Value v) { 174 + auto& inst = c.mod.deref(v); 175 + 176 + // Handle inline constants 177 + if (inst.op == gir::Op::Const) { 178 + int32_t imm = inst.data.imm_i64; 179 + if (imm == 0) return RDNA2Assembler::vsrc::zero; 180 + if (imm == -1) return RDNA2Assembler::vsrc::int_neg_1; 181 + if (imm >= 1 && imm <= 64) return (RDNA2Assembler::vsrc)((uint)RDNA2Assembler::vsrc::int_pos_1 + imm - 1); 182 + if (imm < 0 && imm >= -16) return (RDNA2Assembler::vsrc)((uint)RDNA2Assembler::vsrc::int_neg_1 - imm - 1); 183 + return RDNA2Assembler::vsrc::literal_constant; 184 + } 185 + 186 + auto reg = inst.meta.phys_reg; 187 + if (inst.meta.is_uniform) { 188 + return (RDNA2Assembler::vsrc)((uint)RDNA2Assembler::vsrc::sgpr0 + reg); 189 + } else { 190 + return 
(RDNA2Assembler::vsrc)((uint)RDNA2Assembler::vsrc::vgpr0 + reg); 191 + } 192 + } 193 + 194 + inline RDNA2Assembler::ssrc get_ssrc(Compiler& c, gir::Value v) { 195 + auto& inst = c.mod.deref(v); 196 + 197 + if (inst.op == gir::Op::Const) { 198 + int32_t imm = inst.data.imm_i64; 199 + if (imm == 0) return RDNA2Assembler::ssrc::src_zero; 200 + if (imm == -1) return RDNA2Assembler::ssrc::int_neg_1; 201 + if (imm >= 1 && imm <= 64) return (RDNA2Assembler::ssrc)((uint)RDNA2Assembler::ssrc::int_pos_1 + imm - 1); 202 + if (imm < 0 && imm >= -16) return (RDNA2Assembler::ssrc)((uint)RDNA2Assembler::ssrc::int_neg_1 - imm - 1); 203 + return RDNA2Assembler::ssrc::literal_constant; 204 + } 205 + 206 + assert(inst.meta.is_uniform, "Cannot use non-uniform value as ssrc"); 207 + return (RDNA2Assembler::ssrc)((uint)RDNA2Assembler::ssrc::sgpr0 + inst.meta.phys_reg); 91 208 } 92 209 93 210 void codegen(Compiler &cc) { 211 + 212 + for (auto &inst : cc.mod.insts) { 213 + switch (inst.op) { 214 + case gir::Op::BackendIntrinsic: { 215 + switch(inst.intrinsic_id) { 216 + case (uint32_t)AmdIntrinsics::GlobalLoadDwordAddTI: { 217 + // @todo: support offset constants 218 + //assert(cc.mod.deref(inst.operands[0]).op == gir::Op::Const, "offset must be const"); 219 + //auto offset = mod.deref(inst.operands[0]).data.imm_i64; 220 + auto offset = 0; 221 + 222 + auto saddr = get_ssrc(cc, inst.operands[1]); 223 + auto addr = get_vsrc(cc, inst.operands[2]); 224 + cc.as.global(RDNA2Assembler::global_opcode::global_load_dword_addtid, false, false, false, false, 225 + offset, 0, (uint8_t)saddr, inst.meta.phys_reg, (uint8_t)addr 226 + ); 227 + } break; 228 + } 229 + } break; 230 + } 231 + } 94 232 95 233 /* 96 234 for (auto& inst : mod.insts) {
+10 -5
drivers/common/gir/gir.h
··· 50 50 51 51 GetRootPtr, 52 52 53 + GetLocalInvocationId, 53 54 GetThreadIdX, 54 55 GetThreadIdY, 55 56 GetThreadIdZ, ··· 60 61 BackendIntrinsic, 61 62 }; 62 63 63 - using BackendIntrinsicId = uint32_t; 64 - 65 64 struct Inst { 66 65 Op op; 67 66 Type type; 68 67 std::vector<Value> operands; 69 68 70 69 // only for BackendIntrinsic 71 - BackendIntrinsicId intrinsic_id; 70 + uint32_t intrinsic_id; 72 71 73 72 union { 74 73 int64_t imm_i64; ··· 90 89 insts.push_back(inst); 91 90 return Value{id}; 92 91 } 92 + 93 + Inst &deref(Value v) { 94 + return insts[v.id]; 95 + } 93 96 }; 94 97 95 98 class Builder { ··· 109 112 Value eq(Value a, Value b); 110 113 Value lt(Value a, Value b); 111 114 112 - Value load(Value addr, Value offset); 113 - void store(Value addr, Value data, Value offset); 115 + Value load(Value addr); 116 + void store(Value addr, Value data); 114 117 115 118 Value get_root_ptr(); 116 119 120 + Value get_local_invocation_id(); 117 121 Value get_thread_id_x(); 118 122 Value get_thread_id_y(); 123 + Value get_thread_id_z(); 119 124 120 125 Value get_workgroup_id_x(); 121 126 Value get_workgroup_id_y();
+21 -4
drivers/common/gir/gir_builder.cpp
··· 80 80 }); 81 81 } 82 82 83 - Value Builder::load(Value addr, Value offset) { 83 + Value Builder::load(Value addr) { 84 84 return mod.emit(Inst{ 85 85 .op = Op::Load, 86 86 .type = Type::I32, 87 - .operands = {addr, offset}, 87 + .operands = {addr}, 88 88 }); 89 89 } 90 90 91 - void Builder::store(Value addr, Value data, Value offset) { 91 + void Builder::store(Value addr, Value data) { 92 92 mod.emit(Inst{ 93 93 .op = Op::Store, 94 94 .type = Type::Void, 95 - .operands = {addr, data, offset}, 95 + .operands = {addr, data}, 96 96 }); 97 97 } 98 98 ··· 104 104 }); 105 105 } 106 106 107 + Value Builder::get_local_invocation_id() { 108 + return mod.emit(Inst{ 109 + .op = Op::GetLocalInvocationId, 110 + .type = Type::I32, 111 + .operands = {} 112 + }); 113 + } 114 + 107 115 Value Builder::get_thread_id_x() { 108 116 return mod.emit(Inst{ 109 117 .op = Op::GetThreadIdX, ··· 119 127 .operands = {} 120 128 }); 121 129 } 130 + 131 + Value Builder::get_thread_id_z() { 132 + return mod.emit(Inst{ 133 + .op = Op::GetThreadIdZ, 134 + .type = Type::I32, 135 + .operands = {} 136 + }); 137 + } 138 + 122 139 123 140 Value Builder::get_workgroup_id_x() { 124 141 return mod.emit(Inst{