Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/xe/wa: Steer RMW of MCR registers while building default LRC

When generating the default LRC, if a register is not masked, we apply
any necessary save-restore programming via a read-modify-write sequence
that updates only the relevant bits/fields without clobbering the rest
of the register. However, some of the registers that need to be updated
may be MCR registers, which require steering to a non-terminated
instance to ensure we can read back a valid, non-zero value. Steering
of reads originating from a command streamer is controlled by the
CS_MMIO_GROUP_INSTANCE_SELECT register. Emit additional MI_LRI commands
to update the steering before any RMW of an MCR register so that the
reads are performed properly.
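
As a minimal sketch of the sequence this describes (the helper name
emit_mcr_steering() is hypothetical and not part of the patch; the
registers, fields, and xe_gt_mcr_get_nonterminated_steering() are the
ones used in the diff below):

/*
 * Sketch only: emit the MI_LRI that steers CS-originated reads of an
 * MCR register to a non-terminated group/instance before its RMW.
 */
static u32 *emit_mcr_steering(struct xe_gt *gt, struct xe_hw_engine *hwe,
			      struct xe_reg_mcr reg, u32 *cs)
{
	u8 group, instance;

	/* Find a group/instance that is guaranteed not to be terminated */
	xe_gt_mcr_get_nonterminated_steering(gt, reg, &group, &instance);

	/* CS reads honor CS_MMIO_GROUP_INSTANCE_SELECT; program it via LRI */
	*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
	*cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(hwe->mmio_base).addr;
	*cs++ = SELECTIVE_READ_ADDRESSING |
		REG_FIELD_PREP(SELECTIVE_READ_GROUP, group) |
		REG_FIELD_PREP(SELECTIVE_READ_INSTANCE, instance);

	return cs;
}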

Note that needing to perform an RMW of an MCR register while building
the default LRC is pretty rare. Most of the MCR registers that are
part of an engine's LRC are also masked registers, so no RMW (and
hence no steering) is necessary.

Fixes: f2f90989ccff ("drm/xe: Avoid reading RMW registers in emit_wa_job")
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Balasubramani Vivekanandan <balasubramani.vivekanandan@intel.com>
Link: https://patch.msgid.link/20260206223058.387014-2-matthew.d.roper@intel.com
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>

2 files changed, +60 -12
drivers/gpu/drm/xe/regs/xe_engine_regs.h (+6 -0)
···
 #define ENABLE_SEMAPHORE_POLL_BIT		REG_BIT(13)
 
 #define RING_CMD_CCTL(base)			XE_REG((base) + 0xc4, XE_REG_OPTION_MASKED)
+
+#define CS_MMIO_GROUP_INSTANCE_SELECT(base)	XE_REG((base) + 0xcc)
+#define   SELECTIVE_READ_ADDRESSING		REG_BIT(30)
+#define   SELECTIVE_READ_GROUP			REG_GENMASK(29, 23)
+#define   SELECTIVE_READ_INSTANCE		REG_GENMASK(22, 16)
+
 /*
  * CMD_CCTL read/write fields take a MOCS value and _not_ a table index.
  * The lsb of each can be considered a separate enabling bit for encryption.
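
As a quick check of the field layout just added, steering CS reads to,
say, group 1 / instance 0 (arbitrary example values) packs as:

	u32 steer = SELECTIVE_READ_ADDRESSING |			/* bit 30 */
		    REG_FIELD_PREP(SELECTIVE_READ_GROUP, 1) |	/* bits 29:23 */
		    REG_FIELD_PREP(SELECTIVE_READ_INSTANCE, 0);	/* bits 22:16 */
	/* steer == 0x40800000 */
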
drivers/gpu/drm/xe/xe_gt.c (+54 -12)
···
 	return ret;
 }
 
+/* Dwords required to emit a RMW of a register */
+#define EMIT_RMW_DW 20
+
 static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q)
 {
-	struct xe_reg_sr *sr = &q->hwe->reg_lrc;
+	struct xe_hw_engine *hwe = q->hwe;
+	struct xe_reg_sr *sr = &hwe->reg_lrc;
 	struct xe_reg_sr_entry *entry;
-	int count_rmw = 0, count = 0, ret;
+	int count_rmw = 0, count_rmw_mcr = 0, count = 0, ret;
 	unsigned long idx;
 	struct xe_bb *bb;
 	size_t bb_len = 0;
···
 	xa_for_each(&sr->xa, idx, entry) {
 		if (entry->reg.masked || entry->clr_bits == ~0)
 			++count;
+		else if (entry->reg.mcr)
+			++count_rmw_mcr;
 		else
 			++count_rmw;
 	}
···
 	if (count)
 		bb_len += count * 2 + 1;
 
-	if (count_rmw)
-		bb_len += count_rmw * 20 + 7;
+	/*
+	 * RMW of MCR registers is the same as a normal RMW, except an
+	 * additional LRI (3 dwords) is required per register to steer the
+	 * read to a non-terminated instance.
+	 *
+	 * We could probably shorten the batch slightly by eliding the
+	 * steering for consecutive MCR registers that have the same
+	 * group/instance target, but it's not worth the extra complexity
+	 * to do so.
+	 */
+	bb_len += count_rmw * EMIT_RMW_DW;
+	bb_len += count_rmw_mcr * (EMIT_RMW_DW + 3);
 
-	if (q->hwe->class == XE_ENGINE_CLASS_RENDER)
+	/*
+	 * After doing all RMW, we need 7 trailing dwords to clean up,
+	 * plus an additional 3 dwords to reset steering if any of the
+	 * registers were MCR.
+	 */
+	if (count_rmw || count_rmw_mcr)
+		bb_len += 7 + (count_rmw_mcr ? 3 : 0);
+
+	if (hwe->class == XE_ENGINE_CLASS_RENDER)
 		/*
 		 * Big enough to emit all of the context's 3DSTATE via
 		 * xe_lrc_emit_hwe_state_instructions()
 		 */
-		bb_len += xe_gt_lrc_size(gt, q->hwe->class) / sizeof(u32);
+		bb_len += xe_gt_lrc_size(gt, hwe->class) / sizeof(u32);
 
-	xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", q->hwe->name, bb_len);
+	xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", hwe->name, bb_len);
 
 	bb = xe_bb_new(gt, bb_len, false);
 	if (IS_ERR(bb))
···
 		}
 	}
 
-	if (count_rmw) {
-		/* Emit MI_MATH for each RMW reg: 20dw per reg + 7 trailing dw */
-
+	if (count_rmw || count_rmw_mcr) {
 		xa_for_each(&sr->xa, idx, entry) {
 			if (entry->reg.masked || entry->clr_bits == ~0)
 				continue;
+
+			if (entry->reg.mcr) {
+				struct xe_reg_mcr reg = { .__reg.raw = entry->reg.raw };
+				u8 group, instance;
+
+				xe_gt_mcr_get_nonterminated_steering(gt, reg, &group, &instance);
+				*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
+				*cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(hwe->mmio_base).addr;
+				*cs++ = SELECTIVE_READ_ADDRESSING |
+					REG_FIELD_PREP(SELECTIVE_READ_GROUP, group) |
+					REG_FIELD_PREP(SELECTIVE_READ_INSTANCE, instance);
+			}
 
 			*cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO;
 			*cs++ = entry->reg.addr;
···
 			*cs++ = CS_GPR_REG(0, 0).addr;
 			*cs++ = entry->reg.addr;
 
-			xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n",
-				  entry->reg.addr, entry->clr_bits, entry->set_bits);
+			xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x%s\n",
+				  entry->reg.addr, entry->clr_bits, entry->set_bits,
+				  entry->reg.mcr ? " (MCR)" : "");
 		}
 
 		/* reset used GPR */
···
 		*cs++ = 0;
 		*cs++ = CS_GPR_REG(0, 2).addr;
 		*cs++ = 0;
+
+		/* reset steering */
+		if (count_rmw_mcr) {
+			*cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
+			*cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(q->hwe->mmio_base).addr;
+			*cs++ = 0;
+		}
 	}
 
 	cs = xe_lrc_emit_hwe_state_instructions(q, cs);