Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drm/xe/vf: Use drm mm instead of drm sa for CCS read/write

The suballocator algorithm tracks a hole cursor at the last allocation
and tries to allocate after it. This is optimized for fence-ordered
progress, where older allocations are expected to become reusable first.

In fence-enabled mode, that ordering assumption holds. In fence-disabled
mode, allocations may be freed in arbitrary order, so limiting allocation
to the current hole window can miss valid free space and fail allocations
despite sufficient total space.
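
The failure mode can be pictured with a small toy model; the struct and helper below are invented for illustration and are not the drm_suballoc implementation:

#include <linux/errno.h>

/*
 * Toy model (illustrative only, not the drm_suballoc code): the pool is
 * tracked with just a hole cursor and the start of the oldest live
 * allocation. Blocks freed out of order between 'tail' and 'cursor' are
 * not represented, so their space cannot be reused until the oldest
 * allocations are released in order and 'tail' advances.
 */
struct toy_cursor_pool {
	unsigned int size;	/* total pool size in bytes */
	unsigned int cursor;	/* hole cursor: end of the last allocation */
	unsigned int tail;	/* start of the oldest still-live allocation */
};

/* Returns the offset of the new allocation, or -ENOSPC on failure. */
static int toy_cursor_alloc(struct toy_cursor_pool *p, unsigned int len)
{
	/* Only the hole right after the cursor is ever considered. */
	unsigned int limit = p->cursor >= p->tail ? p->size : p->tail;

	if (p->cursor + len > limit)
		return -ENOSPC;	/* fails despite freed space behind the cursor */

	p->cursor += len;
	return p->cursor - len;
}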

Use the DRM memory manager instead of the sub-allocator to avoid this
issue, as CCS read/write operations do not use fences.
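
For contrast, here is a minimal sketch of the allocation pattern the DRM memory manager enables, assuming the new xe_mem_pool is backed by a struct drm_mm (the example_* helpers are illustrative, not part of this patch): drm_mm tracks every hole, so an allocation succeeds whenever any hole is large enough, independent of the order in which earlier nodes were freed.

#include <drm/drm_mm.h>

/* Allocate from a drm_mm-backed pool: all holes are searched for a fit. */
static int example_pool_alloc(struct drm_mm *mm, struct drm_mm_node *node,
			      u64 size)
{
	return drm_mm_insert_node(mm, node, size);
}

/* Free in any order: the node's range simply becomes a hole again. */
static void example_pool_free(struct drm_mm_node *node)
{
	drm_mm_remove_node(node);
}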

Fixes: 864690cf4dd6 ("drm/xe/vf: Attach and detach CCS copy commands with BO")
Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Maarten Lankhorst <dev@lankhorst.se>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20260408110145.1639937-6-satyanarayana.k.v.p@intel.com
(cherry picked from commit 6c84b493012aeb05dec29c709377bf0e17ac6815)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

Authored by Satyanarayana K V P, committed by Rodrigo Vivi
1460eae7 36c6bac1

+63 -55
+2 -1
drivers/gpu/drm/xe/xe_bo_types.h
···
 #include "xe_ggtt_types.h"
 
 struct xe_device;
+struct xe_mem_pool_node;
 struct xe_vm;
 
 #define XE_BO_MAX_PLACEMENTS 3
···
 	bool ccs_cleared;
 
 	/** @bb_ccs: BB instructions of CCS read/write. Valid only for VF */
-	struct xe_bb *bb_ccs[XE_SRIOV_VF_CCS_CTX_COUNT];
+	struct xe_mem_pool_node *bb_ccs[XE_SRIOV_VF_CCS_CTX_COUNT];
 
 	/**
 	 * @cpu_caching: CPU caching mode. Currently only used for userspace
+31 -25
drivers/gpu/drm/xe/xe_migrate.c
···
 #include "xe_hw_engine.h"
 #include "xe_lrc.h"
 #include "xe_map.h"
+#include "xe_mem_pool.h"
 #include "xe_mocs.h"
 #include "xe_printk.h"
 #include "xe_pt.h"
···
 	u32 batch_size, batch_size_allocated;
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_res_cursor src_it, ccs_it;
+	struct xe_mem_pool *bb_pool;
 	struct xe_sriov_vf_ccs_ctx *ctx;
-	struct xe_sa_manager *bb_pool;
 	u64 size = xe_bo_size(src_bo);
-	struct xe_bb *bb = NULL;
+	struct xe_mem_pool_node *bb;
 	u64 src_L0, src_L0_ofs;
+	struct xe_bb xe_bb_tmp;
 	u32 src_L0_pt;
 	int err;
 
···
 		size -= src_L0;
 	}
 
-	bb = xe_bb_alloc(gt);
+	bb = xe_mem_pool_alloc_node();
 	if (IS_ERR(bb))
 		return PTR_ERR(bb);
 
 	bb_pool = ctx->mem.ccs_bb_pool;
-	scoped_guard(mutex, xe_sa_bo_swap_guard(bb_pool)) {
-		xe_sa_bo_swap_shadow(bb_pool);
+	scoped_guard(mutex, xe_mem_pool_bo_swap_guard(bb_pool)) {
+		xe_mem_pool_swap_shadow_locked(bb_pool);
 
-		err = xe_bb_init(bb, bb_pool, batch_size);
+		err = xe_mem_pool_insert_node(bb_pool, bb, batch_size * sizeof(u32));
 		if (err) {
 			xe_gt_err(gt, "BB allocation failed.\n");
-			xe_bb_free(bb, NULL);
+			kfree(bb);
 			return err;
 		}
 
···
 		size = xe_bo_size(src_bo);
 		batch_size = 0;
 
+		xe_bb_tmp = (struct xe_bb){ .cs = xe_mem_pool_node_cpu_addr(bb), .len = 0 };
 		/*
 		 * Emit PTE and copy commands here.
 		 * The CCS copy command can only support limited size. If the size to be
···
 			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
 			batch_size += EMIT_COPY_CCS_DW;
 
-			emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);
+			emit_pte(m, &xe_bb_tmp, src_L0_pt, false, true, &src_it, src_L0, src);
 
-			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
+			emit_pte(m, &xe_bb_tmp, ccs_pt, false, false, &ccs_it, ccs_size, src);
 
-			bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
-			flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
+			xe_bb_tmp.len = emit_flush_invalidate(xe_bb_tmp.cs, xe_bb_tmp.len,
+							      flush_flags);
+			flush_flags = xe_migrate_ccs_copy(m, &xe_bb_tmp, src_L0_ofs, src_is_pltt,
 							  src_L0_ofs, dst_is_pltt,
 							  src_L0, ccs_ofs, true);
-			bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
+			xe_bb_tmp.len = emit_flush_invalidate(xe_bb_tmp.cs, xe_bb_tmp.len,
+							      flush_flags);
 
 			size -= src_L0;
 		}
 
-		xe_assert(xe, (batch_size_allocated == bb->len));
+		xe_assert(xe, (batch_size_allocated == xe_bb_tmp.len));
+		xe_assert(xe, bb->sa_node.size == xe_bb_tmp.len * sizeof(u32));
 		src_bo->bb_ccs[read_write] = bb;
 
 		xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
-		xe_sa_bo_sync_shadow(bb->bo);
+		xe_mem_pool_sync_shadow_locked(bb);
 	}
 
 	return 0;
···
 void xe_migrate_ccs_rw_copy_clear(struct xe_bo *src_bo,
 				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
 {
-	struct xe_bb *bb = src_bo->bb_ccs[read_write];
+	struct xe_mem_pool_node *bb = src_bo->bb_ccs[read_write];
 	struct xe_device *xe = xe_bo_device(src_bo);
+	struct xe_mem_pool *bb_pool;
 	struct xe_sriov_vf_ccs_ctx *ctx;
-	struct xe_sa_manager *bb_pool;
 	u32 *cs;
 
 	xe_assert(xe, IS_SRIOV_VF(xe));
···
 	ctx = &xe->sriov.vf.ccs.contexts[read_write];
 	bb_pool = ctx->mem.ccs_bb_pool;
 
-	guard(mutex)(xe_sa_bo_swap_guard(bb_pool));
-	xe_sa_bo_swap_shadow(bb_pool);
+	scoped_guard(mutex, xe_mem_pool_bo_swap_guard(bb_pool)) {
+		xe_mem_pool_swap_shadow_locked(bb_pool);
 
-	cs = xe_sa_bo_cpu_addr(bb->bo);
-	memset(cs, MI_NOOP, bb->len * sizeof(u32));
-	xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
+		cs = xe_mem_pool_node_cpu_addr(bb);
+		memset(cs, MI_NOOP, bb->sa_node.size);
+		xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
 
-	xe_sa_bo_sync_shadow(bb->bo);
-
-	xe_bb_free(bb, NULL);
-	src_bo->bb_ccs[read_write] = NULL;
+		xe_mem_pool_sync_shadow_locked(bb);
+		xe_mem_pool_free_node(bb);
+		src_bo->bb_ccs[read_write] = NULL;
+	}
 }
 
 /**
+29 -25
drivers/gpu/drm/xe/xe_sriov_vf_ccs.c
···
 #include "xe_guc.h"
 #include "xe_guc_submit.h"
 #include "xe_lrc.h"
+#include "xe_mem_pool.h"
 #include "xe_migrate.h"
 #include "xe_pm.h"
-#include "xe_sa.h"
 #include "xe_sriov_printk.h"
 #include "xe_sriov_vf.h"
 #include "xe_sriov_vf_ccs.h"
···
 
 static int alloc_bb_pool(struct xe_tile *tile, struct xe_sriov_vf_ccs_ctx *ctx)
 {
+	struct xe_mem_pool *pool;
 	struct xe_device *xe = tile_to_xe(tile);
-	struct xe_sa_manager *sa_manager;
+	u32 *pool_cpu_addr, *last_dw_addr;
 	u64 bb_pool_size;
-	int offset, err;
+	int err;
 
 	bb_pool_size = get_ccs_bb_pool_size(xe);
 	xe_sriov_info(xe, "Allocating %s CCS BB pool size = %lldMB\n",
 		      ctx->ctx_id ? "Restore" : "Save", bb_pool_size / SZ_1M);
 
-	sa_manager = __xe_sa_bo_manager_init(tile, bb_pool_size, SZ_4K, SZ_16,
-					     XE_SA_BO_MANAGER_FLAG_SHADOW);
-
-	if (IS_ERR(sa_manager)) {
-		xe_sriov_err(xe, "Suballocator init failed with error: %pe\n",
-			     sa_manager);
-		err = PTR_ERR(sa_manager);
+	pool = xe_mem_pool_init(tile, bb_pool_size, sizeof(u32),
+				XE_MEM_POOL_BO_FLAG_INIT_SHADOW_COPY);
+	if (IS_ERR(pool)) {
+		xe_sriov_err(xe, "xe_mem_pool_init failed with error: %pe\n",
+			     pool);
+		err = PTR_ERR(pool);
 		return err;
 	}
 
-	offset = 0;
-	xe_map_memset(xe, &sa_manager->bo->vmap, offset, MI_NOOP,
-		      bb_pool_size);
-	xe_map_memset(xe, &sa_manager->shadow->vmap, offset, MI_NOOP,
-		      bb_pool_size);
+	pool_cpu_addr = xe_mem_pool_cpu_addr(pool);
+	memset(pool_cpu_addr, 0, bb_pool_size);
 
-	offset = bb_pool_size - sizeof(u32);
-	xe_map_wr(xe, &sa_manager->bo->vmap, offset, u32, MI_BATCH_BUFFER_END);
-	xe_map_wr(xe, &sa_manager->shadow->vmap, offset, u32, MI_BATCH_BUFFER_END);
+	last_dw_addr = pool_cpu_addr + (bb_pool_size / sizeof(u32)) - 1;
+	*last_dw_addr = MI_BATCH_BUFFER_END;
 
-	ctx->mem.ccs_bb_pool = sa_manager;
+	/**
+	 * Sync the main copy and shadow copy so that the shadow copy is
+	 * replica of main copy. We sync only BBs after init part. So, we
+	 * need to make sure the main pool and shadow copy are in sync after
+	 * this point. This is needed as GuC may read the BB commands from
+	 * shadow copy.
+	 */
+	xe_mem_pool_sync(pool);
 
+	ctx->mem.ccs_bb_pool = pool;
 	return 0;
 }
 
 static void ccs_rw_update_ring(struct xe_sriov_vf_ccs_ctx *ctx)
 {
-	u64 addr = xe_sa_manager_gpu_addr(ctx->mem.ccs_bb_pool);
+	u64 addr = xe_mem_pool_gpu_addr(ctx->mem.ccs_bb_pool);
 	struct xe_lrc *lrc = xe_exec_queue_lrc(ctx->mig_q);
 	u32 dw[10], i = 0;
 
···
 #define XE_SRIOV_VF_CCS_RW_BB_ADDR_OFFSET (2 * sizeof(u32))
 void xe_sriov_vf_ccs_rw_update_bb_addr(struct xe_sriov_vf_ccs_ctx *ctx)
 {
-	u64 addr = xe_sa_manager_gpu_addr(ctx->mem.ccs_bb_pool);
+	u64 addr = xe_mem_pool_gpu_addr(ctx->mem.ccs_bb_pool);
 	struct xe_lrc *lrc = xe_exec_queue_lrc(ctx->mig_q);
 	struct xe_device *xe = gt_to_xe(ctx->mig_q->gt);
 
···
 	struct xe_device *xe = xe_bo_device(bo);
 	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
 	struct xe_sriov_vf_ccs_ctx *ctx;
+	struct xe_mem_pool_node *bb;
 	struct xe_tile *tile;
-	struct xe_bb *bb;
 	int err = 0;
 
 	xe_assert(xe, IS_VF_CCS_READY(xe));
···
 {
 	struct xe_device *xe = xe_bo_device(bo);
 	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
-	struct xe_bb *bb;
+	struct xe_mem_pool_node *bb;
 
 	xe_assert(xe, IS_VF_CCS_READY(xe));
···
  */
 void xe_sriov_vf_ccs_print(struct xe_device *xe, struct drm_printer *p)
 {
-	struct xe_sa_manager *bb_pool;
 	enum xe_sriov_vf_ccs_rw_ctxs ctx_id;
+	struct xe_mem_pool *bb_pool;
 
 	if (!IS_VF_CCS_READY(xe))
 		return;
···
 
 		drm_printf(p, "ccs %s bb suballoc info\n", ctx_id ? "write" : "read");
 		drm_printf(p, "-------------------------\n");
-		drm_suballoc_dump_debug_info(&bb_pool->base, p, xe_sa_manager_gpu_addr(bb_pool));
+		xe_mem_pool_dump(bb_pool, p);
 		drm_puts(p, "\n");
 	}
 }
+1 -4
drivers/gpu/drm/xe/xe_sriov_vf_ccs_types.h
···
 	XE_SRIOV_VF_CCS_CTX_COUNT
 };
 
-struct xe_migrate;
-struct xe_sa_manager;
-
 /**
  * struct xe_sriov_vf_ccs_ctx - VF CCS migration context data.
  */
···
 	/** @mem: memory data */
 	struct {
 		/** @mem.ccs_bb_pool: Pool from which batch buffers are allocated. */
-		struct xe_sa_manager *ccs_bb_pool;
+		struct xe_mem_pool *ccs_bb_pool;
 	} mem;
 };