Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: statically assign gart windows to ttm entities

If multiple entities share the same window we must make sure
that jobs using them are executed sequentially.

This commit gives separate windows to each entity, so jobs
from multiple entities could execute in parallel if needed.
(for now they all use the first sdma engine, so it makes no
difference yet).
The entity stores the gart window offsets to centralize the
"window id" to "window offset" conversion in a single place.

default_entity doesn't get any windows reserved since there is
no use for them.

---
v3:
- renamed gart_window_lock -> lock (Christian)
- added amdgpu_ttm_buffer_entity_init (Christian)
- fixed gart_addr in svm_migrate_gart_map (Felix)
- renamed gart_window_idX -> gart_window_offs[]
- added amdgpu_compute_gart_address
v4:
- u32 -> u64
- added kerneldoc
v5:
- removed gtt_window_lock
- simplified gart window creation and use: entities using a
single window now use window #0 instead of #1
- fix dst_addr calculation in kfd_migrate.c
---

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Pierre-Eric Pelloux-Prayer and committed by
Alex Deucher
2c372557 49fe425d

+72 -31
+3 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
··· 742 742 * translation. Avoid this by doing the invalidation from the SDMA 743 743 * itself at least for GART. 744 744 */ 745 - mutex_lock(&adev->mman.gtt_window_lock); 745 + mutex_lock(&adev->mman.default_entity.lock); 746 746 r = amdgpu_job_alloc_with_ib(ring->adev, &adev->mman.default_entity.base, 747 747 AMDGPU_FENCE_OWNER_UNDEFINED, 748 748 16 * 4, AMDGPU_IB_POOL_IMMEDIATE, ··· 755 755 job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop; 756 756 amdgpu_ring_pad_ib(ring, &job->ibs[0]); 757 757 fence = amdgpu_job_submit(job); 758 - mutex_unlock(&adev->mman.gtt_window_lock); 758 + mutex_unlock(&adev->mman.default_entity.lock); 759 759 760 760 dma_fence_wait(fence, false); 761 761 dma_fence_put(fence); ··· 763 763 return; 764 764 765 765 error_alloc: 766 - mutex_unlock(&adev->mman.gtt_window_lock); 766 + mutex_unlock(&adev->mman.default_entity.lock); 767 767 dev_err(adev->dev, "Error flushing GPU TLB using the SDMA (%d)!\n", r); 768 768 } 769 769
+45 -19
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
··· 228 228 229 229 *size = min(*size, (uint64_t)num_pages * PAGE_SIZE - offset); 230 230 231 - *addr = adev->gmc.gart_start; 232 - *addr += (u64)window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 233 - AMDGPU_GPU_PAGE_SIZE; 231 + *addr = amdgpu_compute_gart_address(&adev->gmc, entity, window); 234 232 *addr += offset; 235 233 236 234 num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8); ··· 246 248 src_addr += job->ibs[0].gpu_addr; 247 249 248 250 dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo); 249 - dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8; 251 + dst_addr += (entity->gart_window_offs[window] >> AMDGPU_GPU_PAGE_SHIFT) * 8; 250 252 amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, 251 253 dst_addr, num_bytes, 0); 252 254 ··· 311 313 amdgpu_res_first(src->mem, src->offset, size, &src_mm); 312 314 amdgpu_res_first(dst->mem, dst->offset, size, &dst_mm); 313 315 314 - mutex_lock(&adev->mman.gtt_window_lock); 316 + mutex_lock(&entity->lock); 315 317 while (src_mm.remaining) { 316 318 uint64_t from, to, cur_size, tiling_flags; 317 319 uint32_t num_type, data_format, max_com, write_compress_disable; ··· 366 368 amdgpu_res_next(&dst_mm, cur_size); 367 369 } 368 370 error: 369 - mutex_unlock(&adev->mman.gtt_window_lock); 371 + mutex_unlock(&entity->lock); 370 372 *f = fence; 371 373 return r; 372 374 } ··· 1578 1580 if (r) 1579 1581 goto out; 1580 1582 1581 - mutex_lock(&adev->mman.gtt_window_lock); 1583 + mutex_lock(&adev->mman.default_entity.lock); 1582 1584 amdgpu_res_first(abo->tbo.resource, offset, len, &src_mm); 1583 1585 src_addr = amdgpu_ttm_domain_start(adev, bo->resource->mem_type) + 1584 1586 src_mm.start; ··· 1590 1592 PAGE_SIZE, 0); 1591 1593 1592 1594 fence = amdgpu_ttm_job_submit(adev, job, num_dw); 1593 - mutex_unlock(&adev->mman.gtt_window_lock); 1595 + mutex_unlock(&adev->mman.default_entity.lock); 1594 1596 1595 1597 if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout)) 1596 1598 r = -ETIMEDOUT; ··· 2011 2013 adev->rmmio_remap.bo = NULL; 
2012 2014 } 2013 2015 2016 + static int amdgpu_ttm_buffer_entity_init(struct amdgpu_ttm_buffer_entity *entity, 2017 + int starting_gart_window, 2018 + u32 num_gart_windows) 2019 + { 2020 + int i; 2021 + 2022 + mutex_init(&entity->lock); 2023 + 2024 + if (ARRAY_SIZE(entity->gart_window_offs) < num_gart_windows) 2025 + return starting_gart_window; 2026 + 2027 + for (i = 0; i < num_gart_windows; i++) { 2028 + entity->gart_window_offs[i] = 2029 + (u64)starting_gart_window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 2030 + AMDGPU_GPU_PAGE_SIZE; 2031 + starting_gart_window++; 2032 + } 2033 + 2034 + return starting_gart_window; 2035 + } 2036 + 2014 2037 /* 2015 2038 * amdgpu_ttm_init - Init the memory management (ttm) as well as various 2016 2039 * gtt/vram related fields. ··· 2045 2026 { 2046 2027 uint64_t gtt_size; 2047 2028 int r; 2048 - 2049 - mutex_init(&adev->mman.gtt_window_lock); 2050 2029 2051 2030 dma_set_max_seg_size(adev->dev, UINT_MAX); 2052 2031 /* No others user of address space so set it to 0 */ ··· 2319 2302 void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable) 2320 2303 { 2321 2304 struct ttm_resource_manager *man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM); 2305 + u32 used_windows; 2322 2306 uint64_t size; 2323 2307 int r; 2324 2308 ··· 2363 2345 drm_sched_entity_destroy(&adev->mman.clear_entity.base); 2364 2346 goto error_free_entity; 2365 2347 } 2348 + 2349 + /* Statically assign GART windows to each entity. 
*/ 2350 + used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.default_entity, 0, 0); 2351 + used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.move_entity, 2352 + used_windows, 2); 2353 + used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.clear_entity, 2354 + used_windows, 1); 2366 2355 } else { 2367 2356 drm_sched_entity_destroy(&adev->mman.default_entity.base); 2368 2357 drm_sched_entity_destroy(&adev->mman.clear_entity.base); ··· 2528 2503 struct dma_fence **fence) 2529 2504 { 2530 2505 struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); 2506 + struct amdgpu_ttm_buffer_entity *entity; 2531 2507 struct amdgpu_res_cursor cursor; 2532 2508 u64 addr; 2533 2509 int r = 0; ··· 2539 2513 if (!fence) 2540 2514 return -EINVAL; 2541 2515 2516 + entity = &adev->mman.clear_entity; 2542 2517 *fence = dma_fence_get_stub(); 2543 2518 2544 2519 amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); 2545 2520 2546 - mutex_lock(&adev->mman.gtt_window_lock); 2521 + mutex_lock(&entity->lock); 2547 2522 while (cursor.remaining) { 2548 2523 struct dma_fence *next = NULL; 2549 2524 u64 size; ··· 2557 2530 /* Never clear more than 256MiB at once to avoid timeouts */ 2558 2531 size = min(cursor.size, 256ULL << 20); 2559 2532 2560 - r = amdgpu_ttm_map_buffer(&adev->mman.clear_entity, 2561 - &bo->tbo, bo->tbo.resource, &cursor, 2562 - 1, false, &size, &addr); 2533 + r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &cursor, 2534 + 0, false, &size, &addr); 2563 2535 if (r) 2564 2536 goto err; 2565 2537 2566 - r = amdgpu_ttm_fill_mem(adev, &adev->mman.clear_entity, 0, addr, size, resv, 2538 + r = amdgpu_ttm_fill_mem(adev, entity, 0, addr, size, resv, 2567 2539 &next, true, 2568 2540 AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); 2569 2541 if (r) ··· 2574 2548 amdgpu_res_next(&cursor, size); 2575 2549 } 2576 2550 err: 2577 - mutex_unlock(&adev->mman.gtt_window_lock); 2551 + mutex_unlock(&entity->lock); 2578 2552 2579 2553 return r; 2580 2554 } ··· 
2599 2573 2600 2574 amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); 2601 2575 2602 - mutex_lock(&adev->mman.gtt_window_lock); 2576 + mutex_lock(&entity->lock); 2603 2577 while (dst.remaining) { 2604 2578 struct dma_fence *next; 2605 2579 uint64_t cur_size, to; ··· 2608 2582 cur_size = min(dst.size, 256ULL << 20); 2609 2583 2610 2584 r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &dst, 2611 - 1, false, &cur_size, &to); 2585 + 0, false, &cur_size, &to); 2612 2586 if (r) 2613 2587 goto error; 2614 2588 ··· 2624 2598 amdgpu_res_next(&dst, cur_size); 2625 2599 } 2626 2600 error: 2627 - mutex_unlock(&adev->mman.gtt_window_lock); 2601 + mutex_unlock(&entity->lock); 2628 2602 if (f) 2629 2603 *f = dma_fence_get(fence); 2630 2604 dma_fence_put(fence);
+18 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
··· 29 29 #include <drm/ttm/ttm_placement.h> 30 30 #include "amdgpu_vram_mgr.h" 31 31 #include "amdgpu_hmm.h" 32 + #include "amdgpu_gmc.h" 32 33 33 34 #define AMDGPU_PL_GDS (TTM_PL_PRIV + 0) 34 35 #define AMDGPU_PL_GWS (TTM_PL_PRIV + 1) ··· 40 39 #define __AMDGPU_PL_NUM (TTM_PL_PRIV + 6) 41 40 42 41 #define AMDGPU_GTT_MAX_TRANSFER_SIZE 512 43 - #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS 2 42 + #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS 3 44 43 45 44 extern const struct attribute_group amdgpu_vram_mgr_attr_group; 46 45 extern const struct attribute_group amdgpu_gtt_mgr_attr_group; ··· 55 54 56 55 struct amdgpu_ttm_buffer_entity { 57 56 struct drm_sched_entity base; 57 + struct mutex lock; 58 + u64 gart_window_offs[2]; 58 59 }; 59 60 60 61 struct amdgpu_mman { ··· 70 67 struct amdgpu_ring *buffer_funcs_ring; 71 68 bool buffer_funcs_enabled; 72 69 73 - struct mutex gtt_window_lock; 74 - 70 + /* @default_entity: for workarounds, has no gart windows */ 75 71 struct amdgpu_ttm_buffer_entity default_entity; 76 72 struct amdgpu_ttm_buffer_entity clear_entity; 77 73 struct amdgpu_ttm_buffer_entity move_entity; ··· 206 204 return -EPERM; 207 205 } 208 206 #endif 207 + 208 + /** 209 + * amdgpu_compute_gart_address() - Returns GART address of an entity's window 210 + * @gmc: The &struct amdgpu_gmc instance to use 211 + * @entity: The &struct amdgpu_ttm_buffer_entity owning the GART window 212 + * @index: The window to use (must be 0 or 1) 213 + */ 214 + static inline u64 amdgpu_compute_gart_address(struct amdgpu_gmc *gmc, 215 + struct amdgpu_ttm_buffer_entity *entity, 216 + int index) 217 + { 218 + return gmc->gart_start + entity->gart_window_offs[index]; 219 + } 209 220 210 221 void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct amdgpu_hmm_range *range); 211 222 int amdgpu_ttm_tt_get_userptr(const struct ttm_buffer_object *tbo,
+6 -6
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
··· 59 59 void *cpu_addr; 60 60 int r; 61 61 62 - /* use gart window 0 */ 63 - *gart_addr = adev->gmc.gart_start; 62 + *gart_addr = amdgpu_compute_gart_address(&adev->gmc, entity, 0); 64 63 65 64 num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8); 66 65 num_bytes = npages * 8 * AMDGPU_GPU_PAGES_IN_CPU_PAGE; ··· 77 78 src_addr += job->ibs[0].gpu_addr; 78 79 79 80 dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo); 81 + dst_addr += (entity->gart_window_offs[0] >> AMDGPU_GPU_PAGE_SHIFT) * 8; 80 82 amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, 81 83 dst_addr, num_bytes, 0); 82 84 ··· 116 116 * multiple GTT_MAX_PAGES transfer, all sdma operations are serialized, wait for 117 117 * the last sdma finish fence which is returned to check copy memory is done. 118 118 * 119 - * Context: Process context, takes and releases gtt_window_lock 119 + * Context: Process context 120 120 * 121 121 * Return: 122 122 * 0 - OK, otherwise error code ··· 136 136 u64 size; 137 137 int r; 138 138 139 - entity = &adev->mman.default_entity; 139 + entity = &adev->mman.move_entity; 140 140 141 - mutex_lock(&adev->mman.gtt_window_lock); 141 + mutex_lock(&entity->lock); 142 142 143 143 while (npages) { 144 144 size = min(GTT_MAX_PAGES, npages); ··· 175 175 } 176 176 177 177 out_unlock: 178 - mutex_unlock(&adev->mman.gtt_window_lock); 178 + mutex_unlock(&entity->lock); 179 179 180 180 return r; 181 181 }