Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amd: Fix MQD and control stack alignment for non-4K CPU page sizes

On gfx v9, due to a hardware bug (see the comments in the code at [1]),
the control stack of a user-mode compute queue must be allocated
immediately after the page boundary of its regular MQD buffer.
To handle this, we allocate an enlarged MQD buffer where the first page
is used as the MQD and the remaining pages store the control stack.
Although these regions share the same BO, they require different memory
types: the MQD must be UC (uncached), while the control stack must be
NC (non-coherent), matching the behavior when the control stack is
allocated in user space.

This logic works correctly on systems where the CPU page size matches
the GPU page size (4K). However, the current implementation aligns both
the MQD and the control stack to the CPU PAGE_SIZE. On systems with a
larger CPU page size, the entire first CPU page is marked UC—even though
that page may contain multiple GPU pages. The GPU treats the second 4K
GPU page inside that CPU page as part of the control stack, but that
GPU page is incorrectly mapped as UC.

This patch fixes the issue by aligning both the MQD and control stack
sizes to the GPU page size (4K). The first 4K page is correctly marked
as UC for the MQD, and the remaining GPU pages are marked NC for the
control stack. This ensures proper memory type assignment on systems
with larger CPU page sizes.

[1]: https://elixir.bootlin.com/linux/v6.18/source/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c#L118

Acked-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Donet Tom and committed by
Alex Deucher
998d6781 b01cd158

+64 -21
+44
drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
··· 404 404 } 405 405 406 406 /** 407 + * amdgpu_gart_map_gfx9_mqd - map mqd and ctrl_stack dma_addresses into GART entries 408 + * 409 + * @adev: amdgpu_device pointer 410 + * @offset: offset into the GPU's gart aperture 411 + * @pages: number of pages to bind 412 + * @dma_addr: DMA addresses of pages 413 + * @flags: page table entry flags 414 + * 415 + * Map the MQD and control stack addresses into GART entries with the correct 416 + * memory types on gfxv9. The MQD occupies the first 4KB and is followed by 417 + * the control stack. The MQD uses UC (uncached) memory, while the control stack 418 + * uses NC (non-coherent) memory. 419 + */ 420 + void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, 421 + int pages, dma_addr_t *dma_addr, uint64_t flags) 422 + { 423 + uint64_t page_base; 424 + unsigned int i, j, t; 425 + int idx; 426 + uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); 427 + void *dst; 428 + 429 + if (!adev->gart.ptr) 430 + return; 431 + 432 + if (!drm_dev_enter(adev_to_drm(adev), &idx)) 433 + return; 434 + 435 + t = offset / AMDGPU_GPU_PAGE_SIZE; 436 + dst = adev->gart.ptr; 437 + for (i = 0; i < pages; i++) { 438 + page_base = dma_addr[i]; 439 + for (j = 0; j < AMDGPU_GPU_PAGES_IN_CPU_PAGE; j++, t++) { 440 + if ((i == 0) && (j == 0)) 441 + amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, flags); 442 + else 443 + amdgpu_gmc_set_pte_pde(adev, dst, t, page_base, ctrl_flags); 444 + page_base += AMDGPU_GPU_PAGE_SIZE; 445 + } 446 + } 447 + drm_dev_exit(idx); 448 + } 449 + 450 + /** 407 451 * amdgpu_gart_bind - bind pages into the gart page table 408 452 * 409 453 * @adev: amdgpu_device pointer
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
··· 62 62 void amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset, 63 63 int pages, dma_addr_t *dma_addr, uint64_t flags, 64 64 void *dst); 65 + void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset, 66 + int pages, dma_addr_t *dma_addr, uint64_t flags); 65 67 void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset, 66 68 int pages, dma_addr_t *dma_addr, uint64_t flags); 67 69 void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa,
+3 -13
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
··· 854 854 int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp); 855 855 uint64_t page_idx, pages_per_xcc; 856 856 int i; 857 - uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC); 858 857 859 858 pages_per_xcc = total_pages; 860 859 do_div(pages_per_xcc, num_xcc); 861 860 862 861 for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) { 863 - /* MQD page: use default flags */ 864 - amdgpu_gart_bind(adev, 862 + amdgpu_gart_map_gfx9_mqd(adev, 865 863 gtt->offset + (page_idx << PAGE_SHIFT), 866 - 1, &gtt->ttm.dma_address[page_idx], flags); 867 - /* 868 - * Ctrl pages - modify the memory type to NC (ctrl_flags) from 869 - * the second page of the BO onward. 870 - */ 871 - amdgpu_gart_bind(adev, 872 - gtt->offset + ((page_idx + 1) << PAGE_SHIFT), 873 - pages_per_xcc - 1, 874 - &gtt->ttm.dma_address[page_idx + 1], 875 - ctrl_flags); 864 + pages_per_xcc, &gtt->ttm.dma_address[page_idx], 865 + flags); 876 866 } 877 867 } 878 868
+15 -8
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
··· 42 42 struct queue_properties *q) 43 43 { 44 44 if (mm->dev->kfd->cwsr_enabled && 45 - q->type == KFD_QUEUE_TYPE_COMPUTE) 46 - return ALIGN(q->ctl_stack_size, PAGE_SIZE) + 47 - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE); 45 + q->type == KFD_QUEUE_TYPE_COMPUTE) { 46 + 47 + /* On gfxv9, the MQD resides in the first 4K page, 48 + * followed by the control stack. Align both to 49 + * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary. 50 + */ 51 + 52 + return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + 53 + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE); 54 + } 48 55 49 56 return mm->mqd_size; 50 57 } ··· 157 150 if (!mqd_mem_obj) 158 151 return NULL; 159 152 retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev, 160 - (ALIGN(q->ctl_stack_size, PAGE_SIZE) + 161 - ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) * 153 + (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) + 154 + ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), PAGE_SIZE)) * 162 155 NUM_XCC(node->xcc_mask), 163 156 mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM : 164 157 AMDGPU_GEM_DOMAIN_GTT, ··· 366 359 struct kfd_context_save_area_header header; 367 360 368 361 /* Control stack is located one page after MQD. */ 369 - void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); 362 + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); 370 363 371 364 m = get_mqd(mqd); 372 365 ··· 406 399 { 407 400 struct v9_mqd *m; 408 401 /* Control stack is located one page after MQD. */ 409 - void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); 402 + void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE); 410 403 411 404 m = get_mqd(mqd); 412 405 ··· 452 445 *gart_addr = addr; 453 446 454 447 /* Control stack is located one page after MQD. 
*/ 455 - ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE); 448 + ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE); 456 449 memcpy(ctl_stack, ctl_stack_src, ctl_stack_size); 457 450 458 451 m->cp_hqd_pq_doorbell_control =