Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdkfd: Don't call mmput from MMU notifier callback

If the process is exiting, the mmput() inside the MMU notifier callback
(invoked from compaction, fork, or NUMA balancing) could release the last
reference to the mm struct and call exit_mmap() and free_pgtables(), which
triggers a deadlock with the backtrace below.

The deadlock will leak the kfd process, because the MMU notifier release is
never called, and will leak VRAM as a result.

The fix is to take an mm reference with mmget_not_zero() when adding a prange
to the deferred list, pairing with the mmput() in the deferred list work.

If a prange is split and the resulting child is added to the pchild list, the
pchild's work_item.mm is not used, so remove the mm parameter from
svm_range_unmap_split() and svm_range_add_child().

The backtrace of hung task:

INFO: task python:348105 blocked for more than 64512 seconds.
Call Trace:
__schedule+0x1c3/0x550
schedule+0x46/0xb0
rwsem_down_write_slowpath+0x24b/0x4c0
unlink_anon_vmas+0xb1/0x1c0
free_pgtables+0xa9/0x130
exit_mmap+0xbc/0x1a0
mmput+0x5a/0x140
svm_range_cpu_invalidate_pagetables+0x2b/0x40 [amdgpu]
mn_itree_invalidate+0x72/0xc0
__mmu_notifier_invalidate_range_start+0x48/0x60
try_to_unmap_one+0x10fa/0x1400
rmap_walk_anon+0x196/0x460
try_to_unmap+0xbb/0x210
migrate_page_unmap+0x54d/0x7e0
migrate_pages_batch+0x1c3/0xae0
migrate_pages_sync+0x98/0x240
migrate_pages+0x25c/0x520
compact_zone+0x29d/0x590
compact_zone_order+0xb6/0xf0
try_to_compact_pages+0xbe/0x220
__alloc_pages_direct_compact+0x96/0x1a0
__alloc_pages_slowpath+0x410/0x930
__alloc_pages_nodemask+0x3a9/0x3e0
do_huge_pmd_anonymous_page+0xd7/0x3e0
__handle_mm_fault+0x5e3/0x5f0
handle_mm_fault+0xf7/0x2e0
hmm_vma_fault.isra.0+0x4d/0xa0
walk_pmd_range.isra.0+0xa8/0x310
walk_pud_range+0x167/0x240
walk_pgd_range+0x55/0x100
__walk_page_range+0x87/0x90
walk_page_range+0xf6/0x160
hmm_range_fault+0x4f/0x90
amdgpu_hmm_range_get_pages+0x123/0x230 [amdgpu]
amdgpu_ttm_tt_get_user_pages+0xb1/0x150 [amdgpu]
init_user_pages+0xb1/0x2a0 [amdgpu]
amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu+0x543/0x7d0 [amdgpu]
kfd_ioctl_alloc_memory_of_gpu+0x24c/0x4e0 [amdgpu]
kfd_ioctl+0x29d/0x500 [amdgpu]

Fixes: fa582c6f3684 ("drm/amdkfd: Use mmget_not_zero in MMU notifier")
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Philip Yang and committed by
Alex Deucher
a29e067b 51526efe

+20 -23
+20 -23
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
··· 1171 1171 } 1172 1172 1173 1173 static void 1174 - svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, 1175 - struct svm_range *pchild, enum svm_work_list_ops op) 1174 + svm_range_add_child(struct svm_range *prange, struct svm_range *pchild, enum svm_work_list_ops op) 1176 1175 { 1177 1176 pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n", 1178 1177 pchild, pchild->start, pchild->last, prange, op); 1179 1178 1180 - pchild->work_item.mm = mm; 1179 + pchild->work_item.mm = NULL; 1181 1180 pchild->work_item.op = op; 1182 1181 list_add_tail(&pchild->child_list, &prange->child_list); 1183 1182 } ··· 2393 2394 prange->work_item.op != SVM_OP_UNMAP_RANGE) 2394 2395 prange->work_item.op = op; 2395 2396 } else { 2396 - prange->work_item.op = op; 2397 - 2398 - /* Pairs with mmput in deferred_list_work */ 2399 - mmget(mm); 2400 - prange->work_item.mm = mm; 2401 - list_add_tail(&prange->deferred_list, 2402 - &prange->svms->deferred_range_list); 2403 - pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2404 - prange, prange->start, prange->last, op); 2397 + /* Pairs with mmput in deferred_list_work. 2398 + * If process is exiting and mm is gone, don't update mmu notifier. 
2399 + */ 2400 + if (mmget_not_zero(mm)) { 2401 + prange->work_item.mm = mm; 2402 + prange->work_item.op = op; 2403 + list_add_tail(&prange->deferred_list, 2404 + &prange->svms->deferred_range_list); 2405 + pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n", 2406 + prange, prange->start, prange->last, op); 2407 + } 2405 2408 } 2406 2409 spin_unlock(&svms->deferred_list_lock); 2407 2410 } ··· 2417 2416 } 2418 2417 2419 2418 static void 2420 - svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, 2421 - struct svm_range *prange, unsigned long start, 2419 + svm_range_unmap_split(struct svm_range *parent, struct svm_range *prange, unsigned long start, 2422 2420 unsigned long last) 2423 2421 { 2424 2422 struct svm_range *head; ··· 2438 2438 svm_range_split(tail, last + 1, tail->last, &head); 2439 2439 2440 2440 if (head != prange && tail != prange) { 2441 - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2442 - svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE); 2441 + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); 2442 + svm_range_add_child(parent, tail, SVM_OP_ADD_RANGE); 2443 2443 } else if (tail != prange) { 2444 - svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE); 2444 + svm_range_add_child(parent, tail, SVM_OP_UNMAP_RANGE); 2445 2445 } else if (head != prange) { 2446 - svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE); 2446 + svm_range_add_child(parent, head, SVM_OP_UNMAP_RANGE); 2447 2447 } else if (parent != prange) { 2448 2448 prange->work_item.op = SVM_OP_UNMAP_RANGE; 2449 2449 } ··· 2520 2520 l = min(last, pchild->last); 2521 2521 if (l >= s) 2522 2522 svm_range_unmap_from_gpus(pchild, s, l, trigger); 2523 - svm_range_unmap_split(mm, prange, pchild, start, last); 2523 + svm_range_unmap_split(prange, pchild, start, last); 2524 2524 mutex_unlock(&pchild->lock); 2525 2525 } 2526 2526 s = max(start, prange->start); 2527 2527 l = min(last, prange->last); 2528 2528 if (l >= s) 2529 2529 
svm_range_unmap_from_gpus(prange, s, l, trigger); 2530 - svm_range_unmap_split(mm, prange, prange, start, last); 2530 + svm_range_unmap_split(prange, prange, start, last); 2531 2531 2532 2532 if (unmap_parent) 2533 2533 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); ··· 2570 2570 2571 2571 if (range->event == MMU_NOTIFY_RELEASE) 2572 2572 return true; 2573 - if (!mmget_not_zero(mni->mm)) 2574 - return true; 2575 2573 2576 2574 start = mni->interval_tree.start; 2577 2575 last = mni->interval_tree.last; ··· 2596 2598 } 2597 2599 2598 2600 svm_range_unlock(prange); 2599 - mmput(mni->mm); 2600 2601 2601 2602 return true; 2602 2603 }