mm/userfaultfd: don't lock anon_vma when performing UFFDIO_MOVE

Now that rmap_walk() is guaranteed to be called with the folio lock held,
we can stop serializing on the src VMA anon_vma lock when moving an
exclusive folio from a src VMA to a dst VMA in UFFDIO_MOVE ioctl.

When moving a folio, we modify folio->mapping through
folio_move_anon_rmap() and adjust folio->index accordingly. Doing that
while rmap walks could run concurrently would be dangerous. To avoid
that, we had to acquire the anon_vma lock of the src VMA in write mode.
As a result, multiple threads calling UFFDIO_MOVE concurrently on
distinct pages of the same src VMA would serialize on that lock, hurting
scalability.

In addition to avoiding the scalability bottleneck, this patch also
simplifies the complicated lock dance that UFFDIO_MOVE has to go through
between RCU, folio-lock, ptl, and anon_vma.

folio_move_anon_rmap() already enforces that the folio is locked. With the
folio locked, we can no longer race with a concurrent rmap_walk() from
folio_referenced() and the other callers that used to invoke it on
unlocked non-KSM anon folios, so the anon_vma lock is no longer required.

Note that this handling is now the same as for other
folio_move_anon_rmap() users that also do not hold the anon_vma lock --
namely COW reuse handling (do_wp_page()->wp_can_reuse_anon_folio(),
do_huge_pmd_wp_page(), and hugetlb_wp()). These users never required the
anon_vma lock because they only move the folio's anon_vma closer to the
anon_vma leaf of the VMA, for example, from an anon_vma root to a leaf of
that root. rmap walks were always able to tolerate that scenario.

Link: https://lkml.kernel.org/r/20250923071019.775806-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lokesh Gidra and committed by

Andrew Morton 7 months ago cc22b997 95b34d66

+12 -72

2 changed files

expand all

huge_memory.c

userfaultfd.c

+1 -21

mm/huge_memory.c

··· 2534 2534 pmd_t _dst_pmd, src_pmdval; 2535 2535 struct page *src_page; 2536 2536 struct folio *src_folio; 2537 - struct anon_vma *src_anon_vma; 2538 2537 spinlock_t *src_ptl, *dst_ptl; 2539 2538 pgtable_t src_pgtable; 2540 2539 struct mmu_notifier_range range; ··· 2582 2583 src_addr + HPAGE_PMD_SIZE); 2583 2584 mmu_notifier_invalidate_range_start(&range); 2584 2585 2585 - if (src_folio) { 2586 + if (src_folio) 2586 2587 folio_lock(src_folio); 2587 - 2588 - /* 2589 - * split_huge_page walks the anon_vma chain without the page 2590 - * lock. Serialize against it with the anon_vma lock, the page 2591 - * lock is not enough. 2592 - */ 2593 - src_anon_vma = folio_get_anon_vma(src_folio); 2594 - if (!src_anon_vma) { 2595 - err = -EAGAIN; 2596 - goto unlock_folio; 2597 - } 2598 - anon_vma_lock_write(src_anon_vma); 2599 - } else 2600 - src_anon_vma = NULL; 2601 2588 2602 2589 dst_ptl = pmd_lockptr(mm, dst_pmd); 2603 2590 double_pt_lock(src_ptl, dst_ptl); ··· 2629 2644 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2630 2645 unlock_ptls: 2631 2646 double_pt_unlock(src_ptl, dst_ptl); 2632 - if (src_anon_vma) { 2633 - anon_vma_unlock_write(src_anon_vma); 2634 - put_anon_vma(src_anon_vma); 2635 - } 2636 - unlock_folio: 2637 2647 /* unblock rmap walks */ 2638 2648 if (src_folio) 2639 2649 folio_unlock(src_folio);

+11 -51

mm/userfaultfd.c

··· 1035 1035 */ 1036 1036 static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, 1037 1037 unsigned long src_addr, 1038 - pte_t *src_pte, pte_t *dst_pte, 1039 - struct anon_vma *src_anon_vma) 1038 + pte_t *src_pte, pte_t *dst_pte) 1040 1039 { 1041 1040 pte_t orig_dst_pte, orig_src_pte; 1042 1041 struct folio *folio; ··· 1051 1052 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1052 1053 if (!folio || !folio_trylock(folio)) 1053 1054 return NULL; 1054 - if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || 1055 - folio_anon_vma(folio) != src_anon_vma) { 1055 + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { 1056 1056 folio_unlock(folio); 1057 1057 return NULL; 1058 1058 } ··· 1059 1061 } 1060 1062 1061 1063 /* 1062 - * Moves src folios to dst in a batch as long as they share the same 1063 - * anon_vma as the first folio, are not large, and can successfully 1064 - * take the lock via folio_trylock(). 1064 + * Moves src folios to dst in a batch as long as they are not large, and can 1065 + * successfully take the lock via folio_trylock(). 
1065 1066 */ 1066 1067 static long move_present_ptes(struct mm_struct *mm, 1067 1068 struct vm_area_struct *dst_vma, ··· 1070 1073 pte_t orig_dst_pte, pte_t orig_src_pte, 1071 1074 pmd_t *dst_pmd, pmd_t dst_pmdval, 1072 1075 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1073 - struct folio **first_src_folio, unsigned long len, 1074 - struct anon_vma *src_anon_vma) 1076 + struct folio **first_src_folio, unsigned long len) 1075 1077 { 1076 1078 int err = 0; 1077 1079 struct folio *src_folio = *first_src_folio; ··· 1128 1132 src_pte++; 1129 1133 1130 1134 folio_unlock(src_folio); 1131 - src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, 1132 - dst_pte, src_anon_vma); 1135 + src_folio = check_ptes_for_batched_move(src_vma, src_addr, 1136 + src_pte, dst_pte); 1133 1137 if (!src_folio) 1134 1138 break; 1135 1139 } ··· 1259 1263 pmd_t dummy_pmdval; 1260 1264 pmd_t dst_pmdval; 1261 1265 struct folio *src_folio = NULL; 1262 - struct anon_vma *src_anon_vma = NULL; 1263 1266 struct mmu_notifier_range range; 1264 1267 long ret = 0; 1265 1268 ··· 1342 1347 } 1343 1348 1344 1349 /* 1345 - * Pin and lock both source folio and anon_vma. Since we are in 1346 - * RCU read section, we can't block, so on contention have to 1347 - * unmap the ptes, obtain the lock and retry. 1350 + * Pin and lock source folio. Since we are in RCU read section, 1351 + * we can't block, so on contention have to unmap the ptes, 1352 + * obtain the lock and retry. 1348 1353 */ 1349 1354 if (!src_folio) { 1350 1355 struct folio *folio; ··· 1418 1423 goto retry; 1419 1424 } 1420 1425 1421 - if (!src_anon_vma) { 1422 - /* 1423 - * folio_referenced walks the anon_vma chain 1424 - * without the folio lock. Serialize against it with 1425 - * the anon_vma lock, the folio lock is not enough. 
1426 - */ 1427 - src_anon_vma = folio_get_anon_vma(src_folio); 1428 - if (!src_anon_vma) { 1429 - /* page was unmapped from under us */ 1430 - ret = -EAGAIN; 1431 - goto out; 1432 - } 1433 - if (!anon_vma_trylock_write(src_anon_vma)) { 1434 - pte_unmap(src_pte); 1435 - pte_unmap(dst_pte); 1436 - src_pte = dst_pte = NULL; 1437 - /* now we can block and wait */ 1438 - anon_vma_lock_write(src_anon_vma); 1439 - goto retry; 1440 - } 1441 - } 1442 - 1443 1426 ret = move_present_ptes(mm, dst_vma, src_vma, 1444 1427 dst_addr, src_addr, dst_pte, src_pte, 1445 1428 orig_dst_pte, orig_src_pte, dst_pmd, 1446 1429 dst_pmdval, dst_ptl, src_ptl, &src_folio, 1447 - len, src_anon_vma); 1430 + len); 1448 1431 } else { 1449 1432 struct folio *folio = NULL; 1450 1433 ··· 1488 1515 } 1489 1516 1490 1517 out: 1491 - if (src_anon_vma) { 1492 - anon_vma_unlock_write(src_anon_vma); 1493 - put_anon_vma(src_anon_vma); 1494 - } 1495 1518 if (src_folio) { 1496 1519 folio_unlock(src_folio); 1497 1520 folio_put(src_folio); ··· 1761 1792 * virtual regions without knowing if there are transparent hugepage 1762 1793 * in the regions or not, but preventing the risk of having to split 1763 1794 * the hugepmd during the remap. 1764 - * 1765 - * If there's any rmap walk that is taking the anon_vma locks without 1766 - * first obtaining the folio lock (the only current instance is 1767 - * folio_referenced), they will have to verify if the folio->mapping 1768 - * has changed after taking the anon_vma lock. If it changed they 1769 - * should release the lock and retry obtaining a new anon_vma, because 1770 - * it means the anon_vma was changed by move_pages() before the lock 1771 - * could be obtained. This is the only additional complexity added to 1772 - * the rmap code to provide this anonymous page remapping functionality. 1773 1795 */ 1774 1796 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, 1775 1797 unsigned long src_start, unsigned long len, __u64 mode)

Configure Feed

Configure Feed