Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

shmem, userfaultfd: implement shmem uffd operations using vm_uffd_ops

Add filemap_add() and filemap_remove() methods to vm_uffd_ops and use them
in __mfill_atomic_pte() to add shmem folios to page cache and remove them
in case of error.

Implement these methods in shmem along with vm_uffd_ops->alloc_folio() and
drop shmem_mfill_atomic_pte().

Since userfaultfd no longer references any functions from shmem, drop the
include of linux/shmem_fs.h from mm/userfaultfd.c.

mfill_atomic_install_pte() is not used anywhere outside of mm/userfaultfd.c,
so make it static.

Link: https://lore.kernel.org/20260402041156.1377214-11-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: James Houghton <jthoughton@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrei Vagin <avagin@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand (Arm) <david@kernel.org>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Harry Yoo (Oracle) <harry@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nikita Kalyazin <kalyazin@amazon.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Carlier <devnexen@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Mike Rapoport (Microsoft) and committed by
Andrew Morton
f74991b4 ad9ac308

+105 -154
-14
include/linux/shmem_fs.h
··· 221 221 222 222 extern bool shmem_charge(struct inode *inode, long pages); 223 223 224 - #ifdef CONFIG_USERFAULTFD 225 - #ifdef CONFIG_SHMEM 226 - extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 227 - struct vm_area_struct *dst_vma, 228 - unsigned long dst_addr, 229 - unsigned long src_addr, 230 - uffd_flags_t flags, 231 - struct folio **foliop); 232 - #else /* !CONFIG_SHMEM */ 233 - #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ 234 - src_addr, flags, foliop) ({ BUG(); 0; }) 235 - #endif /* CONFIG_SHMEM */ 236 - #endif /* CONFIG_USERFAULTFD */ 237 - 238 224 /* 239 225 * Used space is stored as unsigned 64-bit value in bytes but 240 226 * quota core supports only signed 64-bit values so use that
+14 -5
include/linux/userfaultfd_k.h
··· 100 100 */ 101 101 struct folio *(*alloc_folio)(struct vm_area_struct *vma, 102 102 unsigned long addr); 103 + /* 104 + * Called during resolution of UFFDIO_COPY request. 105 + * Should only be called with a folio returned by alloc_folio() above. 106 + * The folio will be set to locked. 107 + * Returns 0 on success, error code on failure. 108 + */ 109 + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, 110 + unsigned long addr); 111 + /* 112 + * Called during resolution of UFFDIO_COPY request on the error 113 + * handling path. 114 + * Should revert the operation of ->filemap_add(). 115 + */ 116 + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); 103 117 }; 104 118 105 119 /* A combined operation mode + behavior flags. */ ··· 146 132 147 133 /* Flags controlling behavior. These behavior changes are mode-independent. */ 148 134 #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) 149 - 150 - extern int mfill_atomic_install_pte(pmd_t *dst_pmd, 151 - struct vm_area_struct *dst_vma, 152 - unsigned long dst_addr, struct page *page, 153 - bool newly_allocated, uffd_flags_t flags); 154 135 155 136 extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, 156 137 unsigned long src_start, unsigned long len,
+52 -94
mm/shmem.c
··· 3175 3175 #endif /* CONFIG_TMPFS_QUOTA */ 3176 3176 3177 3177 #ifdef CONFIG_USERFAULTFD 3178 - int shmem_mfill_atomic_pte(pmd_t *dst_pmd, 3179 - struct vm_area_struct *dst_vma, 3180 - unsigned long dst_addr, 3181 - unsigned long src_addr, 3182 - uffd_flags_t flags, 3183 - struct folio **foliop) 3178 + static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, 3179 + unsigned long addr) 3184 3180 { 3185 - struct inode *inode = file_inode(dst_vma->vm_file); 3186 - struct shmem_inode_info *info = SHMEM_I(inode); 3181 + struct inode *inode = file_inode(vma->vm_file); 3187 3182 struct address_space *mapping = inode->i_mapping; 3183 + struct shmem_inode_info *info = SHMEM_I(inode); 3184 + pgoff_t pgoff = linear_page_index(vma, addr); 3188 3185 gfp_t gfp = mapping_gfp_mask(mapping); 3189 - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 3190 - void *page_kaddr; 3191 3186 struct folio *folio; 3192 - int ret; 3193 - pgoff_t max_off; 3194 3187 3195 - if (shmem_inode_acct_blocks(inode, 1)) { 3196 - /* 3197 - * We may have got a page, returned -ENOENT triggering a retry, 3198 - * and now we find ourselves with -ENOMEM. Release the page, to 3199 - * avoid a BUG_ON in our caller. 
3200 - */ 3201 - if (unlikely(*foliop)) { 3202 - folio_put(*foliop); 3203 - *foliop = NULL; 3204 - } 3205 - return -ENOMEM; 3188 + if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) 3189 + return NULL; 3190 + 3191 + folio = shmem_alloc_folio(gfp, 0, info, pgoff); 3192 + if (!folio) 3193 + return NULL; 3194 + 3195 + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { 3196 + folio_put(folio); 3197 + return NULL; 3206 3198 } 3207 3199 3208 - if (!*foliop) { 3209 - ret = -ENOMEM; 3210 - folio = shmem_alloc_folio(gfp, 0, info, pgoff); 3211 - if (!folio) 3212 - goto out_unacct_blocks; 3200 + return folio; 3201 + } 3213 3202 3214 - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 3215 - page_kaddr = kmap_local_folio(folio, 0); 3216 - /* 3217 - * The read mmap_lock is held here. Despite the 3218 - * mmap_lock being read recursive a deadlock is still 3219 - * possible if a writer has taken a lock. For example: 3220 - * 3221 - * process A thread 1 takes read lock on own mmap_lock 3222 - * process A thread 2 calls mmap, blocks taking write lock 3223 - * process B thread 1 takes page fault, read lock on own mmap lock 3224 - * process B thread 2 calls mmap, blocks taking write lock 3225 - * process A thread 1 blocks taking read lock on process B 3226 - * process B thread 1 blocks taking read lock on process A 3227 - * 3228 - * Disable page faults to prevent potential deadlock 3229 - * and retry the copy outside the mmap_lock. 
3230 - */ 3231 - pagefault_disable(); 3232 - ret = copy_from_user(page_kaddr, 3233 - (const void __user *)src_addr, 3234 - PAGE_SIZE); 3235 - pagefault_enable(); 3236 - kunmap_local(page_kaddr); 3203 + static int shmem_mfill_filemap_add(struct folio *folio, 3204 + struct vm_area_struct *vma, 3205 + unsigned long addr) 3206 + { 3207 + struct inode *inode = file_inode(vma->vm_file); 3208 + struct address_space *mapping = inode->i_mapping; 3209 + pgoff_t pgoff = linear_page_index(vma, addr); 3210 + gfp_t gfp = mapping_gfp_mask(mapping); 3211 + int err; 3237 3212 3238 - /* fallback to copy_from_user outside mmap_lock */ 3239 - if (unlikely(ret)) { 3240 - *foliop = folio; 3241 - ret = -ENOENT; 3242 - /* don't free the page */ 3243 - goto out_unacct_blocks; 3244 - } 3245 - 3246 - flush_dcache_folio(folio); 3247 - } else { /* ZEROPAGE */ 3248 - clear_user_highpage(&folio->page, dst_addr); 3249 - } 3250 - } else { 3251 - folio = *foliop; 3252 - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 3253 - *foliop = NULL; 3254 - } 3255 - 3256 - VM_BUG_ON(folio_test_locked(folio)); 3257 - VM_BUG_ON(folio_test_swapbacked(folio)); 3258 3213 __folio_set_locked(folio); 3259 3214 __folio_set_swapbacked(folio); 3260 - __folio_mark_uptodate(folio); 3261 3215 3262 - ret = -EFAULT; 3263 - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 3264 - if (unlikely(pgoff >= max_off)) 3265 - goto out_release; 3216 + err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); 3217 + if (err) 3218 + goto err_unlock; 3266 3219 3267 - ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); 3268 - if (ret) 3269 - goto out_release; 3270 - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); 3271 - if (ret) 3272 - goto out_release; 3220 + if (shmem_inode_acct_blocks(inode, 1)) { 3221 + err = -ENOMEM; 3222 + goto err_delete_from_cache; 3223 + } 3273 3224 3274 - ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 3275 - &folio->page, true, flags); 3276 - if (ret) 3277 - goto 
out_delete_from_cache; 3278 - 3225 + folio_add_lru(folio); 3279 3226 shmem_recalc_inode(inode, 1, 0); 3280 - folio_unlock(folio); 3227 + 3281 3228 return 0; 3282 - out_delete_from_cache: 3229 + 3230 + err_delete_from_cache: 3283 3231 filemap_remove_folio(folio); 3284 - out_release: 3232 + err_unlock: 3285 3233 folio_unlock(folio); 3286 - folio_put(folio); 3287 - out_unacct_blocks: 3288 - shmem_inode_unacct_blocks(inode, 1); 3289 - return ret; 3234 + return err; 3235 + } 3236 + 3237 + static void shmem_mfill_filemap_remove(struct folio *folio, 3238 + struct vm_area_struct *vma) 3239 + { 3240 + struct inode *inode = file_inode(vma->vm_file); 3241 + 3242 + filemap_remove_folio(folio); 3243 + shmem_recalc_inode(inode, 0, 0); 3244 + folio_unlock(folio); 3290 3245 } 3291 3246 3292 3247 static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) ··· 3264 3309 static const struct vm_uffd_ops shmem_uffd_ops = { 3265 3310 .can_userfault = shmem_can_userfault, 3266 3311 .get_folio_noalloc = shmem_get_folio_noalloc, 3312 + .alloc_folio = shmem_mfill_folio_alloc, 3313 + .filemap_add = shmem_mfill_filemap_add, 3314 + .filemap_remove = shmem_mfill_filemap_remove, 3267 3315 }; 3268 3316 #endif /* CONFIG_USERFAULTFD */ 3269 3317
+39 -41
mm/userfaultfd.c
··· 14 14 #include <linux/userfaultfd_k.h> 15 15 #include <linux/mmu_notifier.h> 16 16 #include <linux/hugetlb.h> 17 - #include <linux/shmem_fs.h> 18 17 #include <asm/tlbflush.h> 19 18 #include <asm/tlb.h> 20 19 #include "internal.h" ··· 337 338 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem 338 339 * and anon, and for both shared and private VMAs. 339 340 */ 340 - int mfill_atomic_install_pte(pmd_t *dst_pmd, 341 - struct vm_area_struct *dst_vma, 342 - unsigned long dst_addr, struct page *page, 343 - bool newly_allocated, uffd_flags_t flags) 341 + static int mfill_atomic_install_pte(pmd_t *dst_pmd, 342 + struct vm_area_struct *dst_vma, 343 + unsigned long dst_addr, struct page *page, 344 + uffd_flags_t flags) 344 345 { 345 346 int ret; 346 347 struct mm_struct *dst_mm = dst_vma->vm_mm; ··· 384 385 goto out_unlock; 385 386 386 387 if (page_in_cache) { 387 - /* Usually, cache pages are already added to LRU */ 388 - if (newly_allocated) 389 - folio_add_lru(folio); 390 388 folio_add_file_rmap_pte(folio, page, dst_vma); 391 389 } else { 392 390 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); ··· 397 401 inc_mm_counter(dst_mm, mm_counter(folio)); 398 402 399 403 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 404 + 405 + if (page_in_cache) 406 + folio_unlock(folio); 400 407 401 408 /* No need to invalidate - it was non-present before */ 402 409 update_mmu_cache(dst_vma, dst_addr, dst_pte); ··· 513 514 */ 514 515 __folio_mark_uptodate(folio); 515 516 517 + if (ops->filemap_add) { 518 + ret = ops->filemap_add(folio, state->vma, state->dst_addr); 519 + if (ret) 520 + goto err_folio_put; 521 + } 522 + 516 523 ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, 517 - &folio->page, true, flags); 524 + &folio->page, flags); 518 525 if (ret) 519 - goto err_folio_put; 526 + goto err_filemap_remove; 520 527 521 528 return 0; 522 529 530 + err_filemap_remove: 531 + if (ops->filemap_remove) 532 + 
ops->filemap_remove(folio, state->vma); 523 533 err_folio_put: 524 534 folio_put(folio); 525 535 /* Don't return -ENOENT so that our caller won't retry */ ··· 540 532 static int mfill_atomic_pte_copy(struct mfill_state *state) 541 533 { 542 534 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 535 + 536 + /* 537 + * The normal page fault path for a MAP_PRIVATE mapping in a 538 + * file-backed VMA will invoke the fault, fill the hole in the file and 539 + * COW it right away. The result generates plain anonymous memory. 540 + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll 541 + * generate anonymous memory directly without actually filling the 542 + * hole. For the MAP_PRIVATE case the robustness check only happens in 543 + * the pagetable (to verify it's still none) and not in the page cache. 544 + */ 545 + if (!(state->vma->vm_flags & VM_SHARED)) 546 + ops = &anon_uffd_ops; 543 547 544 548 return __mfill_atomic_pte(state, ops); 545 549 } ··· 572 552 spinlock_t *ptl; 573 553 int ret; 574 554 575 - if (mm_forbids_zeropage(dst_vma->vm_mm)) 555 + if (mm_forbids_zeropage(dst_vma->vm_mm) || 556 + (dst_vma->vm_flags & VM_SHARED)) 576 557 return mfill_atomic_pte_zeroed_folio(state); 577 558 578 559 _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), ··· 630 609 } 631 610 632 611 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 633 - page, false, flags); 612 + page, flags); 634 613 if (ret) 635 614 goto out_release; 636 615 637 - folio_unlock(folio); 638 616 return 0; 639 617 640 618 out_release: ··· 856 836 857 837 static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) 858 838 { 859 - struct vm_area_struct *dst_vma = state->vma; 860 - unsigned long src_addr = state->src_addr; 861 - unsigned long dst_addr = state->dst_addr; 862 - struct folio **foliop = &state->folio; 863 839 uffd_flags_t flags = state->flags; 864 - pmd_t *dst_pmd = state->pmd; 865 - ssize_t err; 866 840 867 841 if (uffd_flags_mode_is(flags, 
MFILL_ATOMIC_CONTINUE)) 868 842 return mfill_atomic_pte_continue(state); 869 843 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) 870 844 return mfill_atomic_pte_poison(state); 845 + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) 846 + return mfill_atomic_pte_copy(state); 847 + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) 848 + return mfill_atomic_pte_zeropage(state); 871 849 872 - /* 873 - * The normal page fault path for a shmem will invoke the 874 - * fault, fill the hole in the file and COW it right away. The 875 - * result generates plain anonymous memory. So when we are 876 - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll 877 - * generate anonymous memory directly without actually filling 878 - * the hole. For the MAP_PRIVATE case the robustness check 879 - * only happens in the pagetable (to verify it's still none) 880 - * and not in the radix tree. 881 - */ 882 - if (!(dst_vma->vm_flags & VM_SHARED)) { 883 - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) 884 - err = mfill_atomic_pte_copy(state); 885 - else 886 - err = mfill_atomic_pte_zeropage(state); 887 - } else { 888 - err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, 889 - dst_addr, src_addr, 890 - flags, foliop); 891 - } 892 - 893 - return err; 850 + VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); 851 + return -EOPNOTSUPP; 894 852 } 895 853 896 854 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,