Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at master 2241 lines 60 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * mm/userfaultfd.c 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 */ 7 8#include <linux/mm.h> 9#include <linux/sched/signal.h> 10#include <linux/pagemap.h> 11#include <linux/rmap.h> 12#include <linux/swap.h> 13#include <linux/leafops.h> 14#include <linux/userfaultfd_k.h> 15#include <linux/mmu_notifier.h> 16#include <linux/hugetlb.h> 17#include <asm/tlbflush.h> 18#include <asm/tlb.h> 19#include "internal.h" 20#include "swap.h" 21 22struct mfill_state { 23 struct userfaultfd_ctx *ctx; 24 unsigned long src_start; 25 unsigned long dst_start; 26 unsigned long len; 27 uffd_flags_t flags; 28 29 struct vm_area_struct *vma; 30 unsigned long src_addr; 31 unsigned long dst_addr; 32 pmd_t *pmd; 33}; 34 35static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) 36{ 37 /* anonymous memory does not support MINOR mode */ 38 if (vm_flags & VM_UFFD_MINOR) 39 return false; 40 return true; 41} 42 43static struct folio *anon_alloc_folio(struct vm_area_struct *vma, 44 unsigned long addr) 45{ 46 struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, 47 addr); 48 49 if (!folio) 50 return NULL; 51 52 if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { 53 folio_put(folio); 54 return NULL; 55 } 56 57 return folio; 58} 59 60static const struct vm_uffd_ops anon_uffd_ops = { 61 .can_userfault = anon_can_userfault, 62 .alloc_folio = anon_alloc_folio, 63}; 64 65static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) 66{ 67 if (vma_is_anonymous(vma)) 68 return &anon_uffd_ops; 69 return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; 70} 71 72static __always_inline 73bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) 74{ 75 /* Make sure that the dst range is fully within dst_vma. */ 76 if (dst_end > dst_vma->vm_end) 77 return false; 78 79 /* 80 * Check the vma is registered in uffd, this is required to 81 * enforce the VM_MAYWRITE check done at uffd registration 82 * time. 83 */ 84 if (!dst_vma->vm_userfaultfd_ctx.ctx) 85 return false; 86 87 return true; 88} 89 90static __always_inline 91struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, 92 unsigned long addr) 93{ 94 struct vm_area_struct *vma; 95 96 mmap_assert_locked(mm); 97 vma = vma_lookup(mm, addr); 98 if (!vma) 99 vma = ERR_PTR(-ENOENT); 100 else if (!(vma->vm_flags & VM_SHARED) && 101 unlikely(anon_vma_prepare(vma))) 102 vma = ERR_PTR(-ENOMEM); 103 104 return vma; 105} 106 107#ifdef CONFIG_PER_VMA_LOCK 108/* 109 * uffd_lock_vma() - Lookup and lock vma corresponding to @address. 110 * @mm: mm to search vma in. 111 * @address: address that the vma should contain. 112 * 113 * Should be called without holding mmap_lock. 114 * 115 * Return: A locked vma containing @address, -ENOENT if no vma is found, or 116 * -ENOMEM if anon_vma couldn't be allocated. 117 */ 118static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, 119 unsigned long address) 120{ 121 struct vm_area_struct *vma; 122 123 vma = lock_vma_under_rcu(mm, address); 124 if (vma) { 125 /* 126 * We know we're going to need to use anon_vma, so check 127 * that early. 
128 */ 129 if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) 130 vma_end_read(vma); 131 else 132 return vma; 133 } 134 135 mmap_read_lock(mm); 136 vma = find_vma_and_prepare_anon(mm, address); 137 if (!IS_ERR(vma)) { 138 bool locked = vma_start_read_locked(vma); 139 140 if (!locked) 141 vma = ERR_PTR(-EAGAIN); 142 } 143 144 mmap_read_unlock(mm); 145 return vma; 146} 147 148static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 149 unsigned long dst_start, 150 unsigned long len) 151{ 152 struct vm_area_struct *dst_vma; 153 154 dst_vma = uffd_lock_vma(dst_mm, dst_start); 155 if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) 156 return dst_vma; 157 158 vma_end_read(dst_vma); 159 return ERR_PTR(-ENOENT); 160} 161 162static void uffd_mfill_unlock(struct vm_area_struct *vma) 163{ 164 vma_end_read(vma); 165} 166 167#else 168 169static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, 170 unsigned long dst_start, 171 unsigned long len) 172{ 173 struct vm_area_struct *dst_vma; 174 175 mmap_read_lock(dst_mm); 176 dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); 177 if (IS_ERR(dst_vma)) 178 goto out_unlock; 179 180 if (validate_dst_vma(dst_vma, dst_start + len)) 181 return dst_vma; 182 183 dst_vma = ERR_PTR(-ENOENT); 184out_unlock: 185 mmap_read_unlock(dst_mm); 186 return dst_vma; 187} 188 189static void uffd_mfill_unlock(struct vm_area_struct *vma) 190{ 191 mmap_read_unlock(vma->vm_mm); 192} 193#endif 194 195static void mfill_put_vma(struct mfill_state *state) 196{ 197 if (!state->vma) 198 return; 199 200 up_read(&state->ctx->map_changing_lock); 201 uffd_mfill_unlock(state->vma); 202 state->vma = NULL; 203} 204 205static int mfill_get_vma(struct mfill_state *state) 206{ 207 struct userfaultfd_ctx *ctx = state->ctx; 208 uffd_flags_t flags = state->flags; 209 struct vm_area_struct *dst_vma; 210 const struct vm_uffd_ops *ops; 211 int err; 212 213 /* 214 * Make sure the vma is not shared, that the dst range is 215 * both valid and fully within a single existing vma. 216 */ 217 dst_vma = uffd_mfill_lock(ctx->mm, state->dst_start, state->len); 218 if (IS_ERR(dst_vma)) 219 return PTR_ERR(dst_vma); 220 221 /* 222 * If memory mappings are changing because of non-cooperative 223 * operation (e.g. mremap) running in parallel, bail out and 224 * request the user to retry later 225 */ 226 down_read(&ctx->map_changing_lock); 227 state->vma = dst_vma; 228 err = -EAGAIN; 229 if (atomic_read(&ctx->mmap_changing)) 230 goto out_unlock; 231 232 err = -EINVAL; 233 234 /* 235 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but 236 * it will overwrite vm_ops, so vma_is_anonymous must return false. 237 */ 238 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && 239 dst_vma->vm_flags & VM_SHARED)) 240 goto out_unlock; 241 242 /* 243 * validate 'mode' now that we know the dst_vma: don't allow 244 * a wrprotect copy if the userfaultfd didn't register as WP. 
245 */ 246 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) 247 goto out_unlock; 248 249 if (is_vm_hugetlb_page(dst_vma)) 250 return 0; 251 252 ops = vma_uffd_ops(dst_vma); 253 if (!ops) 254 goto out_unlock; 255 256 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && 257 !ops->get_folio_noalloc) 258 goto out_unlock; 259 260 return 0; 261 262out_unlock: 263 mfill_put_vma(state); 264 return err; 265} 266 267static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) 268{ 269 pgd_t *pgd; 270 p4d_t *p4d; 271 pud_t *pud; 272 273 pgd = pgd_offset(mm, address); 274 p4d = p4d_alloc(mm, pgd, address); 275 if (!p4d) 276 return NULL; 277 pud = pud_alloc(mm, p4d, address); 278 if (!pud) 279 return NULL; 280 /* 281 * Note that we didn't run this because the pmd was 282 * missing, the *pmd may be already established and in 283 * turn it may also be a trans_huge_pmd. 284 */ 285 return pmd_alloc(mm, pud, address); 286} 287 288static int mfill_establish_pmd(struct mfill_state *state) 289{ 290 struct mm_struct *dst_mm = state->ctx->mm; 291 pmd_t *dst_pmd, dst_pmdval; 292 293 dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr); 294 if (unlikely(!dst_pmd)) 295 return -ENOMEM; 296 297 dst_pmdval = pmdp_get_lockless(dst_pmd); 298 if (unlikely(pmd_none(dst_pmdval)) && 299 unlikely(__pte_alloc(dst_mm, dst_pmd))) 300 return -ENOMEM; 301 302 dst_pmdval = pmdp_get_lockless(dst_pmd); 303 /* 304 * If the dst_pmd is THP don't override it and just be strict. 305 * (This includes the case where the PMD used to be THP and 306 * changed back to none after __pte_alloc().) 307 */ 308 if (unlikely(!pmd_present(dst_pmdval) || pmd_leaf(dst_pmdval))) 309 return -EEXIST; 310 if (unlikely(pmd_bad(dst_pmdval))) 311 return -EFAULT; 312 313 state->pmd = dst_pmd; 314 return 0; 315} 316 317/* Check if dst_addr is outside of file's size. Must be called with ptl held. */ 318static bool mfill_file_over_size(struct vm_area_struct *dst_vma, 319 unsigned long dst_addr) 320{ 321 struct inode *inode; 322 pgoff_t offset, max_off; 323 324 if (!dst_vma->vm_file) 325 return false; 326 327 inode = dst_vma->vm_file->f_inode; 328 offset = linear_page_index(dst_vma, dst_addr); 329 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 330 return offset >= max_off; 331} 332 333/* 334 * Install PTEs, to map dst_addr (within dst_vma) to page. 335 * 336 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem 337 * and anon, and for both shared and private VMAs. 
338 */ 339static int mfill_atomic_install_pte(pmd_t *dst_pmd, 340 struct vm_area_struct *dst_vma, 341 unsigned long dst_addr, struct page *page, 342 uffd_flags_t flags) 343{ 344 int ret; 345 struct mm_struct *dst_mm = dst_vma->vm_mm; 346 pte_t _dst_pte, *dst_pte; 347 bool writable = dst_vma->vm_flags & VM_WRITE; 348 bool vm_shared = dst_vma->vm_flags & VM_SHARED; 349 spinlock_t *ptl; 350 struct folio *folio = page_folio(page); 351 bool page_in_cache = folio_mapping(folio); 352 pte_t dst_ptep; 353 354 _dst_pte = mk_pte(page, dst_vma->vm_page_prot); 355 _dst_pte = pte_mkdirty(_dst_pte); 356 if (page_in_cache && !vm_shared) 357 writable = false; 358 if (writable) 359 _dst_pte = pte_mkwrite(_dst_pte, dst_vma); 360 if (flags & MFILL_ATOMIC_WP) 361 _dst_pte = pte_mkuffd_wp(_dst_pte); 362 363 ret = -EAGAIN; 364 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 365 if (!dst_pte) 366 goto out; 367 368 if (mfill_file_over_size(dst_vma, dst_addr)) { 369 ret = -EFAULT; 370 goto out_unlock; 371 } 372 373 ret = -EEXIST; 374 375 dst_ptep = ptep_get(dst_pte); 376 377 /* 378 * We are allowed to overwrite a UFFD pte marker: consider when both 379 * MISSING|WP registered, we firstly wr-protect a none pte which has no 380 * page cache page backing it, then access the page. 381 */ 382 if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) 383 goto out_unlock; 384 385 if (page_in_cache) { 386 folio_add_file_rmap_pte(folio, page, dst_vma); 387 } else { 388 folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); 389 folio_add_lru_vma(folio, dst_vma); 390 } 391 392 /* 393 * Must happen after rmap, as mm_counter() checks mapping (via 394 * PageAnon()), which is set by __page_set_anon_rmap(). 395 */ 396 inc_mm_counter(dst_mm, mm_counter(folio)); 397 398 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 399 400 if (page_in_cache) 401 folio_unlock(folio); 402 403 /* No need to invalidate - it was non-present before */ 404 update_mmu_cache(dst_vma, dst_addr, dst_pte); 405 ret = 0; 406out_unlock: 407 pte_unmap_unlock(dst_pte, ptl); 408out: 409 return ret; 410} 411 412static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) 413{ 414 void *kaddr; 415 int ret; 416 417 kaddr = kmap_local_folio(folio, 0); 418 /* 419 * The read mmap_lock is held here. Despite the 420 * mmap_lock being read recursive a deadlock is still 421 * possible if a writer has taken a lock. For example: 422 * 423 * process A thread 1 takes read lock on own mmap_lock 424 * process A thread 2 calls mmap, blocks taking write lock 425 * process B thread 1 takes page fault, read lock on own mmap lock 426 * process B thread 2 calls mmap, blocks taking write lock 427 * process A thread 1 blocks taking read lock on process B 428 * process B thread 1 blocks taking read lock on process A 429 * 430 * Disable page faults to prevent potential deadlock 431 * and retry the copy outside the mmap_lock. 
432 */ 433 pagefault_disable(); 434 ret = copy_from_user(kaddr, (const void __user *) src_addr, 435 PAGE_SIZE); 436 pagefault_enable(); 437 kunmap_local(kaddr); 438 439 if (ret) 440 return -EFAULT; 441 442 flush_dcache_folio(folio); 443 return ret; 444} 445 446static int mfill_copy_folio_retry(struct mfill_state *state, 447 struct folio *folio) 448{ 449 const struct vm_uffd_ops *orig_ops = vma_uffd_ops(state->vma); 450 unsigned long src_addr = state->src_addr; 451 void *kaddr; 452 int err; 453 454 /* retry copying with mm_lock dropped */ 455 mfill_put_vma(state); 456 457 kaddr = kmap_local_folio(folio, 0); 458 err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); 459 kunmap_local(kaddr); 460 if (unlikely(err)) 461 return -EFAULT; 462 463 flush_dcache_folio(folio); 464 465 /* reget VMA and PMD, they could change underneath us */ 466 err = mfill_get_vma(state); 467 if (err) 468 return err; 469 470 /* 471 * The VMA type may have changed while the lock was dropped 472 * (e.g. replaced with a hugetlb mapping), making the caller's 473 * ops pointer stale. 474 */ 475 if (vma_uffd_ops(state->vma) != orig_ops) 476 return -EAGAIN; 477 478 err = mfill_establish_pmd(state); 479 if (err) 480 return err; 481 482 return 0; 483} 484 485static int __mfill_atomic_pte(struct mfill_state *state, 486 const struct vm_uffd_ops *ops) 487{ 488 unsigned long dst_addr = state->dst_addr; 489 unsigned long src_addr = state->src_addr; 490 uffd_flags_t flags = state->flags; 491 struct folio *folio; 492 int ret; 493 494 folio = ops->alloc_folio(state->vma, state->dst_addr); 495 if (!folio) 496 return -ENOMEM; 497 498 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { 499 ret = mfill_copy_folio_locked(folio, src_addr); 500 /* 501 * Fallback to copy_from_user outside mmap_lock. 502 * If retry is successful, mfill_copy_folio_locked() returns 503 * with locks retaken by mfill_get_vma(). 504 * If there was an error, we must mfill_put_vma() anyway and it 505 * will take care of unlocking if needed. 506 */ 507 if (unlikely(ret)) { 508 ret = mfill_copy_folio_retry(state, folio); 509 if (ret) 510 goto err_folio_put; 511 } 512 } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { 513 clear_user_highpage(&folio->page, state->dst_addr); 514 } else { 515 VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); 516 } 517 518 /* 519 * The memory barrier inside __folio_mark_uptodate makes sure that 520 * preceding stores to the page contents become visible before 521 * the set_pte_at() write. 522 */ 523 __folio_mark_uptodate(folio); 524 525 if (ops->filemap_add) { 526 ret = ops->filemap_add(folio, state->vma, state->dst_addr); 527 if (ret) 528 goto err_folio_put; 529 } 530 531 ret = mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, 532 &folio->page, flags); 533 if (ret) 534 goto err_filemap_remove; 535 536 return 0; 537 538err_filemap_remove: 539 if (ops->filemap_remove) 540 ops->filemap_remove(folio, state->vma); 541err_folio_put: 542 folio_put(folio); 543 return ret; 544} 545 546static int mfill_atomic_pte_copy(struct mfill_state *state) 547{ 548 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 549 550 /* 551 * The normal page fault path for a MAP_PRIVATE mapping in a 552 * file-backed VMA will invoke the fault, fill the hole in the file and 553 * COW it right away. The result generates plain anonymous memory. 554 * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll 555 * generate anonymous memory directly without actually filling the 556 * hole. 
For the MAP_PRIVATE case the robustness check only happens in 557 * the pagetable (to verify it's still none) and not in the page cache. 558 */ 559 if (!(state->vma->vm_flags & VM_SHARED)) 560 ops = &anon_uffd_ops; 561 562 return __mfill_atomic_pte(state, ops); 563} 564 565static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) 566{ 567 const struct vm_uffd_ops *ops = vma_uffd_ops(state->vma); 568 569 return __mfill_atomic_pte(state, ops); 570} 571 572static int mfill_atomic_pte_zeropage(struct mfill_state *state) 573{ 574 struct vm_area_struct *dst_vma = state->vma; 575 unsigned long dst_addr = state->dst_addr; 576 pmd_t *dst_pmd = state->pmd; 577 pte_t _dst_pte, *dst_pte; 578 spinlock_t *ptl; 579 int ret; 580 581 if (mm_forbids_zeropage(dst_vma->vm_mm) || 582 (dst_vma->vm_flags & VM_SHARED)) 583 return mfill_atomic_pte_zeroed_folio(state); 584 585 _dst_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 586 dst_vma->vm_page_prot)); 587 ret = -EAGAIN; 588 dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); 589 if (!dst_pte) 590 goto out; 591 if (mfill_file_over_size(dst_vma, dst_addr)) { 592 ret = -EFAULT; 593 goto out_unlock; 594 } 595 ret = -EEXIST; 596 if (!pte_none(ptep_get(dst_pte))) 597 goto out_unlock; 598 set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); 599 /* No need to invalidate - it was non-present before */ 600 update_mmu_cache(dst_vma, dst_addr, dst_pte); 601 ret = 0; 602out_unlock: 603 pte_unmap_unlock(dst_pte, ptl); 604out: 605 return ret; 606} 607 608/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ 609static int mfill_atomic_pte_continue(struct mfill_state *state) 610{ 611 struct vm_area_struct *dst_vma = state->vma; 612 const struct vm_uffd_ops *ops = vma_uffd_ops(dst_vma); 613 unsigned long dst_addr = state->dst_addr; 614 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); 615 struct inode *inode = file_inode(dst_vma->vm_file); 616 uffd_flags_t flags = state->flags; 617 pmd_t *dst_pmd = state->pmd; 618 struct folio *folio; 619 struct page *page; 620 int ret; 621 622 if (!ops) { 623 VM_WARN_ONCE(1, "UFFDIO_CONTINUE for unsupported VMA"); 624 return -EOPNOTSUPP; 625 } 626 627 folio = ops->get_folio_noalloc(inode, pgoff); 628 /* Our caller expects us to return -EFAULT if we failed to find folio */ 629 if (IS_ERR_OR_NULL(folio)) 630 return -EFAULT; 631 632 page = folio_file_page(folio, pgoff); 633 if (PageHWPoison(page)) { 634 ret = -EIO; 635 goto out_release; 636 } 637 638 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, 639 page, flags); 640 if (ret) 641 goto out_release; 642 643 return 0; 644 645out_release: 646 folio_unlock(folio); 647 folio_put(folio); 648 return ret; 649} 650 651/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ 652static int mfill_atomic_pte_poison(struct mfill_state *state) 653{ 654 struct vm_area_struct *dst_vma = state->vma; 655 struct mm_struct *dst_mm = dst_vma->vm_mm; 656 unsigned long dst_addr = state->dst_addr; 657 pmd_t *dst_pmd = state->pmd; 658 pte_t _dst_pte, *dst_pte; 659 spinlock_t *ptl; 660 int ret; 661 662 _dst_pte = make_pte_marker(PTE_MARKER_POISONED); 663 ret = -EAGAIN; 664 dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); 665 if (!dst_pte) 666 goto out; 667 668 if (mfill_file_over_size(dst_vma, dst_addr)) { 669 ret = -EFAULT; 670 goto out_unlock; 671 } 672 673 ret = -EEXIST; 674 /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). 
*/ 675 if (!pte_none(ptep_get(dst_pte))) 676 goto out_unlock; 677 678 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 679 680 /* No need to invalidate - it was non-present before */ 681 update_mmu_cache(dst_vma, dst_addr, dst_pte); 682 ret = 0; 683out_unlock: 684 pte_unmap_unlock(dst_pte, ptl); 685out: 686 return ret; 687} 688 689#ifdef CONFIG_HUGETLB_PAGE 690/* 691 * mfill_atomic processing for HUGETLB vmas. Note that this routine is 692 * called with either vma-lock or mmap_lock held, it will release the lock 693 * before returning. 694 */ 695static __always_inline ssize_t mfill_atomic_hugetlb( 696 struct userfaultfd_ctx *ctx, 697 struct vm_area_struct *dst_vma, 698 unsigned long dst_start, 699 unsigned long src_start, 700 unsigned long len, 701 uffd_flags_t flags) 702{ 703 struct mm_struct *dst_mm = dst_vma->vm_mm; 704 ssize_t err; 705 pte_t *dst_pte; 706 unsigned long src_addr, dst_addr; 707 long copied; 708 struct folio *folio; 709 unsigned long vma_hpagesize; 710 pgoff_t idx; 711 u32 hash; 712 struct address_space *mapping; 713 714 /* 715 * There is no default zero huge page for all huge page sizes as 716 * supported by hugetlb. A PMD_SIZE huge pages may exist as used 717 * by THP. Since we can not reliably insert a zero page, this 718 * feature is not supported. 719 */ 720 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { 721 up_read(&ctx->map_changing_lock); 722 uffd_mfill_unlock(dst_vma); 723 return -EINVAL; 724 } 725 726 src_addr = src_start; 727 dst_addr = dst_start; 728 copied = 0; 729 folio = NULL; 730 vma_hpagesize = vma_kernel_pagesize(dst_vma); 731 732 /* 733 * Validate alignment based on huge page size 734 */ 735 err = -EINVAL; 736 if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) 737 goto out_unlock; 738 739retry: 740 /* 741 * On routine entry dst_vma is set. If we had to drop mmap_lock and 742 * retry, dst_vma will be set to NULL and we must lookup again. 743 */ 744 if (!dst_vma) { 745 dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); 746 if (IS_ERR(dst_vma)) { 747 err = PTR_ERR(dst_vma); 748 goto out; 749 } 750 751 err = -ENOENT; 752 if (!is_vm_hugetlb_page(dst_vma)) 753 goto out_unlock_vma; 754 755 err = -EINVAL; 756 if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) 757 goto out_unlock_vma; 758 759 /* 760 * If memory mappings are changing because of non-cooperative 761 * operation (e.g. mremap) running in parallel, bail out and 762 * request the user to retry later 763 */ 764 down_read(&ctx->map_changing_lock); 765 err = -EAGAIN; 766 if (atomic_read(&ctx->mmap_changing)) 767 goto out_unlock; 768 } 769 770 while (src_addr < src_start + len) { 771 VM_WARN_ON_ONCE(dst_addr >= dst_start + len); 772 773 /* 774 * Serialize via vma_lock and hugetlb_fault_mutex. 775 * vma_lock ensures the dst_pte remains valid even 776 * in the case of shared pmds. fault mutex prevents 777 * races with other faulting threads. 
778 */ 779 idx = hugetlb_linear_page_index(dst_vma, dst_addr); 780 mapping = dst_vma->vm_file->f_mapping; 781 hash = hugetlb_fault_mutex_hash(mapping, idx); 782 mutex_lock(&hugetlb_fault_mutex_table[hash]); 783 hugetlb_vma_lock_read(dst_vma); 784 785 err = -ENOMEM; 786 dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); 787 if (!dst_pte) { 788 hugetlb_vma_unlock_read(dst_vma); 789 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 790 goto out_unlock; 791 } 792 793 if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { 794 const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); 795 796 if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { 797 err = -EEXIST; 798 hugetlb_vma_unlock_read(dst_vma); 799 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 800 goto out_unlock; 801 } 802 } 803 804 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, 805 src_addr, flags, &folio); 806 807 hugetlb_vma_unlock_read(dst_vma); 808 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 809 810 cond_resched(); 811 812 if (unlikely(err == -ENOENT)) { 813 up_read(&ctx->map_changing_lock); 814 uffd_mfill_unlock(dst_vma); 815 VM_WARN_ON_ONCE(!folio); 816 817 err = copy_folio_from_user(folio, 818 (const void __user *)src_addr, true); 819 if (unlikely(err)) { 820 err = -EFAULT; 821 goto out; 822 } 823 824 dst_vma = NULL; 825 goto retry; 826 } else 827 VM_WARN_ON_ONCE(folio); 828 829 if (!err) { 830 dst_addr += vma_hpagesize; 831 src_addr += vma_hpagesize; 832 copied += vma_hpagesize; 833 834 if (fatal_signal_pending(current)) 835 err = -EINTR; 836 } 837 if (err) 838 break; 839 } 840 841out_unlock: 842 up_read(&ctx->map_changing_lock); 843out_unlock_vma: 844 uffd_mfill_unlock(dst_vma); 845out: 846 if (folio) 847 folio_put(folio); 848 VM_WARN_ON_ONCE(copied < 0); 849 VM_WARN_ON_ONCE(err > 0); 850 VM_WARN_ON_ONCE(!copied && !err); 851 return copied ? copied : err; 852} 853#else /* !CONFIG_HUGETLB_PAGE */ 854/* fail at build time if gcc attempts to use this */ 855extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, 856 struct vm_area_struct *dst_vma, 857 unsigned long dst_start, 858 unsigned long src_start, 859 unsigned long len, 860 uffd_flags_t flags); 861#endif /* CONFIG_HUGETLB_PAGE */ 862 863static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) 864{ 865 uffd_flags_t flags = state->flags; 866 867 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) 868 return mfill_atomic_pte_continue(state); 869 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) 870 return mfill_atomic_pte_poison(state); 871 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) 872 return mfill_atomic_pte_copy(state); 873 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) 874 return mfill_atomic_pte_zeropage(state); 875 876 VM_WARN_ONCE(1, "Unknown UFFDIO operation, flags: %x", flags); 877 return -EOPNOTSUPP; 878} 879 880static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, 881 unsigned long dst_start, 882 unsigned long src_start, 883 unsigned long len, 884 uffd_flags_t flags) 885{ 886 struct mfill_state state = (struct mfill_state){ 887 .ctx = ctx, 888 .dst_start = dst_start, 889 .src_start = src_start, 890 .flags = flags, 891 .len = len, 892 .src_addr = src_start, 893 .dst_addr = dst_start, 894 }; 895 long copied = 0; 896 ssize_t err; 897 898 /* 899 * Sanitize the command parameters: 900 */ 901 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); 902 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 903 904 /* Does the address range wrap, or is the span zero-sized? 
*/ 905 VM_WARN_ON_ONCE(src_start + len <= src_start); 906 VM_WARN_ON_ONCE(dst_start + len <= dst_start); 907 908 err = mfill_get_vma(&state); 909 if (err) 910 goto out; 911 912 /* 913 * If this is a HUGETLB vma, pass off to appropriate routine 914 */ 915 if (is_vm_hugetlb_page(state.vma)) 916 return mfill_atomic_hugetlb(ctx, state.vma, dst_start, 917 src_start, len, flags); 918 919 while (state.src_addr < src_start + len) { 920 VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len); 921 922 err = mfill_establish_pmd(&state); 923 if (err) 924 break; 925 926 /* 927 * For shmem mappings, khugepaged is allowed to remove page 928 * tables under us; pte_offset_map_lock() will deal with that. 929 */ 930 931 err = mfill_atomic_pte(&state); 932 cond_resched(); 933 934 if (!err) { 935 state.dst_addr += PAGE_SIZE; 936 state.src_addr += PAGE_SIZE; 937 copied += PAGE_SIZE; 938 939 if (fatal_signal_pending(current)) 940 err = -EINTR; 941 } 942 if (err) 943 break; 944 } 945 946 mfill_put_vma(&state); 947out: 948 VM_WARN_ON_ONCE(copied < 0); 949 VM_WARN_ON_ONCE(err > 0); 950 VM_WARN_ON_ONCE(!copied && !err); 951 return copied ? copied : err; 952} 953 954ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, 955 unsigned long src_start, unsigned long len, 956 uffd_flags_t flags) 957{ 958 return mfill_atomic(ctx, dst_start, src_start, len, 959 uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); 960} 961 962ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, 963 unsigned long start, 964 unsigned long len) 965{ 966 return mfill_atomic(ctx, start, 0, len, 967 uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); 968} 969 970ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, 971 unsigned long len, uffd_flags_t flags) 972{ 973 974 /* 975 * A caller might reasonably assume that UFFDIO_CONTINUE contains an 976 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by 977 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to 978 * subsequent loads from the page through the newly mapped address range. 979 */ 980 smp_wmb(); 981 982 return mfill_atomic(ctx, start, 0, len, 983 uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); 984} 985 986ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, 987 unsigned long len, uffd_flags_t flags) 988{ 989 return mfill_atomic(ctx, start, 0, len, 990 uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); 991} 992 993long uffd_wp_range(struct vm_area_struct *dst_vma, 994 unsigned long start, unsigned long len, bool enable_wp) 995{ 996 unsigned int mm_cp_flags; 997 struct mmu_gather tlb; 998 long ret; 999 1000 VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, 1001 "The address range exceeds VMA boundary.\n"); 1002 if (enable_wp) 1003 mm_cp_flags = MM_CP_UFFD_WP; 1004 else 1005 mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; 1006 1007 /* 1008 * vma->vm_page_prot already reflects that uffd-wp is enabled for this 1009 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed 1010 * to be write-protected as default whenever protection changes. 1011 * Try upgrading write permissions manually. 
1012 */ 1013 if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) 1014 mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; 1015 tlb_gather_mmu(&tlb, dst_vma->vm_mm); 1016 ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); 1017 tlb_finish_mmu(&tlb); 1018 1019 return ret; 1020} 1021 1022int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, 1023 unsigned long len, bool enable_wp) 1024{ 1025 struct mm_struct *dst_mm = ctx->mm; 1026 unsigned long end = start + len; 1027 unsigned long _start, _end; 1028 struct vm_area_struct *dst_vma; 1029 unsigned long page_mask; 1030 long err; 1031 VMA_ITERATOR(vmi, dst_mm, start); 1032 1033 /* 1034 * Sanitize the command parameters: 1035 */ 1036 VM_WARN_ON_ONCE(start & ~PAGE_MASK); 1037 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 1038 1039 /* Does the address range wrap, or is the span zero-sized? */ 1040 VM_WARN_ON_ONCE(start + len <= start); 1041 1042 mmap_read_lock(dst_mm); 1043 1044 /* 1045 * If memory mappings are changing because of non-cooperative 1046 * operation (e.g. mremap) running in parallel, bail out and 1047 * request the user to retry later 1048 */ 1049 down_read(&ctx->map_changing_lock); 1050 err = -EAGAIN; 1051 if (atomic_read(&ctx->mmap_changing)) 1052 goto out_unlock; 1053 1054 err = -ENOENT; 1055 for_each_vma_range(vmi, dst_vma, end) { 1056 1057 if (!userfaultfd_wp(dst_vma)) { 1058 err = -ENOENT; 1059 break; 1060 } 1061 1062 if (is_vm_hugetlb_page(dst_vma)) { 1063 err = -EINVAL; 1064 page_mask = vma_kernel_pagesize(dst_vma) - 1; 1065 if ((start & page_mask) || (len & page_mask)) 1066 break; 1067 } 1068 1069 _start = max(dst_vma->vm_start, start); 1070 _end = min(dst_vma->vm_end, end); 1071 1072 err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); 1073 1074 /* Return 0 on success, <0 on failures */ 1075 if (err < 0) 1076 break; 1077 err = 0; 1078 } 1079out_unlock: 1080 up_read(&ctx->map_changing_lock); 1081 mmap_read_unlock(dst_mm); 1082 return err; 1083} 1084 1085 1086void double_pt_lock(spinlock_t *ptl1, 1087 spinlock_t *ptl2) 1088 __acquires(ptl1) 1089 __acquires(ptl2) 1090{ 1091 if (ptl1 > ptl2) 1092 swap(ptl1, ptl2); 1093 /* lock in virtual address order to avoid lock inversion */ 1094 spin_lock(ptl1); 1095 if (ptl1 != ptl2) 1096 spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); 1097 else 1098 __acquire(ptl2); 1099} 1100 1101void double_pt_unlock(spinlock_t *ptl1, 1102 spinlock_t *ptl2) 1103 __releases(ptl1) 1104 __releases(ptl2) 1105{ 1106 spin_unlock(ptl1); 1107 if (ptl1 != ptl2) 1108 spin_unlock(ptl2); 1109 else 1110 __release(ptl2); 1111} 1112 1113static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, 1114 pte_t orig_dst_pte, pte_t orig_src_pte, 1115 pmd_t *dst_pmd, pmd_t dst_pmdval) 1116{ 1117 return pte_same(ptep_get(src_pte), orig_src_pte) && 1118 pte_same(ptep_get(dst_pte), orig_dst_pte) && 1119 pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); 1120} 1121 1122/* 1123 * Checks if the two ptes and the corresponding folio are eligible for batched 1124 * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. 1125 * 1126 * NOTE: folio's reference is not required as the whole operation is within 1127 * PTL's critical section. 
1128 */ 1129static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, 1130 unsigned long src_addr, 1131 pte_t *src_pte, pte_t *dst_pte) 1132{ 1133 pte_t orig_dst_pte, orig_src_pte; 1134 struct folio *folio; 1135 1136 orig_dst_pte = ptep_get(dst_pte); 1137 if (!pte_none(orig_dst_pte)) 1138 return NULL; 1139 1140 orig_src_pte = ptep_get(src_pte); 1141 if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) 1142 return NULL; 1143 1144 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1145 if (!folio || !folio_trylock(folio)) 1146 return NULL; 1147 if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { 1148 folio_unlock(folio); 1149 return NULL; 1150 } 1151 return folio; 1152} 1153 1154/* 1155 * Moves src folios to dst in a batch as long as they are not large, and can 1156 * successfully take the lock via folio_trylock(). 1157 */ 1158static long move_present_ptes(struct mm_struct *mm, 1159 struct vm_area_struct *dst_vma, 1160 struct vm_area_struct *src_vma, 1161 unsigned long dst_addr, unsigned long src_addr, 1162 pte_t *dst_pte, pte_t *src_pte, 1163 pte_t orig_dst_pte, pte_t orig_src_pte, 1164 pmd_t *dst_pmd, pmd_t dst_pmdval, 1165 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1166 struct folio **first_src_folio, unsigned long len) 1167{ 1168 int err = 0; 1169 struct folio *src_folio = *first_src_folio; 1170 unsigned long src_start = src_addr; 1171 unsigned long src_end; 1172 1173 len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; 1174 src_end = pmd_addr_end(src_addr, src_addr + len); 1175 flush_cache_range(src_vma, src_addr, src_end); 1176 double_pt_lock(dst_ptl, src_ptl); 1177 1178 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1179 dst_pmd, dst_pmdval)) { 1180 err = -EAGAIN; 1181 goto out; 1182 } 1183 if (folio_test_large(src_folio) || 1184 folio_maybe_dma_pinned(src_folio) || 1185 !PageAnonExclusive(&src_folio->page)) { 1186 err = -EBUSY; 1187 goto out; 1188 } 1189 /* It's safe to drop the reference now as the page-table is holding one. */ 1190 folio_put(*first_src_folio); 1191 *first_src_folio = NULL; 1192 lazy_mmu_mode_enable(); 1193 1194 while (true) { 1195 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1196 /* Folio got pinned from under us. Put it back and fail the move. */ 1197 if (folio_maybe_dma_pinned(src_folio)) { 1198 set_pte_at(mm, src_addr, src_pte, orig_src_pte); 1199 err = -EBUSY; 1200 break; 1201 } 1202 1203 folio_move_anon_rmap(src_folio, dst_vma); 1204 src_folio->index = linear_page_index(dst_vma, dst_addr); 1205 1206 orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); 1207 /* Set soft dirty bit so userspace can notice the pte was moved */ 1208 if (pgtable_supports_soft_dirty()) 1209 orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); 1210 if (pte_dirty(orig_src_pte)) 1211 orig_dst_pte = pte_mkdirty(orig_dst_pte); 1212 orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); 1213 set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 1214 1215 src_addr += PAGE_SIZE; 1216 if (src_addr == src_end) 1217 break; 1218 dst_addr += PAGE_SIZE; 1219 dst_pte++; 1220 src_pte++; 1221 1222 folio_unlock(src_folio); 1223 src_folio = check_ptes_for_batched_move(src_vma, src_addr, 1224 src_pte, dst_pte); 1225 if (!src_folio) 1226 break; 1227 } 1228 1229 lazy_mmu_mode_disable(); 1230 if (src_addr > src_start) 1231 flush_tlb_range(src_vma, src_start, src_addr); 1232 1233 if (src_folio) 1234 folio_unlock(src_folio); 1235out: 1236 double_pt_unlock(dst_ptl, src_ptl); 1237 return src_addr > src_start ? 
src_addr - src_start : err; 1238} 1239 1240static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, 1241 unsigned long dst_addr, unsigned long src_addr, 1242 pte_t *dst_pte, pte_t *src_pte, 1243 pte_t orig_dst_pte, pte_t orig_src_pte, 1244 pmd_t *dst_pmd, pmd_t dst_pmdval, 1245 spinlock_t *dst_ptl, spinlock_t *src_ptl, 1246 struct folio *src_folio, 1247 struct swap_info_struct *si, swp_entry_t entry) 1248{ 1249 /* 1250 * Check if the folio still belongs to the target swap entry after 1251 * acquiring the lock. Folio can be freed in the swap cache while 1252 * not locked. 1253 */ 1254 if (src_folio && unlikely(!folio_test_swapcache(src_folio) || 1255 entry.val != src_folio->swap.val)) 1256 return -EAGAIN; 1257 1258 double_pt_lock(dst_ptl, src_ptl); 1259 1260 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1261 dst_pmd, dst_pmdval)) { 1262 double_pt_unlock(dst_ptl, src_ptl); 1263 return -EAGAIN; 1264 } 1265 1266 /* 1267 * The src_folio resides in the swapcache, requiring an update to its 1268 * index and mapping to align with the dst_vma, where a swap-in may 1269 * occur and hit the swapcache after moving the PTE. 1270 */ 1271 if (src_folio) { 1272 folio_move_anon_rmap(src_folio, dst_vma); 1273 src_folio->index = linear_page_index(dst_vma, dst_addr); 1274 } else { 1275 /* 1276 * Check if the swap entry is cached after acquiring the src_pte 1277 * lock. Otherwise, we might miss a newly loaded swap cache folio. 1278 * 1279 * We are trying to catch newly added swap cache, the only possible case is 1280 * when a folio is swapped in and out again staying in swap cache, using the 1281 * same entry before the PTE check above. The PTL is acquired and released 1282 * twice, each time after updating the swap table. So holding 1283 * the PTL here ensures we see the updated value. 1284 */ 1285 if (swap_cache_has_folio(entry)) { 1286 double_pt_unlock(dst_ptl, src_ptl); 1287 return -EAGAIN; 1288 } 1289 } 1290 1291 orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 1292 if (pgtable_supports_soft_dirty()) 1293 orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); 1294 set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 1295 double_pt_unlock(dst_ptl, src_ptl); 1296 1297 return PAGE_SIZE; 1298} 1299 1300static int move_zeropage_pte(struct mm_struct *mm, 1301 struct vm_area_struct *dst_vma, 1302 struct vm_area_struct *src_vma, 1303 unsigned long dst_addr, unsigned long src_addr, 1304 pte_t *dst_pte, pte_t *src_pte, 1305 pte_t orig_dst_pte, pte_t orig_src_pte, 1306 pmd_t *dst_pmd, pmd_t dst_pmdval, 1307 spinlock_t *dst_ptl, spinlock_t *src_ptl) 1308{ 1309 pte_t zero_pte; 1310 1311 double_pt_lock(dst_ptl, src_ptl); 1312 if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, 1313 dst_pmd, dst_pmdval)) { 1314 double_pt_unlock(dst_ptl, src_ptl); 1315 return -EAGAIN; 1316 } 1317 1318 zero_pte = pte_mkspecial(pfn_pte(zero_pfn(dst_addr), 1319 dst_vma->vm_page_prot)); 1320 ptep_clear_flush(src_vma, src_addr, src_pte); 1321 set_pte_at(mm, dst_addr, dst_pte, zero_pte); 1322 double_pt_unlock(dst_ptl, src_ptl); 1323 1324 return PAGE_SIZE; 1325} 1326 1327 1328/* 1329 * The mmap_lock for reading is held by the caller. Just move the page(s) 1330 * from src_pmd to dst_pmd if possible, and return number of bytes moved. 1331 * On failure, an error code is returned. 
1332 */ 1333static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, 1334 struct vm_area_struct *dst_vma, 1335 struct vm_area_struct *src_vma, 1336 unsigned long dst_addr, unsigned long src_addr, 1337 unsigned long len, __u64 mode) 1338{ 1339 struct swap_info_struct *si = NULL; 1340 pte_t orig_src_pte, orig_dst_pte; 1341 pte_t src_folio_pte; 1342 spinlock_t *src_ptl, *dst_ptl; 1343 pte_t *src_pte = NULL; 1344 pte_t *dst_pte = NULL; 1345 pmd_t dummy_pmdval; 1346 pmd_t dst_pmdval; 1347 struct folio *src_folio = NULL; 1348 struct mmu_notifier_range range; 1349 long ret = 0; 1350 1351 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 1352 src_addr, src_addr + len); 1353 mmu_notifier_invalidate_range_start(&range); 1354retry: 1355 /* 1356 * Use the maywrite version to indicate that dst_pte will be modified, 1357 * since dst_pte needs to be none, the subsequent pte_same() check 1358 * cannot prevent the dst_pte page from being freed concurrently, so we 1359 * also need to obtain dst_pmdval and recheck pmd_same() later. 1360 */ 1361 dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, 1362 &dst_ptl); 1363 1364 /* Retry if a huge pmd materialized from under us */ 1365 if (unlikely(!dst_pte)) { 1366 ret = -EAGAIN; 1367 goto out; 1368 } 1369 1370 /* 1371 * Unlike dst_pte, the subsequent pte_same() check can ensure the 1372 * stability of the src_pte page, so there is no need to get pmdval, 1373 * just pass a dummy variable to it. 1374 */ 1375 src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, 1376 &src_ptl); 1377 1378 /* 1379 * We held the mmap_lock for reading so MADV_DONTNEED 1380 * can zap transparent huge pages under us, or the 1381 * transparent huge page fault can establish new 1382 * transparent huge pages under us. 1383 */ 1384 if (unlikely(!src_pte)) { 1385 ret = -EAGAIN; 1386 goto out; 1387 } 1388 1389 /* Sanity checks before the operation */ 1390 if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || 1391 pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { 1392 ret = -EINVAL; 1393 goto out; 1394 } 1395 1396 spin_lock(dst_ptl); 1397 orig_dst_pte = ptep_get(dst_pte); 1398 spin_unlock(dst_ptl); 1399 if (!pte_none(orig_dst_pte)) { 1400 ret = -EEXIST; 1401 goto out; 1402 } 1403 1404 spin_lock(src_ptl); 1405 orig_src_pte = ptep_get(src_pte); 1406 spin_unlock(src_ptl); 1407 if (pte_none(orig_src_pte)) { 1408 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) 1409 ret = -ENOENT; 1410 else /* nothing to do to move a hole */ 1411 ret = PAGE_SIZE; 1412 goto out; 1413 } 1414 1415 /* If PTE changed after we locked the folio then start over */ 1416 if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { 1417 ret = -EAGAIN; 1418 goto out; 1419 } 1420 1421 if (pte_present(orig_src_pte)) { 1422 if (is_zero_pfn(pte_pfn(orig_src_pte))) { 1423 ret = move_zeropage_pte(mm, dst_vma, src_vma, 1424 dst_addr, src_addr, dst_pte, src_pte, 1425 orig_dst_pte, orig_src_pte, 1426 dst_pmd, dst_pmdval, dst_ptl, src_ptl); 1427 goto out; 1428 } 1429 1430 /* 1431 * Pin and lock source folio. Since we are in RCU read section, 1432 * we can't block, so on contention have to unmap the ptes, 1433 * obtain the lock and retry. 
1434 */ 1435 if (!src_folio) { 1436 struct folio *folio; 1437 bool locked; 1438 1439 /* 1440 * Pin the page while holding the lock to be sure the 1441 * page isn't freed under us 1442 */ 1443 spin_lock(src_ptl); 1444 if (!pte_same(orig_src_pte, ptep_get(src_pte))) { 1445 spin_unlock(src_ptl); 1446 ret = -EAGAIN; 1447 goto out; 1448 } 1449 1450 folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1451 if (!folio || !PageAnonExclusive(&folio->page)) { 1452 spin_unlock(src_ptl); 1453 ret = -EBUSY; 1454 goto out; 1455 } 1456 1457 locked = folio_trylock(folio); 1458 /* 1459 * We avoid waiting for folio lock with a raised 1460 * refcount for large folios because extra refcounts 1461 * will result in split_folio() failing later and 1462 * retrying. If multiple tasks are trying to move a 1463 * large folio we can end up livelocking. 1464 */ 1465 if (!locked && folio_test_large(folio)) { 1466 spin_unlock(src_ptl); 1467 ret = -EAGAIN; 1468 goto out; 1469 } 1470 1471 folio_get(folio); 1472 src_folio = folio; 1473 src_folio_pte = orig_src_pte; 1474 spin_unlock(src_ptl); 1475 1476 if (!locked) { 1477 pte_unmap(src_pte); 1478 pte_unmap(dst_pte); 1479 src_pte = dst_pte = NULL; 1480 /* now we can block and wait */ 1481 folio_lock(src_folio); 1482 goto retry; 1483 } 1484 1485 if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1486 ret = -EBUSY; 1487 goto out; 1488 } 1489 } 1490 1491 /* at this point we have src_folio locked */ 1492 if (folio_test_large(src_folio)) { 1493 /* split_folio() can block */ 1494 pte_unmap(src_pte); 1495 pte_unmap(dst_pte); 1496 src_pte = dst_pte = NULL; 1497 ret = split_folio(src_folio); 1498 if (ret) 1499 goto out; 1500 /* have to reacquire the folio after it got split */ 1501 folio_unlock(src_folio); 1502 folio_put(src_folio); 1503 src_folio = NULL; 1504 goto retry; 1505 } 1506 1507 ret = move_present_ptes(mm, dst_vma, src_vma, 1508 dst_addr, src_addr, dst_pte, src_pte, 1509 orig_dst_pte, orig_src_pte, dst_pmd, 1510 dst_pmdval, dst_ptl, src_ptl, &src_folio, 1511 len); 1512 } else { /* !pte_present() */ 1513 struct folio *folio = NULL; 1514 const softleaf_t entry = softleaf_from_pte(orig_src_pte); 1515 1516 if (softleaf_is_migration(entry)) { 1517 pte_unmap(src_pte); 1518 pte_unmap(dst_pte); 1519 src_pte = dst_pte = NULL; 1520 migration_entry_wait(mm, src_pmd, src_addr); 1521 1522 ret = -EAGAIN; 1523 goto out; 1524 } else if (!softleaf_is_swap(entry)) { 1525 ret = -EFAULT; 1526 goto out; 1527 } 1528 1529 if (!pte_swp_exclusive(orig_src_pte)) { 1530 ret = -EBUSY; 1531 goto out; 1532 } 1533 1534 si = get_swap_device(entry); 1535 if (unlikely(!si)) { 1536 ret = -EAGAIN; 1537 goto out; 1538 } 1539 /* 1540 * Verify the existence of the swapcache. If present, the folio's 1541 * index and mapping must be updated even when the PTE is a swap 1542 * entry. The anon_vma lock is not taken during this process since 1543 * the folio has already been unmapped, and the swap entry is 1544 * exclusive, preventing rmap walks. 1545 * 1546 * For large folios, return -EBUSY immediately, as split_folio() 1547 * also returns -EBUSY when attempting to split unmapped large 1548 * folios in the swapcache. This issue needs to be resolved 1549 * separately to allow proper handling. 
1550 */ 1551 if (!src_folio) 1552 folio = swap_cache_get_folio(entry); 1553 if (folio) { 1554 if (folio_test_large(folio)) { 1555 ret = -EBUSY; 1556 folio_put(folio); 1557 goto out; 1558 } 1559 src_folio = folio; 1560 src_folio_pte = orig_src_pte; 1561 if (!folio_trylock(src_folio)) { 1562 pte_unmap(src_pte); 1563 pte_unmap(dst_pte); 1564 src_pte = dst_pte = NULL; 1565 put_swap_device(si); 1566 si = NULL; 1567 /* now we can block and wait */ 1568 folio_lock(src_folio); 1569 goto retry; 1570 } 1571 } 1572 ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, 1573 orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, 1574 dst_ptl, src_ptl, src_folio, si, entry); 1575 } 1576 1577out: 1578 if (src_folio) { 1579 folio_unlock(src_folio); 1580 folio_put(src_folio); 1581 } 1582 /* 1583 * Unmap in reverse order (LIFO) to maintain proper kmap_local 1584 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte 1585 * first, then src_pte, so we must unmap src_pte first, then dst_pte. 1586 */ 1587 if (src_pte) 1588 pte_unmap(src_pte); 1589 if (dst_pte) 1590 pte_unmap(dst_pte); 1591 mmu_notifier_invalidate_range_end(&range); 1592 if (si) 1593 put_swap_device(si); 1594 1595 return ret; 1596} 1597 1598#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1599static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1600 unsigned long src_addr, 1601 unsigned long src_end) 1602{ 1603 return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || 1604 src_end - src_addr < HPAGE_PMD_SIZE; 1605} 1606#else 1607static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1608 unsigned long src_addr, 1609 unsigned long src_end) 1610{ 1611 /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ 1612 return false; 1613} 1614#endif 1615 1616static inline bool vma_move_compatible(struct vm_area_struct *vma) 1617{ 1618 return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | 1619 VM_MIXEDMAP | VM_SHADOW_STACK)); 1620} 1621 1622static int validate_move_areas(struct userfaultfd_ctx *ctx, 1623 struct vm_area_struct *src_vma, 1624 struct vm_area_struct *dst_vma) 1625{ 1626 /* Only allow moving if both have the same access and protection */ 1627 if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || 1628 pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) 1629 return -EINVAL; 1630 1631 /* Only allow moving if both are mlocked or both aren't */ 1632 if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) 1633 return -EINVAL; 1634 1635 /* 1636 * For now, we keep it simple and only move between writable VMAs. 1637 * Access flags are equal, therefore checking only the source is enough. 
1638 */ 1639 if (!(src_vma->vm_flags & VM_WRITE)) 1640 return -EINVAL; 1641 1642 /* Check if vma flags indicate content which can be moved */ 1643 if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1644 return -EINVAL; 1645 1646 /* Ensure dst_vma is registered in uffd we are operating on */ 1647 if (!dst_vma->vm_userfaultfd_ctx.ctx || 1648 dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1649 return -EINVAL; 1650 1651 /* Only allow moving across anonymous vmas */ 1652 if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1653 return -EINVAL; 1654 1655 return 0; 1656} 1657 1658static __always_inline 1659int find_vmas_mm_locked(struct mm_struct *mm, 1660 unsigned long dst_start, 1661 unsigned long src_start, 1662 struct vm_area_struct **dst_vmap, 1663 struct vm_area_struct **src_vmap) 1664{ 1665 struct vm_area_struct *vma; 1666 1667 mmap_assert_locked(mm); 1668 vma = find_vma_and_prepare_anon(mm, dst_start); 1669 if (IS_ERR(vma)) 1670 return PTR_ERR(vma); 1671 1672 *dst_vmap = vma; 1673 /* Skip finding src_vma if src_start is in dst_vma */ 1674 if (src_start >= vma->vm_start && src_start < vma->vm_end) 1675 goto out_success; 1676 1677 vma = vma_lookup(mm, src_start); 1678 if (!vma) 1679 return -ENOENT; 1680out_success: 1681 *src_vmap = vma; 1682 return 0; 1683} 1684 1685#ifdef CONFIG_PER_VMA_LOCK 1686static int uffd_move_lock(struct mm_struct *mm, 1687 unsigned long dst_start, 1688 unsigned long src_start, 1689 struct vm_area_struct **dst_vmap, 1690 struct vm_area_struct **src_vmap) 1691{ 1692 struct vm_area_struct *vma; 1693 int err; 1694 1695 vma = uffd_lock_vma(mm, dst_start); 1696 if (IS_ERR(vma)) 1697 return PTR_ERR(vma); 1698 1699 *dst_vmap = vma; 1700 /* 1701 * Skip finding src_vma if src_start is in dst_vma. This also ensures 1702 * that we don't lock the same vma twice. 1703 */ 1704 if (src_start >= vma->vm_start && src_start < vma->vm_end) { 1705 *src_vmap = vma; 1706 return 0; 1707 } 1708 1709 /* 1710 * Using uffd_lock_vma() to get src_vma can lead to following deadlock: 1711 * 1712 * Thread1 Thread2 1713 * ------- ------- 1714 * vma_start_read(dst_vma) 1715 * mmap_write_lock(mm) 1716 * vma_start_write(src_vma) 1717 * vma_start_read(src_vma) 1718 * mmap_read_lock(mm) 1719 * vma_start_write(dst_vma) 1720 */ 1721 *src_vmap = lock_vma_under_rcu(mm, src_start); 1722 if (likely(*src_vmap)) 1723 return 0; 1724 1725 /* Undo any locking and retry in mmap_lock critical section */ 1726 vma_end_read(*dst_vmap); 1727 1728 mmap_read_lock(mm); 1729 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1730 if (err) 1731 goto out; 1732 1733 if (!vma_start_read_locked(*dst_vmap)) { 1734 err = -EAGAIN; 1735 goto out; 1736 } 1737 1738 /* Nothing further to do if both vmas are locked. 
*/ 1739 if (*dst_vmap == *src_vmap) 1740 goto out; 1741 1742 if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { 1743 /* Undo dst_vmap locking if src_vmap failed to lock */ 1744 vma_end_read(*dst_vmap); 1745 err = -EAGAIN; 1746 } 1747out: 1748 mmap_read_unlock(mm); 1749 return err; 1750} 1751 1752static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1753 struct vm_area_struct *src_vma) 1754{ 1755 vma_end_read(src_vma); 1756 if (src_vma != dst_vma) 1757 vma_end_read(dst_vma); 1758} 1759 1760#else 1761 1762static int uffd_move_lock(struct mm_struct *mm, 1763 unsigned long dst_start, 1764 unsigned long src_start, 1765 struct vm_area_struct **dst_vmap, 1766 struct vm_area_struct **src_vmap) 1767{ 1768 int err; 1769 1770 mmap_read_lock(mm); 1771 err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); 1772 if (err) 1773 mmap_read_unlock(mm); 1774 return err; 1775} 1776 1777static void uffd_move_unlock(struct vm_area_struct *dst_vma, 1778 struct vm_area_struct *src_vma) 1779{ 1780 mmap_assert_locked(src_vma->vm_mm); 1781 mmap_read_unlock(dst_vma->vm_mm); 1782} 1783#endif 1784 1785/** 1786 * move_pages - move arbitrary anonymous pages of an existing vma 1787 * @ctx: pointer to the userfaultfd context 1788 * @dst_start: start of the destination virtual memory range 1789 * @src_start: start of the source virtual memory range 1790 * @len: length of the virtual memory range 1791 * @mode: flags from uffdio_move.mode 1792 * 1793 * It will either use the mmap_lock in read mode or per-vma locks 1794 * 1795 * move_pages() remaps arbitrary anonymous pages atomically in zero 1796 * copy. It only works on non shared anonymous pages because those can 1797 * be relocated without generating non linear anon_vmas in the rmap 1798 * code. 1799 * 1800 * It provides a zero copy mechanism to handle userspace page faults. 1801 * The source vma pages should have mapcount == 1, which can be 1802 * enforced by using madvise(MADV_DONTFORK) on src vma. 1803 * 1804 * The thread receiving the page during the userland page fault 1805 * will receive the faulting page in the source vma through the network, 1806 * storage or any other I/O device (MADV_DONTFORK in the source vma 1807 * avoids move_pages() to fail with -EBUSY if the process forks before 1808 * move_pages() is called), then it will call move_pages() to map the 1809 * page in the faulting address in the destination vma. 1810 * 1811 * This userfaultfd command works purely via pagetables, so it's the 1812 * most efficient way to move physical non shared anonymous pages 1813 * across different virtual addresses. Unlike mremap()/mmap()/munmap() 1814 * it does not create any new vmas. The mapping in the destination 1815 * address is atomic. 1816 * 1817 * It only works if the vma protection bits are identical from the 1818 * source and destination vma. 1819 * 1820 * It can remap non shared anonymous pages within the same vma too. 1821 * 1822 * If the source virtual memory range has any unmapped holes, or if 1823 * the destination virtual memory range is not a whole unmapped hole, 1824 * move_pages() will fail respectively with -ENOENT or -EEXIST. This 1825 * provides a very strict behavior to avoid any chance of memory 1826 * corruption going unnoticed if there are userland race conditions. 1827 * Only one thread should resolve the userland page fault at any given 1828 * time for any given faulting address. 
This means that if two threads 1829 * try to both call move_pages() on the same destination address at the 1830 * same time, the second thread will get an explicit error from this 1831 * command. 1832 * 1833 * The command retval will return "len" is successful. The command 1834 * however can be interrupted by fatal signals or errors. If 1835 * interrupted it will return the number of bytes successfully 1836 * remapped before the interruption if any, or the negative error if 1837 * none. It will never return zero. Either it will return an error or 1838 * an amount of bytes successfully moved. If the retval reports a 1839 * "short" remap, the move_pages() command should be repeated by 1840 * userland with src+retval, dst+reval, len-retval if it wants to know 1841 * about the error that interrupted it. 1842 * 1843 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to 1844 * prevent -ENOENT errors to materialize if there are holes in the 1845 * source virtual range that is being remapped. The holes will be 1846 * accounted as successfully remapped in the retval of the 1847 * command. This is mostly useful to remap hugepage naturally aligned 1848 * virtual regions without knowing if there are transparent hugepage 1849 * in the regions or not, but preventing the risk of having to split 1850 * the hugepmd during the remap. 1851 */ 1852ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, 1853 unsigned long src_start, unsigned long len, __u64 mode) 1854{ 1855 struct mm_struct *mm = ctx->mm; 1856 struct vm_area_struct *src_vma, *dst_vma; 1857 unsigned long src_addr, dst_addr, src_end; 1858 pmd_t *src_pmd, *dst_pmd; 1859 long err = -EINVAL; 1860 ssize_t moved = 0; 1861 1862 /* Sanitize the command parameters. */ 1863 VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); 1864 VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); 1865 VM_WARN_ON_ONCE(len & ~PAGE_MASK); 1866 1867 /* Does the address range wrap, or is the span zero-sized? */ 1868 VM_WARN_ON_ONCE(src_start + len < src_start); 1869 VM_WARN_ON_ONCE(dst_start + len < dst_start); 1870 1871 err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); 1872 if (err) 1873 goto out; 1874 1875 /* Re-check after taking map_changing_lock */ 1876 err = -EAGAIN; 1877 down_read(&ctx->map_changing_lock); 1878 if (likely(atomic_read(&ctx->mmap_changing))) 1879 goto out_unlock; 1880 /* 1881 * Make sure the vma is not shared, that the src and dst remap 1882 * ranges are both valid and fully within a single existing 1883 * vma. 1884 */ 1885 err = -EINVAL; 1886 if (src_vma->vm_flags & VM_SHARED) 1887 goto out_unlock; 1888 if (src_start + len > src_vma->vm_end) 1889 goto out_unlock; 1890 1891 if (dst_vma->vm_flags & VM_SHARED) 1892 goto out_unlock; 1893 if (dst_start + len > dst_vma->vm_end) 1894 goto out_unlock; 1895 1896 err = validate_move_areas(ctx, src_vma, dst_vma); 1897 if (err) 1898 goto out_unlock; 1899 1900 for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; 1901 src_addr < src_end;) { 1902 spinlock_t *ptl; 1903 pmd_t dst_pmdval; 1904 unsigned long step_size; 1905 1906 /* 1907 * Below works because anonymous area would not have a 1908 * transparent huge PUD. If file-backed support is added, 1909 * that case would need to be handled here. 
1910 */
1911 src_pmd = mm_find_pmd(mm, src_addr);
1912 if (unlikely(!src_pmd)) {
1913 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1914 err = -ENOENT;
1915 break;
1916 }
1917 src_pmd = mm_alloc_pmd(mm, src_addr);
1918 if (unlikely(!src_pmd)) {
1919 err = -ENOMEM;
1920 break;
1921 }
1922 }
1923 dst_pmd = mm_alloc_pmd(mm, dst_addr);
1924 if (unlikely(!dst_pmd)) {
1925 err = -ENOMEM;
1926 break;
1927 }
1928
1929 dst_pmdval = pmdp_get_lockless(dst_pmd);
1930 /*
1931 * If the dst_pmd is mapped as THP don't override it and just
1932 * be strict. If dst_pmd changes into THP after this check,
1933 * move_pages_huge_pmd() will detect the change and retry
1934 * while move_pages_pte() will detect the change and fail.
1935 */
1936 if (unlikely(pmd_trans_huge(dst_pmdval))) {
1937 err = -EEXIST;
1938 break;
1939 }
1940
1941 ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1942 if (ptl) {
1943 /* Check if we can move the pmd without splitting it. */
1944 if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1945 !pmd_none(dst_pmdval)) {
1946 /* Can be a migration entry */
1947 if (pmd_present(*src_pmd)) {
1948 struct folio *folio = pmd_folio(*src_pmd);
1949
1950 if (!is_huge_zero_folio(folio) &&
1951 !PageAnonExclusive(&folio->page)) {
1952 spin_unlock(ptl);
1953 err = -EBUSY;
1954 break;
1955 }
1956 }
1957
1958 spin_unlock(ptl);
1959 split_huge_pmd(src_vma, src_pmd, src_addr);
1960 /* The folio will be split by move_pages_pte() */
1961 continue;
1962 }
1963
1964 err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1965 dst_pmdval, dst_vma, src_vma,
1966 dst_addr, src_addr);
1967 step_size = HPAGE_PMD_SIZE;
1968 } else {
1969 long ret;
1970
1971 if (pmd_none(*src_pmd)) {
1972 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1973 err = -ENOENT;
1974 break;
1975 }
1976 if (unlikely(__pte_alloc(mm, src_pmd))) {
1977 err = -ENOMEM;
1978 break;
1979 }
1980 }
1981
1982 if (unlikely(pte_alloc(mm, dst_pmd))) {
1983 err = -ENOMEM;
1984 break;
1985 }
1986
1987 ret = move_pages_ptes(mm, dst_pmd, src_pmd,
1988 dst_vma, src_vma, dst_addr,
1989 src_addr, src_end - src_addr, mode);
1990 if (ret < 0)
1991 err = ret;
1992 else
1993 step_size = ret;
1994 }
1995
1996 cond_resched();
1997
1998 if (fatal_signal_pending(current)) {
1999 /* Do not override an error */
2000 if (!err || err == -EAGAIN)
2001 err = -EINTR;
2002 break;
2003 }
2004
2005 if (err) {
2006 if (err == -EAGAIN)
2007 continue;
2008 break;
2009 }
2010
2011 /* Proceed to the next page */
2012 dst_addr += step_size;
2013 src_addr += step_size;
2014 moved += step_size;
2015 }
2016
2017out_unlock:
2018 up_read(&ctx->map_changing_lock);
2019 uffd_move_unlock(dst_vma, src_vma);
2020out:
2021 VM_WARN_ON_ONCE(moved < 0);
2022 VM_WARN_ON_ONCE(err > 0);
2023 VM_WARN_ON_ONCE(!moved && !err);
2024 return moved ? moved : err;
2025}
2026
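/*
 * Illustrative userspace sketch (not part of the kernel sources): a minimal
 * retry loop for the protocol documented above move_pages(), repeating a
 * "short" move with src+retval, dst+retval, len-retval. It uses the
 * UFFDIO_MOVE uAPI from <linux/userfaultfd.h>; the descriptor `uffd` and the
 * page-aligned `dst`, `src` and `len` values are hypothetical inputs.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static int uffd_move_range(int uffd, unsigned long dst,
 *				   unsigned long src, unsigned long len)
 *	{
 *		while (len) {
 *			struct uffdio_move move = {
 *				.dst = dst,
 *				.src = src,
 *				.len = len,
 *				.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
 *			};
 *
 *			if (ioctl(uffd, UFFDIO_MOVE, &move) == 0)
 *				return 0;	// whole range moved
 *			if (move.move <= 0)
 *				return -1;	// hard error, inspect errno
 *			// Short move: skip the bytes already moved and retry.
 *			dst += move.move;
 *			src += move.move;
 *			len -= move.move;
 *		}
 *		return 0;
 *	}
 */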
2027bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
2028 bool wp_async)
2029{
2030 const struct vm_uffd_ops *ops = vma_uffd_ops(vma);
2031
2032 if (vma->vm_flags & VM_DROPPABLE)
2033 return false;
2034
2035 vm_flags &= __VM_UFFD_FLAGS;
2036
2037 /*
2038 * If WP is the only mode enabled and the context is wp async, allow
2039 * any memory type.
2040 */
2041 if (wp_async && (vm_flags == VM_UFFD_WP))
2042 return true;
2043
2044 /* For any other mode reject VMAs that don't implement vm_uffd_ops */
2045 if (!ops)
2046 return false;
2047
2048 /*
2049 * If the user requested uffd-wp but pte markers are not enabled for
2050 * uffd-wp, then only anonymous memory is supported.
2051 */
2052 if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
2053 !vma_is_anonymous(vma))
2054 return false;
2055
2056 return ops->can_userfault(vma, vm_flags);
2057}
2058
2059static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
2060 vm_flags_t vm_flags)
2061{
2062 const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
2063
2064 vm_flags_reset(vma, vm_flags);
2065 /*
2066 * For shared mappings, we want to enable writenotify while
2067 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
2068 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
2069 */
2070 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
2071 vma_set_page_prot(vma);
2072}
2073
2074static void userfaultfd_set_ctx(struct vm_area_struct *vma,
2075 struct userfaultfd_ctx *ctx,
2076 vm_flags_t vm_flags)
2077{
2078 vma_start_write(vma);
2079 vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
2080 userfaultfd_set_vm_flags(vma,
2081 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
2082}
2083
2084void userfaultfd_reset_ctx(struct vm_area_struct *vma)
2085{
2086 userfaultfd_set_ctx(vma, NULL, 0);
2087}
2088
2089struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
2090 struct vm_area_struct *prev,
2091 struct vm_area_struct *vma,
2092 unsigned long start,
2093 unsigned long end)
2094{
2095 struct vm_area_struct *ret;
2096 bool give_up_on_oom = false;
2097 vma_flags_t new_vma_flags = vma->flags;
2098
2099 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS);
2100
2101 /*
2102 * If we are only modifying and not splitting, just give up on the
2103 * merge if OOM prevents us from merging successfully.
2104 */
2105 if (start == vma->vm_start && end == vma->vm_end)
2106 give_up_on_oom = true;
2107
2108 /* Reset ptes for the whole vma range if wr-protected */
2109 if (userfaultfd_wp(vma))
2110 uffd_wp_range(vma, start, end - start, false);
2111
2112 ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
2113 &new_vma_flags, NULL_VM_UFFD_CTX,
2114 give_up_on_oom);
2115
2116 /*
2117 * In the vma_merge() successful mprotect-like case 8:
2118 * the next vma was merged into the current one and
2119 * the current one has not been updated yet.
2120 */
2121 if (!IS_ERR(ret))
2122 userfaultfd_reset_ctx(ret);
2123
2124 return ret;
2125}
2126
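/*
 * Illustrative userspace sketch (not part of the kernel sources): the
 * wp_async case accepted by vma_can_userfault() above corresponds to
 * negotiating UFFD_FEATURE_WP_ASYNC through UFFDIO_API and then registering
 * a range with only UFFDIO_REGISTER_MODE_WP. The `addr`/`len` mapping is a
 * hypothetical placeholder and error handling is reduced to a single check.
 *
 *	#include <fcntl.h>
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int uffd_open_wp_async(void *addr, unsigned long len)
 *	{
 *		struct uffdio_api api = {
 *			.api = UFFD_API,
 *			.features = UFFD_FEATURE_WP_ASYNC,
 *		};
 *		struct uffdio_register reg = {
 *			.range = { .start = (unsigned long)addr, .len = len },
 *			.mode = UFFDIO_REGISTER_MODE_WP,
 *		};
 *		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
 *		    ioctl(uffd, UFFDIO_REGISTER, &reg)) {
 *			if (uffd >= 0)
 *				close(uffd);
 *			return -1;
 *		}
 *		// Write-protection itself is then applied with UFFDIO_WRITEPROTECT.
 *		return uffd;
 *	}
 */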
2127/* Assumes mmap write lock taken, and mm_struct pinned. */
2128int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
2129 struct vm_area_struct *vma,
2130 vm_flags_t vm_flags,
2131 unsigned long start, unsigned long end,
2132 bool wp_async)
2133{
2134 vma_flags_t vma_flags = legacy_to_vma_flags(vm_flags);
2135 VMA_ITERATOR(vmi, ctx->mm, start);
2136 struct vm_area_struct *prev = vma_prev(&vmi);
2137 unsigned long vma_end;
2138 vma_flags_t new_vma_flags;
2139
2140 if (vma->vm_start < start)
2141 prev = vma;
2142
2143 for_each_vma_range(vmi, vma, end) {
2144 cond_resched();
2145
2146 VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
2147 VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
2148 vma->vm_userfaultfd_ctx.ctx != ctx);
2149 VM_WARN_ON_ONCE(!vma_test(vma, VMA_MAYWRITE_BIT));
2150
2151 /*
2152 * Nothing to do: this vma is already registered into this
2153 * userfaultfd and with the right tracking mode too.
2154 */
2155 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
2156 vma_test_all_mask(vma, vma_flags))
2157 goto skip;
2158
2159 if (vma->vm_start > start)
2160 start = vma->vm_start;
2161 vma_end = min(end, vma->vm_end);
2162
2163 new_vma_flags = vma->flags;
2164 vma_flags_clear_mask(&new_vma_flags, __VMA_UFFD_FLAGS);
2165 vma_flags_set_mask(&new_vma_flags, vma_flags);
2166
2167 vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
2168 &new_vma_flags,
2169 (struct vm_userfaultfd_ctx){ctx},
2170 /* give_up_on_oom = */false);
2171 if (IS_ERR(vma))
2172 return PTR_ERR(vma);
2173
2174 /*
2175 * In the vma_merge() successful mprotect-like case 8:
2176 * the next vma was merged into the current one and
2177 * the current one has not been updated yet.
2178 */
2179 userfaultfd_set_ctx(vma, ctx, vm_flags);
2180
2181 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
2182 hugetlb_unshare_all_pmds(vma);
2183
2184skip:
2185 prev = vma;
2186 start = vma->vm_end;
2187 }
2188
2189 return 0;
2190}
2191
2192void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
2193{
2194 struct mm_struct *mm = ctx->mm;
2195 struct vm_area_struct *vma;
2196 VMA_ITERATOR(vmi, mm, 0);
2197
2198 /* the various vma->vm_userfaultfd_ctx still point to it */
2199 mmap_write_lock(mm);
2200 for_each_vma(vmi, vma) {
2201 if (vma->vm_userfaultfd_ctx.ctx == ctx)
2202 userfaultfd_reset_ctx(vma);
2203 }
2204 mmap_write_unlock(mm);
2205}
2206
2207void userfaultfd_release_all(struct mm_struct *mm,
2208 struct userfaultfd_ctx *ctx)
2209{
2210 struct vm_area_struct *vma, *prev;
2211 VMA_ITERATOR(vmi, mm, 0);
2212
2213 if (!mmget_not_zero(mm))
2214 return;
2215
2216 /*
2217 * Flush page faults out of all CPUs. NOTE: all page faults
2218 * must be retried without returning VM_FAULT_SIGBUS if
2219 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
2220 * changes while handle_userfault released the mmap_lock. So
2221 * it's critical that released is set to true (above), before
2222 * taking the mmap_lock for writing.
2223 */
2224 mmap_write_lock(mm);
2225 prev = NULL;
2226 for_each_vma(vmi, vma) {
2227 cond_resched();
2228 VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
2229 !!(vma->vm_flags & __VM_UFFD_FLAGS));
2230 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
2231 prev = vma;
2232 continue;
2233 }
2234
2235 vma = userfaultfd_clear_vma(&vmi, prev, vma,
2236 vma->vm_start, vma->vm_end);
2237 prev = vma;
2238 }
2239 mmap_write_unlock(mm);
2240 mmput(mm);
2241}
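/*
 * Illustrative userspace sketch (not part of the kernel sources): the
 * UFFDIO_REGISTER ioctl is what ultimately reaches
 * userfaultfd_register_range() above, while userfaultfd_release_all() is
 * invoked when the userfaultfd context is torn down (e.g. when the file is
 * released). The sketch assumes `uffd` has already completed the UFFDIO_API
 * handshake and that `area`/`len` describe a page-aligned anonymous mapping.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	static int uffd_register_missing(int uffd, void *area, unsigned long len)
 *	{
 *		struct uffdio_register reg = {
 *			.range = { .start = (unsigned long)area, .len = len },
 *			.mode = UFFDIO_REGISTER_MODE_MISSING,
 *		};
 *
 *		if (ioctl(uffd, UFFDIO_REGISTER, &reg))
 *			return -1;
 *		// reg.ioctls now advertises the operations valid for this
 *		// range (UFFDIO_COPY, UFFDIO_ZEROPAGE, ...).
 *		return 0;
 *	}
 */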