userfaultfd: UFFDIO_MOVE uABI · tjh.dev/kernel@adef440

+3

Documentation/admin-guide/mm/userfaultfd.rst

··· 113 113 areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating 114 114 support for shmem virtual memory areas. 115 115 116 + - ``UFFD_FEATURE_MOVE`` indicates that the kernel supports moving 117 + existing page contents from userspace. 118 + 116 119 The userland application should set the feature flags it intends to use 117 120 when invoking the ``UFFDIO_API`` ioctl, to request that those features be 118 121 enabled if supported.

+72

fs/userfaultfd.c

··· 2005 2005 return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; 2006 2006 } 2007 2007 2008 + static int userfaultfd_move(struct userfaultfd_ctx *ctx, 2009 + unsigned long arg) 2010 + { 2011 + __s64 ret; 2012 + struct uffdio_move uffdio_move; 2013 + struct uffdio_move __user *user_uffdio_move; 2014 + struct userfaultfd_wake_range range; 2015 + struct mm_struct *mm = ctx->mm; 2016 + 2017 + user_uffdio_move = (struct uffdio_move __user *) arg; 2018 + 2019 + if (atomic_read(&ctx->mmap_changing)) 2020 + return -EAGAIN; 2021 + 2022 + if (copy_from_user(&uffdio_move, user_uffdio_move, 2023 + /* don't copy "move" last field */ 2024 + sizeof(uffdio_move)-sizeof(__s64))) 2025 + return -EFAULT; 2026 + 2027 + /* Do not allow cross-mm moves. */ 2028 + if (mm != current->mm) 2029 + return -EINVAL; 2030 + 2031 + ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); 2032 + if (ret) 2033 + return ret; 2034 + 2035 + ret = validate_range(mm, uffdio_move.src, uffdio_move.len); 2036 + if (ret) 2037 + return ret; 2038 + 2039 + if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| 2040 + UFFDIO_MOVE_MODE_DONTWAKE)) 2041 + return -EINVAL; 2042 + 2043 + if (mmget_not_zero(mm)) { 2044 + mmap_read_lock(mm); 2045 + 2046 + /* Re-check after taking mmap_lock */ 2047 + if (likely(!atomic_read(&ctx->mmap_changing))) 2048 + ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, 2049 + uffdio_move.len, uffdio_move.mode); 2050 + else 2051 + ret = -EINVAL; 2052 + 2053 + mmap_read_unlock(mm); 2054 + mmput(mm); 2055 + } else { 2056 + return -ESRCH; 2057 + } 2058 + 2059 + if (unlikely(put_user(ret, &user_uffdio_move->move))) 2060 + return -EFAULT; 2061 + if (ret < 0) 2062 + goto out; 2063 + 2064 + /* len == 0 would wake all */ 2065 + VM_WARN_ON(!ret); 2066 + range.len = ret; 2067 + if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { 2068 + range.start = uffdio_move.dst; 2069 + wake_userfault(ctx, &range); 2070 + } 2071 + ret = range.len == uffdio_move.len ? 
0 : -EAGAIN; 2072 + 2073 + out: 2074 + return ret; 2075 + } 2076 + 2008 2077 /* 2009 2078 * userland asks for a certain API version and we return which bits 2010 2079 * and ioctl commands are implemented in this kernel for such API ··· 2165 2096 break; 2166 2097 case UFFDIO_ZEROPAGE: 2167 2098 ret = userfaultfd_zeropage(ctx, arg); 2099 + break; 2100 + case UFFDIO_MOVE: 2101 + ret = userfaultfd_move(ctx, arg); 2168 2102 break; 2169 2103 case UFFDIO_WRITEPROTECT: 2170 2104 ret = userfaultfd_writeprotect(ctx, arg);

+5

include/linux/rmap.h

··· 121 121 down_write(&anon_vma->root->rwsem); 122 122 } 123 123 124 + /* Try to take the root anon_vma's rwsem for writing without sleeping; returns the down_write_trylock() result (nonzero when the lock was taken). */ static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) 125 + { 126 + return down_write_trylock(&anon_vma->root->rwsem); 127 + } 128 + 124 129 static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) 125 130 { 126 131 up_write(&anon_vma->root->rwsem);

+11

include/linux/userfaultfd_k.h

··· 93 93 extern long uffd_wp_range(struct vm_area_struct *vma, 94 94 unsigned long start, unsigned long len, bool enable_wp); 95 95 96 + /* UFFDIO_MOVE support: helpers to lock/unlock a pair of page-table locks, the move_pages() entry point (its last argument carries the uffdio_move.mode bits) and the huge-pmd variant. */ 97 + void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); 98 + void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); 99 + ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, 100 + unsigned long dst_start, unsigned long src_start, 101 + unsigned long len, __u64 flags); 102 + int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 103 + struct vm_area_struct *dst_vma, 104 + struct vm_area_struct *src_vma, 105 + unsigned long dst_addr, unsigned long src_addr); 106 + 96 107 /* mm helpers */ 97 108 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, 98 109 struct vm_userfaultfd_ctx vm_ctx)

+28 -1

include/uapi/linux/userfaultfd.h

··· 41 41 UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ 42 42 UFFD_FEATURE_WP_UNPOPULATED | \ 43 43 UFFD_FEATURE_POISON | \ 44 - UFFD_FEATURE_WP_ASYNC) 44 + UFFD_FEATURE_WP_ASYNC | \ 45 + UFFD_FEATURE_MOVE) 45 46 #define UFFD_API_IOCTLS \ 46 47 ((__u64)1 << _UFFDIO_REGISTER | \ 47 48 (__u64)1 << _UFFDIO_UNREGISTER | \ ··· 51 50 ((__u64)1 << _UFFDIO_WAKE | \ 52 51 (__u64)1 << _UFFDIO_COPY | \ 53 52 (__u64)1 << _UFFDIO_ZEROPAGE | \ 53 + (__u64)1 << _UFFDIO_MOVE | \ 54 54 (__u64)1 << _UFFDIO_WRITEPROTECT | \ 55 55 (__u64)1 << _UFFDIO_CONTINUE | \ 56 56 (__u64)1 << _UFFDIO_POISON) ··· 75 73 #define _UFFDIO_WAKE (0x02) 76 74 #define _UFFDIO_COPY (0x03) 77 75 #define _UFFDIO_ZEROPAGE (0x04) 76 + #define _UFFDIO_MOVE (0x05) 78 77 #define _UFFDIO_WRITEPROTECT (0x06) 79 78 #define _UFFDIO_CONTINUE (0x07) 80 79 #define _UFFDIO_POISON (0x08) ··· 95 92 struct uffdio_copy) 96 93 #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ 97 94 struct uffdio_zeropage) 95 + #define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ 96 + struct uffdio_move) 98 97 #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ 99 98 struct uffdio_writeprotect) 100 99 #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ ··· 227 222 * asynchronous mode is supported in which the write fault is 228 223 * automatically resolved and write-protection is un-set. 229 224 * It implies UFFD_FEATURE_WP_UNPOPULATED. 225 + * 226 + * UFFD_FEATURE_MOVE indicates that the kernel supports moving 227 + * existing page contents from userspace. 230 228 */ 231 229 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) 232 230 #define UFFD_FEATURE_EVENT_FORK (1<<1) ··· 247 239 #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) 248 240 #define UFFD_FEATURE_POISON (1<<14) 249 241 #define UFFD_FEATURE_WP_ASYNC (1<<15) 242 + #define UFFD_FEATURE_MOVE (1<<16) 250 243 __u64 features; 251 244 252 245 __u64 ioctls; ··· 354 345 * the copy_from_user will not read past here. 
355 346 */ 356 347 __s64 updated; 348 + }; 349 + 350 + /* Argument block of the UFFDIO_MOVE ioctl. */ struct uffdio_move { 351 + __u64 dst; /* start address of the destination range */ 352 + __u64 src; /* start address of the source range */ 353 + __u64 len; /* length in bytes of both ranges */ 354 + /* 355 + * Especially if used to atomically remove memory from the 356 + * address space the wake on the dst range is not needed. 357 + */ 358 + #define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) 359 + /* Treat unmapped holes in the source range as moved instead of failing. */ #define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) 360 + __u64 mode; 361 + /* 362 + * "move" is written by the ioctl and must be at the end: the 363 + * copy_from_user will not read the last 8 bytes. 364 + */ 365 + __s64 move; 357 366 }; 358 367 359 368 /*

+122

mm/huge_memory.c

··· 2141 2141 return ret; 2142 2142 } 2143 2143 2144 + #ifdef CONFIG_USERFAULTFD 2145 + /* 2146 + * The PT lock for src_pmd and the mmap_lock for reading are held by 2147 + * the caller, but it must return after releasing the page_table_lock. 2148 + * Just move the page from src_pmd to dst_pmd if possible. 2149 + * Return zero if succeeded in moving the page, -EAGAIN if it needs to be 2150 + * repeated by the caller, or other errors in case of failure. 2151 + */ 2152 + int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 2153 + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, 2154 + unsigned long dst_addr, unsigned long src_addr) 2155 + { 2156 + pmd_t _dst_pmd, src_pmdval; 2157 + struct page *src_page; 2158 + struct folio *src_folio; 2159 + struct anon_vma *src_anon_vma; 2160 + spinlock_t *src_ptl, *dst_ptl; 2161 + pgtable_t src_pgtable; 2162 + struct mmu_notifier_range range; 2163 + int err = 0; 2164 + 2165 + src_pmdval = *src_pmd; 2166 + src_ptl = pmd_lockptr(mm, src_pmd); 2167 + 2168 + lockdep_assert_held(src_ptl); 2169 + mmap_assert_locked(mm); 2170 + 2171 + /* Sanity checks before the operation */ 2172 + if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || 2173 + WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { 2174 + spin_unlock(src_ptl); 2175 + return -EINVAL; 2176 + } 2177 + 2178 + if (!pmd_trans_huge(src_pmdval)) { 2179 + spin_unlock(src_ptl); 2180 + if (is_pmd_migration_entry(src_pmdval)) { 2181 + pmd_migration_entry_wait(mm, &src_pmdval); 2182 + return -EAGAIN; 2183 + } 2184 + return -ENOENT; 2185 + } 2186 + 2187 + src_page = pmd_page(src_pmdval); 2188 + if (unlikely(!PageAnonExclusive(src_page))) { 2189 + spin_unlock(src_ptl); 2190 + return -EBUSY; 2191 + } 2192 + 2193 + src_folio = page_folio(src_page); 2194 + folio_get(src_folio); 2195 + spin_unlock(src_ptl); 2196 + 2197 + flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); 2198 + 
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, 2199 + src_addr + HPAGE_PMD_SIZE); 2200 + mmu_notifier_invalidate_range_start(&range); 2201 + 2202 + folio_lock(src_folio); 2203 + 2204 + /* 2205 + * split_huge_page walks the anon_vma chain without the page 2206 + * lock. Serialize against it with the anon_vma lock, the page 2207 + * lock is not enough. 2208 + */ 2209 + src_anon_vma = folio_get_anon_vma(src_folio); 2210 + if (!src_anon_vma) { 2211 + err = -EAGAIN; 2212 + goto unlock_folio; 2213 + } 2214 + anon_vma_lock_write(src_anon_vma); 2215 + 2216 + dst_ptl = pmd_lockptr(mm, dst_pmd); 2217 + double_pt_lock(src_ptl, dst_ptl); 2218 + if (unlikely(!pmd_same(*src_pmd, src_pmdval) || 2219 + !pmd_same(*dst_pmd, dst_pmdval))) { 2220 + err = -EAGAIN; 2221 + goto unlock_ptls; 2222 + } 2223 + if (folio_maybe_dma_pinned(src_folio) || 2224 + !PageAnonExclusive(&src_folio->page)) { 2225 + err = -EBUSY; 2226 + goto unlock_ptls; 2227 + } 2228 + 2229 + if (WARN_ON_ONCE(!folio_test_head(src_folio)) || 2230 + WARN_ON_ONCE(!folio_test_anon(src_folio))) { 2231 + err = -EBUSY; 2232 + goto unlock_ptls; 2233 + } 2234 + 2235 + folio_move_anon_rmap(src_folio, dst_vma); 2236 + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); 2237 + 2238 + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); 2239 + /* Folio got pinned from under us. Put it back and fail the move. 
*/ 2240 + if (folio_maybe_dma_pinned(src_folio)) { 2241 + set_pmd_at(mm, src_addr, src_pmd, src_pmdval); 2242 + err = -EBUSY; 2243 + goto unlock_ptls; 2244 + } 2245 + 2246 + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); 2247 + /* Follow mremap() behavior and treat the entry dirty after the move */ 2248 + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); 2249 + set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); 2250 + 2251 + src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); 2252 + pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); 2253 + unlock_ptls: 2254 + double_pt_unlock(src_ptl, dst_ptl); 2255 + anon_vma_unlock_write(src_anon_vma); 2256 + put_anon_vma(src_anon_vma); 2257 + unlock_folio: 2258 + /* unblock rmap walks */ 2259 + folio_unlock(src_folio); 2260 + mmu_notifier_invalidate_range_end(&range); 2261 + folio_put(src_folio); 2262 + return err; 2263 + } 2264 + #endif /* CONFIG_USERFAULTFD */ 2265 + 2144 2266 /* 2145 2267 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 2146 2268 *

+3

mm/khugepaged.c

··· 1140 1140 * Prevent all access to pagetables with the exception of 1141 1141 * gup_fast later handled by the ptep_clear_flush and the VM 1142 1142 * handled by the anon_vma lock + PG_lock. 1143 + * 1144 + * UFFDIO_MOVE is also prevented from racing thanks to the 1145 + * mmap_lock. 1143 1146 */ 1144 1147 mmap_write_lock(mm); 1145 1148 result = hugepage_vma_revalidate(mm, address, true, &vma, cc);

+6

mm/rmap.c

··· 490 490 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid 491 491 * if there is a mapcount, we can dereference the anon_vma after observing 492 492 * those. 493 + * 494 + * NOTE: the caller should normally hold folio lock when calling this. If 495 + * not, the caller needs to double check the anon_vma didn't change after 496 + * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it 497 + * concurrently without folio lock protection). See folio_lock_anon_vma_read() 498 + * which has already covered that, and the comment above move_pages(). 493 499 */ 494 500 struct anon_vma *folio_get_anon_vma(struct folio *folio) 495 501 {

+614

mm/userfaultfd.c

··· 842 842 mmap_read_unlock(dst_mm); 843 843 return err; 844 844 } 845 + 846 + 847 + void double_pt_lock(spinlock_t *ptl1, 848 + spinlock_t *ptl2) 849 + __acquires(ptl1) 850 + __acquires(ptl2) 851 + { 852 + spinlock_t *ptl_tmp; 853 + 854 + if (ptl1 > ptl2) { 855 + /* exchange ptl1 and ptl2 */ 856 + ptl_tmp = ptl1; 857 + ptl1 = ptl2; 858 + ptl2 = ptl_tmp; 859 + } 860 + /* lock in virtual address order to avoid lock inversion */ 861 + spin_lock(ptl1); 862 + if (ptl1 != ptl2) 863 + spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); 864 + else 865 + __acquire(ptl2); 866 + } 867 + 868 + void double_pt_unlock(spinlock_t *ptl1, 869 + spinlock_t *ptl2) 870 + __releases(ptl1) 871 + __releases(ptl2) 872 + { 873 + spin_unlock(ptl1); 874 + if (ptl1 != ptl2) 875 + spin_unlock(ptl2); 876 + else 877 + __release(ptl2); 878 + } 879 + 880 + 881 + static int move_present_pte(struct mm_struct *mm, 882 + struct vm_area_struct *dst_vma, 883 + struct vm_area_struct *src_vma, 884 + unsigned long dst_addr, unsigned long src_addr, 885 + pte_t *dst_pte, pte_t *src_pte, 886 + pte_t orig_dst_pte, pte_t orig_src_pte, 887 + spinlock_t *dst_ptl, spinlock_t *src_ptl, 888 + struct folio *src_folio) 889 + { 890 + int err = 0; 891 + 892 + double_pt_lock(dst_ptl, src_ptl); 893 + 894 + if (!pte_same(*src_pte, orig_src_pte) || 895 + !pte_same(*dst_pte, orig_dst_pte)) { 896 + err = -EAGAIN; 897 + goto out; 898 + } 899 + if (folio_test_large(src_folio) || 900 + folio_maybe_dma_pinned(src_folio) || 901 + !PageAnonExclusive(&src_folio->page)) { 902 + err = -EBUSY; 903 + goto out; 904 + } 905 + 906 + folio_move_anon_rmap(src_folio, dst_vma); 907 + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); 908 + 909 + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); 910 + /* Folio got pinned from under us. Put it back and fail the move. 
*/ 911 + if (folio_maybe_dma_pinned(src_folio)) { 912 + set_pte_at(mm, src_addr, src_pte, orig_src_pte); 913 + err = -EBUSY; 914 + goto out; 915 + } 916 + 917 + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); 918 + /* Follow mremap() behavior and treat the entry dirty after the move */ 919 + orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); 920 + 921 + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); 922 + out: 923 + double_pt_unlock(dst_ptl, src_ptl); 924 + return err; 925 + } 926 + 927 + static int move_swap_pte(struct mm_struct *mm, 928 + unsigned long dst_addr, unsigned long src_addr, 929 + pte_t *dst_pte, pte_t *src_pte, 930 + pte_t orig_dst_pte, pte_t orig_src_pte, 931 + spinlock_t *dst_ptl, spinlock_t *src_ptl) 932 + { 933 + if (!pte_swp_exclusive(orig_src_pte)) 934 + return -EBUSY; 935 + 936 + double_pt_lock(dst_ptl, src_ptl); 937 + 938 + if (!pte_same(*src_pte, orig_src_pte) || 939 + !pte_same(*dst_pte, orig_dst_pte)) { 940 + double_pt_unlock(dst_ptl, src_ptl); 941 + return -EAGAIN; 942 + } 943 + 944 + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); 945 + set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); 946 + double_pt_unlock(dst_ptl, src_ptl); 947 + 948 + return 0; 949 + } 950 + 951 + /* 952 + * The mmap_lock for reading is held by the caller. Just move the page 953 + * from src_pmd to dst_pmd if possible, and return true if succeeded 954 + * in moving the page. 
955 + */ 956 + static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, 957 + struct vm_area_struct *dst_vma, 958 + struct vm_area_struct *src_vma, 959 + unsigned long dst_addr, unsigned long src_addr, 960 + __u64 mode) 961 + { 962 + swp_entry_t entry; 963 + pte_t orig_src_pte, orig_dst_pte; 964 + pte_t src_folio_pte; 965 + spinlock_t *src_ptl, *dst_ptl; 966 + pte_t *src_pte = NULL; 967 + pte_t *dst_pte = NULL; 968 + 969 + struct folio *src_folio = NULL; 970 + struct anon_vma *src_anon_vma = NULL; 971 + struct mmu_notifier_range range; 972 + int err = 0; 973 + 974 + flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); 975 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 976 + src_addr, src_addr + PAGE_SIZE); 977 + mmu_notifier_invalidate_range_start(&range); 978 + retry: 979 + dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl); 980 + 981 + /* Retry if a huge pmd materialized from under us */ 982 + if (unlikely(!dst_pte)) { 983 + err = -EAGAIN; 984 + goto out; 985 + } 986 + 987 + src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl); 988 + 989 + /* 990 + * We held the mmap_lock for reading so MADV_DONTNEED 991 + * can zap transparent huge pages under us, or the 992 + * transparent huge page fault can establish new 993 + * transparent huge pages under us. 
994 + */ 995 + if (unlikely(!src_pte)) { 996 + err = -EAGAIN; 997 + goto out; 998 + } 999 + 1000 + /* Sanity checks before the operation */ 1001 + if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || 1002 + WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { 1003 + err = -EINVAL; 1004 + goto out; 1005 + } 1006 + 1007 + spin_lock(dst_ptl); 1008 + orig_dst_pte = *dst_pte; 1009 + spin_unlock(dst_ptl); 1010 + if (!pte_none(orig_dst_pte)) { 1011 + err = -EEXIST; 1012 + goto out; 1013 + } 1014 + 1015 + spin_lock(src_ptl); 1016 + orig_src_pte = *src_pte; 1017 + spin_unlock(src_ptl); 1018 + if (pte_none(orig_src_pte)) { 1019 + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) 1020 + err = -ENOENT; 1021 + else /* nothing to do to move a hole */ 1022 + err = 0; 1023 + goto out; 1024 + } 1025 + 1026 + /* If PTE changed after we locked the folio them start over */ 1027 + if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { 1028 + err = -EAGAIN; 1029 + goto out; 1030 + } 1031 + 1032 + if (pte_present(orig_src_pte)) { 1033 + /* 1034 + * Pin and lock both source folio and anon_vma. Since we are in 1035 + * RCU read section, we can't block, so on contention have to 1036 + * unmap the ptes, obtain the lock and retry. 
1037 + */ 1038 + if (!src_folio) { 1039 + struct folio *folio; 1040 + 1041 + /* 1042 + * Pin the page while holding the lock to be sure the 1043 + * page isn't freed under us 1044 + */ 1045 + spin_lock(src_ptl); 1046 + if (!pte_same(orig_src_pte, *src_pte)) { 1047 + spin_unlock(src_ptl); 1048 + err = -EAGAIN; 1049 + goto out; 1050 + } 1051 + 1052 + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); 1053 + if (!folio || !PageAnonExclusive(&folio->page)) { 1054 + spin_unlock(src_ptl); 1055 + err = -EBUSY; 1056 + goto out; 1057 + } 1058 + 1059 + folio_get(folio); 1060 + src_folio = folio; 1061 + src_folio_pte = orig_src_pte; 1062 + spin_unlock(src_ptl); 1063 + 1064 + if (!folio_trylock(src_folio)) { 1065 + pte_unmap(&orig_src_pte); 1066 + pte_unmap(&orig_dst_pte); 1067 + src_pte = dst_pte = NULL; 1068 + /* now we can block and wait */ 1069 + folio_lock(src_folio); 1070 + goto retry; 1071 + } 1072 + 1073 + if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { 1074 + err = -EBUSY; 1075 + goto out; 1076 + } 1077 + } 1078 + 1079 + /* at this point we have src_folio locked */ 1080 + if (folio_test_large(src_folio)) { 1081 + err = split_folio(src_folio); 1082 + if (err) 1083 + goto out; 1084 + } 1085 + 1086 + if (!src_anon_vma) { 1087 + /* 1088 + * folio_referenced walks the anon_vma chain 1089 + * without the folio lock. Serialize against it with 1090 + * the anon_vma lock, the folio lock is not enough. 
1091 + */ 1092 + src_anon_vma = folio_get_anon_vma(src_folio); 1093 + if (!src_anon_vma) { 1094 + /* page was unmapped from under us */ 1095 + err = -EAGAIN; 1096 + goto out; 1097 + } 1098 + if (!anon_vma_trylock_write(src_anon_vma)) { 1099 + pte_unmap(&orig_src_pte); 1100 + pte_unmap(&orig_dst_pte); 1101 + src_pte = dst_pte = NULL; 1102 + /* now we can block and wait */ 1103 + anon_vma_lock_write(src_anon_vma); 1104 + goto retry; 1105 + } 1106 + } 1107 + 1108 + err = move_present_pte(mm, dst_vma, src_vma, 1109 + dst_addr, src_addr, dst_pte, src_pte, 1110 + orig_dst_pte, orig_src_pte, 1111 + dst_ptl, src_ptl, src_folio); 1112 + } else { 1113 + entry = pte_to_swp_entry(orig_src_pte); 1114 + if (non_swap_entry(entry)) { 1115 + if (is_migration_entry(entry)) { 1116 + pte_unmap(&orig_src_pte); 1117 + pte_unmap(&orig_dst_pte); 1118 + src_pte = dst_pte = NULL; 1119 + migration_entry_wait(mm, src_pmd, src_addr); 1120 + err = -EAGAIN; 1121 + } else 1122 + err = -EFAULT; 1123 + goto out; 1124 + } 1125 + 1126 + err = move_swap_pte(mm, dst_addr, src_addr, 1127 + dst_pte, src_pte, 1128 + orig_dst_pte, orig_src_pte, 1129 + dst_ptl, src_ptl); 1130 + } 1131 + 1132 + out: 1133 + if (src_anon_vma) { 1134 + anon_vma_unlock_write(src_anon_vma); 1135 + put_anon_vma(src_anon_vma); 1136 + } 1137 + if (src_folio) { 1138 + folio_unlock(src_folio); 1139 + folio_put(src_folio); 1140 + } 1141 + if (dst_pte) 1142 + pte_unmap(dst_pte); 1143 + if (src_pte) 1144 + pte_unmap(src_pte); 1145 + mmu_notifier_invalidate_range_end(&range); 1146 + 1147 + return err; 1148 + } 1149 + 1150 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1151 + static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1152 + unsigned long src_addr, 1153 + unsigned long src_end) 1154 + { 1155 + return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || 1156 + src_end - src_addr < HPAGE_PMD_SIZE; 1157 + } 1158 + #else 1159 + static inline bool move_splits_huge_pmd(unsigned long dst_addr, 1160 + unsigned long src_addr, 
1161 + unsigned long src_end) 1162 + { 1163 + /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ 1164 + return false; 1165 + } 1166 + #endif 1167 + 1168 + static inline bool vma_move_compatible(struct vm_area_struct *vma) 1169 + { 1170 + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | 1171 + VM_MIXEDMAP | VM_SHADOW_STACK)); 1172 + } 1173 + 1174 + static int validate_move_areas(struct userfaultfd_ctx *ctx, 1175 + struct vm_area_struct *src_vma, 1176 + struct vm_area_struct *dst_vma) 1177 + { 1178 + /* Only allow moving if both have the same access and protection */ 1179 + if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || 1180 + pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) 1181 + return -EINVAL; 1182 + 1183 + /* Only allow moving if both are mlocked or both aren't */ 1184 + if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) 1185 + return -EINVAL; 1186 + 1187 + /* 1188 + * For now, we keep it simple and only move between writable VMAs. 1189 + * Access flags are equal, therefore cheching only the source is enough. 1190 + */ 1191 + if (!(src_vma->vm_flags & VM_WRITE)) 1192 + return -EINVAL; 1193 + 1194 + /* Check if vma flags indicate content which can be moved */ 1195 + if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) 1196 + return -EINVAL; 1197 + 1198 + /* Ensure dst_vma is registered in uffd we are operating on */ 1199 + if (!dst_vma->vm_userfaultfd_ctx.ctx || 1200 + dst_vma->vm_userfaultfd_ctx.ctx != ctx) 1201 + return -EINVAL; 1202 + 1203 + /* Only allow moving across anonymous vmas */ 1204 + if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) 1205 + return -EINVAL; 1206 + 1207 + /* 1208 + * Ensure the dst_vma has a anon_vma or this page 1209 + * would get a NULL anon_vma when moved in the 1210 + * dst_vma. 
1211 + */ 1212 + if (unlikely(anon_vma_prepare(dst_vma))) 1213 + return -ENOMEM; 1214 + 1215 + return 0; 1216 + } 1217 + 1218 + /** 1219 + * move_pages - move arbitrary anonymous pages of an existing vma 1220 + * @ctx: pointer to the userfaultfd context 1221 + * @mm: the address space to move pages 1222 + * @dst_start: start of the destination virtual memory range 1223 + * @src_start: start of the source virtual memory range 1224 + * @len: length of the virtual memory range 1225 + * @mode: flags from uffdio_move.mode 1226 + * 1227 + * Must be called with mmap_lock held for read. 1228 + * 1229 + * move_pages() remaps arbitrary anonymous pages atomically in zero 1230 + * copy. It only works on non shared anonymous pages because those can 1231 + * be relocated without generating non linear anon_vmas in the rmap 1232 + * code. 1233 + * 1234 + * It provides a zero copy mechanism to handle userspace page faults. 1235 + * The source vma pages should have mapcount == 1, which can be 1236 + * enforced by using madvise(MADV_DONTFORK) on src vma. 1237 + * 1238 + * The thread receiving the page during the userland page fault 1239 + * will receive the faulting page in the source vma through the network, 1240 + * storage or any other I/O device (MADV_DONTFORK in the source vma 1241 + * avoids move_pages() to fail with -EBUSY if the process forks before 1242 + * move_pages() is called), then it will call move_pages() to map the 1243 + * page in the faulting address in the destination vma. 1244 + * 1245 + * This userfaultfd command works purely via pagetables, so it's the 1246 + * most efficient way to move physical non shared anonymous pages 1247 + * across different virtual addresses. Unlike mremap()/mmap()/munmap() 1248 + * it does not create any new vmas. The mapping in the destination 1249 + * address is atomic. 1250 + * 1251 + * It only works if the vma protection bits are identical from the 1252 + * source and destination vma. 
1253 + * 1254 + * It can remap non shared anonymous pages within the same vma too. 1255 + * 1256 + * If the source virtual memory range has any unmapped holes, or if 1257 + * the destination virtual memory range is not a whole unmapped hole, 1258 + * move_pages() will fail respectively with -ENOENT or -EEXIST. This 1259 + * provides a very strict behavior to avoid any chance of memory 1260 + * corruption going unnoticed if there are userland race conditions. 1261 + * Only one thread should resolve the userland page fault at any given 1262 + * time for any given faulting address. This means that if two threads 1263 + * try to both call move_pages() on the same destination address at the 1264 + * same time, the second thread will get an explicit error from this 1265 + * command. 1266 + * 1267 + * The command retval will return "len" is successful. The command 1268 + * however can be interrupted by fatal signals or errors. If 1269 + * interrupted it will return the number of bytes successfully 1270 + * remapped before the interruption if any, or the negative error if 1271 + * none. It will never return zero. Either it will return an error or 1272 + * an amount of bytes successfully moved. If the retval reports a 1273 + * "short" remap, the move_pages() command should be repeated by 1274 + * userland with src+retval, dst+reval, len-retval if it wants to know 1275 + * about the error that interrupted it. 1276 + * 1277 + * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to 1278 + * prevent -ENOENT errors to materialize if there are holes in the 1279 + * source virtual range that is being remapped. The holes will be 1280 + * accounted as successfully remapped in the retval of the 1281 + * command. This is mostly useful to remap hugepage naturally aligned 1282 + * virtual regions without knowing if there are transparent hugepage 1283 + * in the regions or not, but preventing the risk of having to split 1284 + * the hugepmd during the remap. 
1285 + * 1286 + * If there's any rmap walk that is taking the anon_vma locks without 1287 + * first obtaining the folio lock (the only current instance is 1288 + * folio_referenced), they will have to verify if the folio->mapping 1289 + * has changed after taking the anon_vma lock. If it changed they 1290 + * should release the lock and retry obtaining a new anon_vma, because 1291 + * it means the anon_vma was changed by move_pages() before the lock 1292 + * could be obtained. This is the only additional complexity added to 1293 + * the rmap code to provide this anonymous page remapping functionality. 1294 + */ 1295 + ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, 1296 + unsigned long dst_start, unsigned long src_start, 1297 + unsigned long len, __u64 mode) 1298 + { 1299 + struct vm_area_struct *src_vma, *dst_vma; 1300 + unsigned long src_addr, dst_addr; 1301 + pmd_t *src_pmd, *dst_pmd; 1302 + long err = -EINVAL; 1303 + ssize_t moved = 0; 1304 + 1305 + /* Sanitize the command parameters. */ 1306 + if (WARN_ON_ONCE(src_start & ~PAGE_MASK) || 1307 + WARN_ON_ONCE(dst_start & ~PAGE_MASK) || 1308 + WARN_ON_ONCE(len & ~PAGE_MASK)) 1309 + goto out; 1310 + 1311 + /* Does the address range wrap, or is the span zero-sized? */ 1312 + if (WARN_ON_ONCE(src_start + len <= src_start) || 1313 + WARN_ON_ONCE(dst_start + len <= dst_start)) 1314 + goto out; 1315 + 1316 + /* 1317 + * Make sure the vma is not shared, that the src and dst remap 1318 + * ranges are both valid and fully within a single existing 1319 + * vma. 
1320 + */ 1321 + src_vma = find_vma(mm, src_start); 1322 + if (!src_vma || (src_vma->vm_flags & VM_SHARED)) 1323 + goto out; 1324 + if (src_start < src_vma->vm_start || 1325 + src_start + len > src_vma->vm_end) 1326 + goto out; 1327 + 1328 + dst_vma = find_vma(mm, dst_start); 1329 + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) 1330 + goto out; 1331 + if (dst_start < dst_vma->vm_start || 1332 + dst_start + len > dst_vma->vm_end) 1333 + goto out; 1334 + 1335 + err = validate_move_areas(ctx, src_vma, dst_vma); 1336 + if (err) 1337 + goto out; 1338 + 1339 + for (src_addr = src_start, dst_addr = dst_start; 1340 + src_addr < src_start + len;) { 1341 + spinlock_t *ptl; 1342 + pmd_t dst_pmdval; 1343 + unsigned long step_size; 1344 + 1345 + /* 1346 + * Below works because anonymous area would not have a 1347 + * transparent huge PUD. If file-backed support is added, 1348 + * that case would need to be handled here. 1349 + */ 1350 + src_pmd = mm_find_pmd(mm, src_addr); 1351 + if (unlikely(!src_pmd)) { 1352 + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1353 + err = -ENOENT; 1354 + break; 1355 + } 1356 + src_pmd = mm_alloc_pmd(mm, src_addr); 1357 + if (unlikely(!src_pmd)) { 1358 + err = -ENOMEM; 1359 + break; 1360 + } 1361 + } 1362 + dst_pmd = mm_alloc_pmd(mm, dst_addr); 1363 + if (unlikely(!dst_pmd)) { 1364 + err = -ENOMEM; 1365 + break; 1366 + } 1367 + 1368 + dst_pmdval = pmdp_get_lockless(dst_pmd); 1369 + /* 1370 + * If the dst_pmd is mapped as THP don't override it and just 1371 + * be strict. If dst_pmd changes into TPH after this check, the 1372 + * move_pages_huge_pmd() will detect the change and retry 1373 + * while move_pages_pte() will detect the change and fail. 
1374 + */ 1375 + if (unlikely(pmd_trans_huge(dst_pmdval))) { 1376 + err = -EEXIST; 1377 + break; 1378 + } 1379 + 1380 + ptl = pmd_trans_huge_lock(src_pmd, src_vma); 1381 + if (ptl) { 1382 + if (pmd_devmap(*src_pmd)) { 1383 + spin_unlock(ptl); 1384 + err = -ENOENT; 1385 + break; 1386 + } 1387 + 1388 + /* Check if we can move the pmd without splitting it. */ 1389 + if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || 1390 + !pmd_none(dst_pmdval)) { 1391 + struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); 1392 + 1393 + if (!folio || !PageAnonExclusive(&folio->page)) { 1394 + spin_unlock(ptl); 1395 + err = -EBUSY; 1396 + break; 1397 + } 1398 + 1399 + spin_unlock(ptl); 1400 + split_huge_pmd(src_vma, src_pmd, src_addr); 1401 + /* The folio will be split by move_pages_pte() */ 1402 + continue; 1403 + } 1404 + 1405 + err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, 1406 + dst_pmdval, dst_vma, src_vma, 1407 + dst_addr, src_addr); 1408 + step_size = HPAGE_PMD_SIZE; 1409 + } else { 1410 + if (pmd_none(*src_pmd)) { 1411 + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { 1412 + err = -ENOENT; 1413 + break; 1414 + } 1415 + if (unlikely(__pte_alloc(mm, src_pmd))) { 1416 + err = -ENOMEM; 1417 + break; 1418 + } 1419 + } 1420 + 1421 + if (unlikely(pte_alloc(mm, dst_pmd))) { 1422 + err = -ENOMEM; 1423 + break; 1424 + } 1425 + 1426 + err = move_pages_pte(mm, dst_pmd, src_pmd, 1427 + dst_vma, src_vma, 1428 + dst_addr, src_addr, mode); 1429 + step_size = PAGE_SIZE; 1430 + } 1431 + 1432 + cond_resched(); 1433 + 1434 + if (fatal_signal_pending(current)) { 1435 + /* Do not override an error */ 1436 + if (!err || err == -EAGAIN) 1437 + err = -EINTR; 1438 + break; 1439 + } 1440 + 1441 + if (err) { 1442 + if (err == -EAGAIN) 1443 + continue; 1444 + break; 1445 + } 1446 + 1447 + /* Proceed to the next page */ 1448 + dst_addr += step_size; 1449 + src_addr += step_size; 1450 + moved += step_size; 1451 + } 1452 + 1453 + out: 1454 + VM_WARN_ON(moved < 0); 1455 + VM_WARN_ON(err > 0); 
1456 + VM_WARN_ON(!moved && !err); 1457 + return moved ? moved : err; 1458 + }

Configure Feed

Configure Feed