mm: move internal core VMA manipulation functions to own file

-35

include/linux/mm.h

··· 1005 1005 return mas_prev(&vmi->mas, 0); 1006 1006 } 1007 1007 1008 - static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) 1009 - { 1010 - return vmi->mas.index; 1011 - } 1012 - 1013 - static inline unsigned long vma_iter_end(struct vma_iterator *vmi) 1014 - { 1015 - return vmi->mas.last + 1; 1016 - } 1017 - static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, 1018 - unsigned long count) 1019 - { 1020 - return mas_expected_entries(&vmi->mas, count); 1021 - } 1022 - 1023 1008 static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, 1024 1009 unsigned long start, unsigned long end, gfp_t gfp) 1025 1010 { ··· 2519 2534 #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ 2520 2535 MM_CP_UFFD_WP_RESOLVE) 2521 2536 2522 - bool vma_needs_dirty_tracking(struct vm_area_struct *vma); 2523 - bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); 2524 - static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) 2525 - { 2526 - /* 2527 - * We want to check manually if we can change individual PTEs writable 2528 - * if we can't do that automatically for all PTEs in a mapping. For 2529 - * private mappings, that's always the case when we have write 2530 - * permissions as we properly have to handle COW. 
2531 - */ 2532 - if (vma->vm_flags & VM_SHARED) 2533 - return vma_wants_writenotify(vma, vma->vm_page_prot); 2534 - return !!(vma->vm_flags & VM_WRITE); 2535 - 2536 - } 2537 2537 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, 2538 2538 pte_t pte); 2539 2539 extern long change_protection(struct mmu_gather *tlb, ··· 3226 3256 3227 3257 /* mmap.c */ 3228 3258 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); 3229 - extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 3230 3259 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 3231 - extern void unlink_file_vma(struct vm_area_struct *); 3232 - extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 3233 - unsigned long addr, unsigned long len, pgoff_t pgoff, 3234 - bool *need_rmap_locks); 3235 3260 extern void exit_mmap(struct mm_struct *); 3236 3261 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); 3237 3262

+1 -1

mm/Makefile

··· 37 37 mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ 38 38 mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ 39 39 msync.o page_vma_mapped.o pagewalk.o \ 40 - pgtable-generic.o rmap.o vmalloc.o 40 + pgtable-generic.o rmap.o vmalloc.o vma.o 41 41 42 42 43 43 ifdef CONFIG_CROSS_MEMORY_ATTACH

+5 -231

mm/internal.h

··· 8 8 #define __MM_INTERNAL_H 9 9 10 10 #include <linux/fs.h> 11 + #include <linux/khugepaged.h> 11 12 #include <linux/mm.h> 13 + #include <linux/mm_inline.h> 12 14 #include <linux/pagemap.h> 13 15 #include <linux/rmap.h> 14 16 #include <linux/swap.h> 15 17 #include <linux/swapops.h> 16 18 #include <linux/tracepoint-defs.h> 19 + 20 + /* Internal core VMA manipulation functions. */ 21 + #include "vma.h" 17 22 18 23 struct folio_batch; 19 24 ··· 783 778 return list_empty(&area->free_list[migratetype]); 784 779 } 785 780 786 - /* 787 - * These three helpers classifies VMAs for virtual memory accounting. 788 - */ 789 - 790 - /* 791 - * Executable code area - executable, not writable, not stack 792 - */ 793 - static inline bool is_exec_mapping(vm_flags_t flags) 794 - { 795 - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; 796 - } 797 - 798 - /* 799 - * Stack area (including shadow stacks) 800 - * 801 - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: 802 - * do_mmap() forbids all other combinations. 
803 - */ 804 - static inline bool is_stack_mapping(vm_flags_t flags) 805 - { 806 - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); 807 - } 808 - 809 - /* 810 - * Data area - private, writable, not stack 811 - */ 812 - static inline bool is_data_mapping(vm_flags_t flags) 813 - { 814 - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; 815 - } 816 - 817 781 /* mm/util.c */ 818 782 struct anon_vma *folio_anon_vma(struct folio *folio); 819 783 ··· 1211 1237 void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1212 1238 pmd_t *pmd, bool write); 1213 1239 1214 - /* 1215 - * mm/mmap.c 1216 - */ 1217 - struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1218 - struct vm_area_struct *vma, 1219 - unsigned long delta); 1220 - 1221 - struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1222 - struct vm_area_struct *prev, 1223 - struct vm_area_struct *vma, 1224 - unsigned long start, unsigned long end, 1225 - unsigned long vm_flags, 1226 - struct mempolicy *policy, 1227 - struct vm_userfaultfd_ctx uffd_ctx, 1228 - struct anon_vma_name *anon_name); 1229 - 1230 - /* We are about to modify the VMA's flags. */ 1231 - static inline struct vm_area_struct 1232 - *vma_modify_flags(struct vma_iterator *vmi, 1233 - struct vm_area_struct *prev, 1234 - struct vm_area_struct *vma, 1235 - unsigned long start, unsigned long end, 1236 - unsigned long new_flags) 1237 - { 1238 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1239 - vma_policy(vma), vma->vm_userfaultfd_ctx, 1240 - anon_vma_name(vma)); 1241 - } 1242 - 1243 - /* We are about to modify the VMA's flags and/or anon_name. 
*/ 1244 - static inline struct vm_area_struct 1245 - *vma_modify_flags_name(struct vma_iterator *vmi, 1246 - struct vm_area_struct *prev, 1247 - struct vm_area_struct *vma, 1248 - unsigned long start, 1249 - unsigned long end, 1250 - unsigned long new_flags, 1251 - struct anon_vma_name *new_name) 1252 - { 1253 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1254 - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); 1255 - } 1256 - 1257 - /* We are about to modify the VMA's memory policy. */ 1258 - static inline struct vm_area_struct 1259 - *vma_modify_policy(struct vma_iterator *vmi, 1260 - struct vm_area_struct *prev, 1261 - struct vm_area_struct *vma, 1262 - unsigned long start, unsigned long end, 1263 - struct mempolicy *new_pol) 1264 - { 1265 - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, 1266 - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1267 - } 1268 - 1269 - /* We are about to modify the VMA's flags and/or uffd context. */ 1270 - static inline struct vm_area_struct 1271 - *vma_modify_flags_uffd(struct vma_iterator *vmi, 1272 - struct vm_area_struct *prev, 1273 - struct vm_area_struct *vma, 1274 - unsigned long start, unsigned long end, 1275 - unsigned long new_flags, 1276 - struct vm_userfaultfd_ctx new_ctx) 1277 - { 1278 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1279 - vma_policy(vma), new_ctx, anon_vma_name(vma)); 1280 - } 1281 - 1282 - int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 1283 - unsigned long start, unsigned long end, pgoff_t pgoff, 1284 - struct vm_area_struct *next); 1285 - int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 1286 - unsigned long start, unsigned long end, pgoff_t pgoff); 1287 - 1288 1240 enum { 1289 1241 /* mark page accessed */ 1290 1242 FOLL_TOUCH = 1 << 16, ··· 1337 1437 return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); 1338 1438 } 1339 1439 1340 - static inline void vma_iter_config(struct vma_iterator *vmi, 1341 - 
unsigned long index, unsigned long last) 1342 - { 1343 - __mas_set_range(&vmi->mas, index, last - 1); 1344 - } 1345 - 1346 - static inline void vma_iter_reset(struct vma_iterator *vmi) 1347 - { 1348 - mas_reset(&vmi->mas); 1349 - } 1350 - 1351 - static inline 1352 - struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) 1353 - { 1354 - return mas_prev_range(&vmi->mas, min); 1355 - } 1356 - 1357 - static inline 1358 - struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) 1359 - { 1360 - return mas_next_range(&vmi->mas, max); 1361 - } 1362 - 1363 - static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, 1364 - unsigned long max, unsigned long size) 1365 - { 1366 - return mas_empty_area(&vmi->mas, min, max - 1, size); 1367 - } 1368 - 1369 - static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, 1370 - unsigned long max, unsigned long size) 1371 - { 1372 - return mas_empty_area_rev(&vmi->mas, min, max - 1, size); 1373 - } 1374 - 1375 - /* 1376 - * VMA Iterator functions shared between nommu and mmap 1377 - */ 1378 - static inline int vma_iter_prealloc(struct vma_iterator *vmi, 1379 - struct vm_area_struct *vma) 1380 - { 1381 - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); 1382 - } 1383 - 1384 - static inline void vma_iter_clear(struct vma_iterator *vmi) 1385 - { 1386 - mas_store_prealloc(&vmi->mas, NULL); 1387 - } 1388 - 1389 - static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) 1390 - { 1391 - return mas_walk(&vmi->mas); 1392 - } 1393 - 1394 - /* Store a VMA with preallocated memory */ 1395 - static inline void vma_iter_store(struct vma_iterator *vmi, 1396 - struct vm_area_struct *vma) 1397 - { 1398 - 1399 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 1400 - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 1401 - vmi->mas.index > vma->vm_start)) { 1402 - pr_warn("%lx > %lx\n store vma %lx-%lx\n 
into slot %lx-%lx\n", 1403 - vmi->mas.index, vma->vm_start, vma->vm_start, 1404 - vma->vm_end, vmi->mas.index, vmi->mas.last); 1405 - } 1406 - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 1407 - vmi->mas.last < vma->vm_start)) { 1408 - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", 1409 - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, 1410 - vmi->mas.index, vmi->mas.last); 1411 - } 1412 - #endif 1413 - 1414 - if (vmi->mas.status != ma_start && 1415 - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 1416 - vma_iter_invalidate(vmi); 1417 - 1418 - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 1419 - mas_store_prealloc(&vmi->mas, vma); 1420 - } 1421 - 1422 - static inline int vma_iter_store_gfp(struct vma_iterator *vmi, 1423 - struct vm_area_struct *vma, gfp_t gfp) 1424 - { 1425 - if (vmi->mas.status != ma_start && 1426 - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 1427 - vma_iter_invalidate(vmi); 1428 - 1429 - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 1430 - mas_store_gfp(&vmi->mas, vma, gfp); 1431 - if (unlikely(mas_is_err(&vmi->mas))) 1432 - return -ENOMEM; 1433 - 1434 - return 0; 1435 - } 1436 - 1437 - static inline 1438 - struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) 1439 - { 1440 - return mas_prev_range(&vmi->mas, 0); 1441 - } 1442 - 1443 - /* 1444 - * VMA lock generalization 1445 - */ 1446 - struct vma_prepare { 1447 - struct vm_area_struct *vma; 1448 - struct vm_area_struct *adj_next; 1449 - struct file *file; 1450 - struct address_space *mapping; 1451 - struct anon_vma *anon_vma; 1452 - struct vm_area_struct *insert; 1453 - struct vm_area_struct *remove; 1454 - struct vm_area_struct *remove2; 1455 - }; 1456 - 1457 1440 void __meminit __init_single_page(struct page *page, unsigned long pfn, 1458 1441 unsigned long zone, int nid); 1459 1442 ··· 1424 1641 /* Only track the nodes of mappings with shadow entries */ 1425 1642 
void workingset_update_node(struct xa_node *node); 1426 1643 extern struct list_lru shadow_nodes; 1427 - 1428 - struct unlink_vma_file_batch { 1429 - int count; 1430 - struct vm_area_struct *vmas[8]; 1431 - }; 1432 - 1433 - void unlink_file_vma_batch_init(struct unlink_vma_file_batch *); 1434 - void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *); 1435 - void unlink_file_vma_batch_final(struct unlink_vma_file_batch *); 1436 1644 1437 1645 /* mremap.c */ 1438 1646 unsigned long move_page_tables(struct vm_area_struct *vma,

-1772

mm/mmap.c

··· 76 76 static bool ignore_rlimit_data; 77 77 core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); 78 78 79 - static void unmap_region(struct mm_struct *mm, struct ma_state *mas, 80 - struct vm_area_struct *vma, struct vm_area_struct *prev, 81 - struct vm_area_struct *next, unsigned long start, 82 - unsigned long end, unsigned long tree_end, bool mm_wr_locked); 83 - 84 - static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) 85 - { 86 - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); 87 - } 88 - 89 79 /* Update vma->vm_page_prot to reflect vma->vm_flags. */ 90 80 void vma_set_page_prot(struct vm_area_struct *vma) 91 81 { ··· 89 99 } 90 100 /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ 91 101 WRITE_ONCE(vma->vm_page_prot, vm_page_prot); 92 - } 93 - 94 - /* 95 - * Requires inode->i_mapping->i_mmap_rwsem 96 - */ 97 - static void __remove_shared_vm_struct(struct vm_area_struct *vma, 98 - struct address_space *mapping) 99 - { 100 - if (vma_is_shared_maywrite(vma)) 101 - mapping_unmap_writable(mapping); 102 - 103 - flush_dcache_mmap_lock(mapping); 104 - vma_interval_tree_remove(vma, &mapping->i_mmap); 105 - flush_dcache_mmap_unlock(mapping); 106 - } 107 - 108 - /* 109 - * Unlink a file-based vm structure from its interval tree, to hide 110 - * vma from rmap and vmtruncate before freeing its page tables. 
111 - */ 112 - void unlink_file_vma(struct vm_area_struct *vma) 113 - { 114 - struct file *file = vma->vm_file; 115 - 116 - if (file) { 117 - struct address_space *mapping = file->f_mapping; 118 - i_mmap_lock_write(mapping); 119 - __remove_shared_vm_struct(vma, mapping); 120 - i_mmap_unlock_write(mapping); 121 - } 122 - } 123 - 124 - void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) 125 - { 126 - vb->count = 0; 127 - } 128 - 129 - static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) 130 - { 131 - struct address_space *mapping; 132 - int i; 133 - 134 - mapping = vb->vmas[0]->vm_file->f_mapping; 135 - i_mmap_lock_write(mapping); 136 - for (i = 0; i < vb->count; i++) { 137 - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); 138 - __remove_shared_vm_struct(vb->vmas[i], mapping); 139 - } 140 - i_mmap_unlock_write(mapping); 141 - 142 - unlink_file_vma_batch_init(vb); 143 - } 144 - 145 - void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 146 - struct vm_area_struct *vma) 147 - { 148 - if (vma->vm_file == NULL) 149 - return; 150 - 151 - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || 152 - vb->count == ARRAY_SIZE(vb->vmas)) 153 - unlink_file_vma_batch_process(vb); 154 - 155 - vb->vmas[vb->count] = vma; 156 - vb->count++; 157 - } 158 - 159 - void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) 160 - { 161 - if (vb->count > 0) 162 - unlink_file_vma_batch_process(vb); 163 - } 164 - 165 - /* 166 - * Close a vm structure and free it. 
167 - */ 168 - static void remove_vma(struct vm_area_struct *vma, bool unreachable) 169 - { 170 - might_sleep(); 171 - if (vma->vm_ops && vma->vm_ops->close) 172 - vma->vm_ops->close(vma); 173 - if (vma->vm_file) 174 - fput(vma->vm_file); 175 - mpol_put(vma_policy(vma)); 176 - if (unreachable) 177 - __vm_area_free(vma); 178 - else 179 - vm_area_free(vma); 180 - } 181 - 182 - static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, 183 - unsigned long min) 184 - { 185 - return mas_prev(&vmi->mas, min); 186 102 } 187 103 188 104 /* ··· 212 316 mm->brk = origbrk; 213 317 mmap_write_unlock(mm); 214 318 return origbrk; 215 - } 216 - 217 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 218 - static void validate_mm(struct mm_struct *mm) 219 - { 220 - int bug = 0; 221 - int i = 0; 222 - struct vm_area_struct *vma; 223 - VMA_ITERATOR(vmi, mm, 0); 224 - 225 - mt_validate(&mm->mm_mt); 226 - for_each_vma(vmi, vma) { 227 - #ifdef CONFIG_DEBUG_VM_RB 228 - struct anon_vma *anon_vma = vma->anon_vma; 229 - struct anon_vma_chain *avc; 230 - #endif 231 - unsigned long vmi_start, vmi_end; 232 - bool warn = 0; 233 - 234 - vmi_start = vma_iter_addr(&vmi); 235 - vmi_end = vma_iter_end(&vmi); 236 - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) 237 - warn = 1; 238 - 239 - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) 240 - warn = 1; 241 - 242 - if (warn) { 243 - pr_emerg("issue in %s\n", current->comm); 244 - dump_stack(); 245 - dump_vma(vma); 246 - pr_emerg("tree range: %px start %lx end %lx\n", vma, 247 - vmi_start, vmi_end - 1); 248 - vma_iter_dump_tree(&vmi); 249 - } 250 - 251 - #ifdef CONFIG_DEBUG_VM_RB 252 - if (anon_vma) { 253 - anon_vma_lock_read(anon_vma); 254 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 255 - anon_vma_interval_tree_verify(avc); 256 - anon_vma_unlock_read(anon_vma); 257 - } 258 - #endif 259 - i++; 260 - } 261 - if (i != mm->map_count) { 262 - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); 263 - bug = 1; 
264 - } 265 - VM_BUG_ON_MM(bug, mm); 266 - } 267 - 268 - #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ 269 - #define validate_mm(mm) do { } while (0) 270 - #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ 271 - 272 - /* 273 - * vma has some anon_vma assigned, and is already inserted on that 274 - * anon_vma's interval trees. 275 - * 276 - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 277 - * vma must be removed from the anon_vma's interval trees using 278 - * anon_vma_interval_tree_pre_update_vma(). 279 - * 280 - * After the update, the vma will be reinserted using 281 - * anon_vma_interval_tree_post_update_vma(). 282 - * 283 - * The entire update must be protected by exclusive mmap_lock and by 284 - * the root anon_vma's mutex. 285 - */ 286 - static inline void 287 - anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 288 - { 289 - struct anon_vma_chain *avc; 290 - 291 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 292 - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 293 - } 294 - 295 - static inline void 296 - anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 297 - { 298 - struct anon_vma_chain *avc; 299 - 300 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 301 - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 302 - } 303 - 304 - static unsigned long count_vma_pages_range(struct mm_struct *mm, 305 - unsigned long addr, unsigned long end) 306 - { 307 - VMA_ITERATOR(vmi, mm, addr); 308 - struct vm_area_struct *vma; 309 - unsigned long nr_pages = 0; 310 - 311 - for_each_vma_range(vmi, vma, end) { 312 - unsigned long vm_start = max(addr, vma->vm_start); 313 - unsigned long vm_end = min(end, vma->vm_end); 314 - 315 - nr_pages += PHYS_PFN(vm_end - vm_start); 316 - } 317 - 318 - return nr_pages; 319 - } 320 - 321 - static void __vma_link_file(struct vm_area_struct *vma, 322 - struct address_space *mapping) 323 - { 324 - if (vma_is_shared_maywrite(vma)) 325 - 
mapping_allow_writable(mapping); 326 - 327 - flush_dcache_mmap_lock(mapping); 328 - vma_interval_tree_insert(vma, &mapping->i_mmap); 329 - flush_dcache_mmap_unlock(mapping); 330 - } 331 - 332 - static void vma_link_file(struct vm_area_struct *vma) 333 - { 334 - struct file *file = vma->vm_file; 335 - struct address_space *mapping; 336 - 337 - if (file) { 338 - mapping = file->f_mapping; 339 - i_mmap_lock_write(mapping); 340 - __vma_link_file(vma, mapping); 341 - i_mmap_unlock_write(mapping); 342 - } 343 - } 344 - 345 - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) 346 - { 347 - VMA_ITERATOR(vmi, mm, 0); 348 - 349 - vma_iter_config(&vmi, vma->vm_start, vma->vm_end); 350 - if (vma_iter_prealloc(&vmi, vma)) 351 - return -ENOMEM; 352 - 353 - vma_start_write(vma); 354 - vma_iter_store(&vmi, vma); 355 - vma_link_file(vma); 356 - mm->map_count++; 357 - validate_mm(mm); 358 - return 0; 359 - } 360 - 361 - /* 362 - * init_multi_vma_prep() - Initializer for struct vma_prepare 363 - * @vp: The vma_prepare struct 364 - * @vma: The vma that will be altered once locked 365 - * @next: The next vma if it is to be adjusted 366 - * @remove: The first vma to be removed 367 - * @remove2: The second vma to be removed 368 - */ 369 - static inline void init_multi_vma_prep(struct vma_prepare *vp, 370 - struct vm_area_struct *vma, struct vm_area_struct *next, 371 - struct vm_area_struct *remove, struct vm_area_struct *remove2) 372 - { 373 - memset(vp, 0, sizeof(struct vma_prepare)); 374 - vp->vma = vma; 375 - vp->anon_vma = vma->anon_vma; 376 - vp->remove = remove; 377 - vp->remove2 = remove2; 378 - vp->adj_next = next; 379 - if (!vp->anon_vma && next) 380 - vp->anon_vma = next->anon_vma; 381 - 382 - vp->file = vma->vm_file; 383 - if (vp->file) 384 - vp->mapping = vma->vm_file->f_mapping; 385 - 386 - } 387 - 388 - /* 389 - * init_vma_prep() - Initializer wrapper for vma_prepare struct 390 - * @vp: The vma_prepare struct 391 - * @vma: The vma that will be altered 
once locked 392 - */ 393 - static inline void init_vma_prep(struct vma_prepare *vp, 394 - struct vm_area_struct *vma) 395 - { 396 - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); 397 - } 398 - 399 - 400 - /* 401 - * vma_prepare() - Helper function for handling locking VMAs prior to altering 402 - * @vp: The initialized vma_prepare struct 403 - */ 404 - static inline void vma_prepare(struct vma_prepare *vp) 405 - { 406 - if (vp->file) { 407 - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); 408 - 409 - if (vp->adj_next) 410 - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, 411 - vp->adj_next->vm_end); 412 - 413 - i_mmap_lock_write(vp->mapping); 414 - if (vp->insert && vp->insert->vm_file) { 415 - /* 416 - * Put into interval tree now, so instantiated pages 417 - * are visible to arm/parisc __flush_dcache_page 418 - * throughout; but we cannot insert into address 419 - * space until vma start or end is updated. 420 - */ 421 - __vma_link_file(vp->insert, 422 - vp->insert->vm_file->f_mapping); 423 - } 424 - } 425 - 426 - if (vp->anon_vma) { 427 - anon_vma_lock_write(vp->anon_vma); 428 - anon_vma_interval_tree_pre_update_vma(vp->vma); 429 - if (vp->adj_next) 430 - anon_vma_interval_tree_pre_update_vma(vp->adj_next); 431 - } 432 - 433 - if (vp->file) { 434 - flush_dcache_mmap_lock(vp->mapping); 435 - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); 436 - if (vp->adj_next) 437 - vma_interval_tree_remove(vp->adj_next, 438 - &vp->mapping->i_mmap); 439 - } 440 - 441 - } 442 - 443 - /* 444 - * vma_complete- Helper function for handling the unlocking after altering VMAs, 445 - * or for inserting a VMA. 
446 - * 447 - * @vp: The vma_prepare struct 448 - * @vmi: The vma iterator 449 - * @mm: The mm_struct 450 - */ 451 - static inline void vma_complete(struct vma_prepare *vp, 452 - struct vma_iterator *vmi, struct mm_struct *mm) 453 - { 454 - if (vp->file) { 455 - if (vp->adj_next) 456 - vma_interval_tree_insert(vp->adj_next, 457 - &vp->mapping->i_mmap); 458 - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); 459 - flush_dcache_mmap_unlock(vp->mapping); 460 - } 461 - 462 - if (vp->remove && vp->file) { 463 - __remove_shared_vm_struct(vp->remove, vp->mapping); 464 - if (vp->remove2) 465 - __remove_shared_vm_struct(vp->remove2, vp->mapping); 466 - } else if (vp->insert) { 467 - /* 468 - * split_vma has split insert from vma, and needs 469 - * us to insert it before dropping the locks 470 - * (it may either follow vma or precede it). 471 - */ 472 - vma_iter_store(vmi, vp->insert); 473 - mm->map_count++; 474 - } 475 - 476 - if (vp->anon_vma) { 477 - anon_vma_interval_tree_post_update_vma(vp->vma); 478 - if (vp->adj_next) 479 - anon_vma_interval_tree_post_update_vma(vp->adj_next); 480 - anon_vma_unlock_write(vp->anon_vma); 481 - } 482 - 483 - if (vp->file) { 484 - i_mmap_unlock_write(vp->mapping); 485 - uprobe_mmap(vp->vma); 486 - 487 - if (vp->adj_next) 488 - uprobe_mmap(vp->adj_next); 489 - } 490 - 491 - if (vp->remove) { 492 - again: 493 - vma_mark_detached(vp->remove, true); 494 - if (vp->file) { 495 - uprobe_munmap(vp->remove, vp->remove->vm_start, 496 - vp->remove->vm_end); 497 - fput(vp->file); 498 - } 499 - if (vp->remove->anon_vma) 500 - anon_vma_merge(vp->vma, vp->remove); 501 - mm->map_count--; 502 - mpol_put(vma_policy(vp->remove)); 503 - if (!vp->remove2) 504 - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); 505 - vm_area_free(vp->remove); 506 - 507 - /* 508 - * In mprotect's case 6 (see comments on vma_merge), 509 - * we are removing both mid and next vmas 510 - */ 511 - if (vp->remove2) { 512 - vp->remove = vp->remove2; 513 - vp->remove2 = NULL; 
514 - goto again; 515 - } 516 - } 517 - if (vp->insert && vp->file) 518 - uprobe_mmap(vp->insert); 519 - validate_mm(mm); 520 - } 521 - 522 - /* 523 - * dup_anon_vma() - Helper function to duplicate anon_vma 524 - * @dst: The destination VMA 525 - * @src: The source VMA 526 - * @dup: Pointer to the destination VMA when successful. 527 - * 528 - * Returns: 0 on success. 529 - */ 530 - static inline int dup_anon_vma(struct vm_area_struct *dst, 531 - struct vm_area_struct *src, struct vm_area_struct **dup) 532 - { 533 - /* 534 - * Easily overlooked: when mprotect shifts the boundary, make sure the 535 - * expanding vma has anon_vma set if the shrinking vma had, to cover any 536 - * anon pages imported. 537 - */ 538 - if (src->anon_vma && !dst->anon_vma) { 539 - int ret; 540 - 541 - vma_assert_write_locked(dst); 542 - dst->anon_vma = src->anon_vma; 543 - ret = anon_vma_clone(dst, src); 544 - if (ret) 545 - return ret; 546 - 547 - *dup = dst; 548 - } 549 - 550 - return 0; 551 - } 552 - 553 - /* 554 - * vma_expand - Expand an existing VMA 555 - * 556 - * @vmi: The vma iterator 557 - * @vma: The vma to expand 558 - * @start: The start of the vma 559 - * @end: The exclusive end of the vma 560 - * @pgoff: The page offset of vma 561 - * @next: The current of next vma. 562 - * 563 - * Expand @vma to @start and @end. Can expand off the start and end. Will 564 - * expand over @next if it's different from @vma and @end == @next->vm_end. 565 - * Checking if the @vma can expand and merge with @next needs to be handled by 566 - * the caller. 
567 - * 568 - * Returns: 0 on success 569 - */ 570 - int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 571 - unsigned long start, unsigned long end, pgoff_t pgoff, 572 - struct vm_area_struct *next) 573 - { 574 - struct vm_area_struct *anon_dup = NULL; 575 - bool remove_next = false; 576 - struct vma_prepare vp; 577 - 578 - vma_start_write(vma); 579 - if (next && (vma != next) && (end == next->vm_end)) { 580 - int ret; 581 - 582 - remove_next = true; 583 - vma_start_write(next); 584 - ret = dup_anon_vma(vma, next, &anon_dup); 585 - if (ret) 586 - return ret; 587 - } 588 - 589 - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); 590 - /* Not merging but overwriting any part of next is not handled. */ 591 - VM_WARN_ON(next && !vp.remove && 592 - next != vma && end > next->vm_start); 593 - /* Only handles expanding */ 594 - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); 595 - 596 - /* Note: vma iterator must be pointing to 'start' */ 597 - vma_iter_config(vmi, start, end); 598 - if (vma_iter_prealloc(vmi, vma)) 599 - goto nomem; 600 - 601 - vma_prepare(&vp); 602 - vma_adjust_trans_huge(vma, start, end, 0); 603 - vma_set_range(vma, start, end, pgoff); 604 - vma_iter_store(vmi, vma); 605 - 606 - vma_complete(&vp, vmi, vma->vm_mm); 607 - return 0; 608 - 609 - nomem: 610 - if (anon_dup) 611 - unlink_anon_vmas(anon_dup); 612 - return -ENOMEM; 613 - } 614 - 615 - /* 616 - * vma_shrink() - Reduce an existing VMAs memory area 617 - * @vmi: The vma iterator 618 - * @vma: The VMA to modify 619 - * @start: The new start 620 - * @end: The new end 621 - * 622 - * Returns: 0 on success, -ENOMEM otherwise 623 - */ 624 - int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 625 - unsigned long start, unsigned long end, pgoff_t pgoff) 626 - { 627 - struct vma_prepare vp; 628 - 629 - WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); 630 - 631 - if (vma->vm_start < start) 632 - vma_iter_config(vmi, vma->vm_start, 
start); 633 - else 634 - vma_iter_config(vmi, end, vma->vm_end); 635 - 636 - if (vma_iter_prealloc(vmi, NULL)) 637 - return -ENOMEM; 638 - 639 - vma_start_write(vma); 640 - 641 - init_vma_prep(&vp, vma); 642 - vma_prepare(&vp); 643 - vma_adjust_trans_huge(vma, start, end, 0); 644 - 645 - vma_iter_clear(vmi); 646 - vma_set_range(vma, start, end, pgoff); 647 - vma_complete(&vp, vmi, vma->vm_mm); 648 - return 0; 649 - } 650 - 651 - /* 652 - * If the vma has a ->close operation then the driver probably needs to release 653 - * per-vma resources, so we don't attempt to merge those if the caller indicates 654 - * the current vma may be removed as part of the merge. 655 - */ 656 - static inline bool is_mergeable_vma(struct vm_area_struct *vma, 657 - struct file *file, unsigned long vm_flags, 658 - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 659 - struct anon_vma_name *anon_name, bool may_remove_vma) 660 - { 661 - /* 662 - * VM_SOFTDIRTY should not prevent from VMA merging, if we 663 - * match the flags but dirty bit -- the caller should mark 664 - * merged VMA as dirty. If dirty bit won't be excluded from 665 - * comparison, we increase pressure on the memory system forcing 666 - * the kernel to generate new VMAs when old one could be 667 - * extended instead. 668 - */ 669 - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 670 - return false; 671 - if (vma->vm_file != file) 672 - return false; 673 - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) 674 - return false; 675 - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) 676 - return false; 677 - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) 678 - return false; 679 - return true; 680 - } 681 - 682 - static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, 683 - struct anon_vma *anon_vma2, struct vm_area_struct *vma) 684 - { 685 - /* 686 - * The list_is_singular() test is to avoid merging VMA cloned from 687 - * parents. This can improve scalability caused by anon_vma lock. 
688 - */ 689 - if ((!anon_vma1 || !anon_vma2) && (!vma || 690 - list_is_singular(&vma->anon_vma_chain))) 691 - return true; 692 - return anon_vma1 == anon_vma2; 693 - } 694 - 695 - /* 696 - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 697 - * in front of (at a lower virtual address and file offset than) the vma. 698 - * 699 - * We cannot merge two vmas if they have differently assigned (non-NULL) 700 - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 701 - * 702 - * We don't check here for the merged mmap wrapping around the end of pagecache 703 - * indices (16TB on ia32) because do_mmap() does not permit mmap's which 704 - * wrap, nor mmaps which cover the final page at index -1UL. 705 - * 706 - * We assume the vma may be removed as part of the merge. 707 - */ 708 - static bool 709 - can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 710 - struct anon_vma *anon_vma, struct file *file, 711 - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 712 - struct anon_vma_name *anon_name) 713 - { 714 - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && 715 - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 716 - if (vma->vm_pgoff == vm_pgoff) 717 - return true; 718 - } 719 - return false; 720 - } 721 - 722 - /* 723 - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 724 - * beyond (at a higher virtual address and file offset than) the vma. 725 - * 726 - * We cannot merge two vmas if they have differently assigned (non-NULL) 727 - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 728 - * 729 - * We assume that vma is not removed as part of the merge. 
730 - */ 731 - static bool 732 - can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 733 - struct anon_vma *anon_vma, struct file *file, 734 - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 735 - struct anon_vma_name *anon_name) 736 - { 737 - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && 738 - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 739 - pgoff_t vm_pglen; 740 - vm_pglen = vma_pages(vma); 741 - if (vma->vm_pgoff + vm_pglen == vm_pgoff) 742 - return true; 743 - } 744 - return false; 745 - } 746 - 747 - /* 748 - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), 749 - * figure out whether that can be merged with its predecessor or its 750 - * successor. Or both (it neatly fills a hole). 751 - * 752 - * In most cases - when called for mmap, brk or mremap - [addr,end) is 753 - * certain not to be mapped by the time vma_merge is called; but when 754 - * called for mprotect, it is certain to be already mapped (either at 755 - * an offset within prev, or at the start of next), and the flags of 756 - * this area are about to be changed to vm_flags - and the no-change 757 - * case has already been eliminated. 
758 - * 759 - * The following mprotect cases have to be considered, where **** is 760 - * the area passed down from mprotect_fixup, never extending beyond one 761 - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts 762 - * at the same address as **** and is of the same or larger span, and 763 - * NNNN the next vma after ****: 764 - * 765 - * **** **** **** 766 - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC 767 - * cannot merge might become might become 768 - * PPNNNNNNNNNN PPPPPPPPPPCC 769 - * mmap, brk or case 4 below case 5 below 770 - * mremap move: 771 - * **** **** 772 - * PPPP NNNN PPPPCCCCNNNN 773 - * might become might become 774 - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or 775 - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or 776 - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 777 - * 778 - * It is important for case 8 that the vma CCCC overlapping the 779 - * region **** is never going to extended over NNNN. Instead NNNN must 780 - * be extended in region **** and CCCC must be removed. This way in 781 - * all cases where vma_merge succeeds, the moment vma_merge drops the 782 - * rmap_locks, the properties of the merged vma will be already 783 - * correct for the whole merged range. Some of those properties like 784 - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must 785 - * be correct for the whole merged range immediately after the 786 - * rmap_locks are released. Otherwise if NNNN would be removed and 787 - * CCCC would be extended over the NNNN range, remove_migration_ptes 788 - * or other rmap walkers (if working on addresses beyond the "end" 789 - * parameter) may establish ptes with the wrong permissions of CCCC 790 - * instead of the right permissions of NNNN. 
791 - * 792 - * In the code below: 793 - * PPPP is represented by *prev 794 - * CCCC is represented by *curr or not represented at all (NULL) 795 - * NNNN is represented by *next or not represented at all (NULL) 796 - * **** is not represented - it will be merged and the vma containing the 797 - * area is returned, or the function will return NULL 798 - */ 799 - static struct vm_area_struct 800 - *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, 801 - struct vm_area_struct *src, unsigned long addr, unsigned long end, 802 - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, 803 - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 804 - struct anon_vma_name *anon_name) 805 - { 806 - struct mm_struct *mm = src->vm_mm; 807 - struct anon_vma *anon_vma = src->anon_vma; 808 - struct file *file = src->vm_file; 809 - struct vm_area_struct *curr, *next, *res; 810 - struct vm_area_struct *vma, *adjust, *remove, *remove2; 811 - struct vm_area_struct *anon_dup = NULL; 812 - struct vma_prepare vp; 813 - pgoff_t vma_pgoff; 814 - int err = 0; 815 - bool merge_prev = false; 816 - bool merge_next = false; 817 - bool vma_expanded = false; 818 - unsigned long vma_start = addr; 819 - unsigned long vma_end = end; 820 - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 821 - long adj_start = 0; 822 - 823 - /* 824 - * We later require that vma->vm_flags == vm_flags, 825 - * so this tests vma->vm_flags & VM_SPECIAL, too. 826 - */ 827 - if (vm_flags & VM_SPECIAL) 828 - return NULL; 829 - 830 - /* Does the input range span an existing VMA? (cases 5 - 8) */ 831 - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); 832 - 833 - if (!curr || /* cases 1 - 4 */ 834 - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ 835 - next = vma_lookup(mm, end); 836 - else 837 - next = NULL; /* case 5 */ 838 - 839 - if (prev) { 840 - vma_start = prev->vm_start; 841 - vma_pgoff = prev->vm_pgoff; 842 - 843 - /* Can we merge the predecessor? 
*/ 844 - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) 845 - && can_vma_merge_after(prev, vm_flags, anon_vma, file, 846 - pgoff, vm_userfaultfd_ctx, anon_name)) { 847 - merge_prev = true; 848 - vma_prev(vmi); 849 - } 850 - } 851 - 852 - /* Can we merge the successor? */ 853 - if (next && mpol_equal(policy, vma_policy(next)) && 854 - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, 855 - vm_userfaultfd_ctx, anon_name)) { 856 - merge_next = true; 857 - } 858 - 859 - /* Verify some invariant that must be enforced by the caller. */ 860 - VM_WARN_ON(prev && addr <= prev->vm_start); 861 - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); 862 - VM_WARN_ON(addr >= end); 863 - 864 - if (!merge_prev && !merge_next) 865 - return NULL; /* Not mergeable. */ 866 - 867 - if (merge_prev) 868 - vma_start_write(prev); 869 - 870 - res = vma = prev; 871 - remove = remove2 = adjust = NULL; 872 - 873 - /* Can we merge both the predecessor and the successor? */ 874 - if (merge_prev && merge_next && 875 - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { 876 - vma_start_write(next); 877 - remove = next; /* case 1 */ 878 - vma_end = next->vm_end; 879 - err = dup_anon_vma(prev, next, &anon_dup); 880 - if (curr) { /* case 6 */ 881 - vma_start_write(curr); 882 - remove = curr; 883 - remove2 = next; 884 - /* 885 - * Note that the dup_anon_vma below cannot overwrite err 886 - * since the first caller would do nothing unless next 887 - * has an anon_vma. 
888 - */ 889 - if (!next->anon_vma) 890 - err = dup_anon_vma(prev, curr, &anon_dup); 891 - } 892 - } else if (merge_prev) { /* case 2 */ 893 - if (curr) { 894 - vma_start_write(curr); 895 - if (end == curr->vm_end) { /* case 7 */ 896 - /* 897 - * can_vma_merge_after() assumed we would not be 898 - * removing prev vma, so it skipped the check 899 - * for vm_ops->close, but we are removing curr 900 - */ 901 - if (curr->vm_ops && curr->vm_ops->close) 902 - err = -EINVAL; 903 - remove = curr; 904 - } else { /* case 5 */ 905 - adjust = curr; 906 - adj_start = (end - curr->vm_start); 907 - } 908 - if (!err) 909 - err = dup_anon_vma(prev, curr, &anon_dup); 910 - } 911 - } else { /* merge_next */ 912 - vma_start_write(next); 913 - res = next; 914 - if (prev && addr < prev->vm_end) { /* case 4 */ 915 - vma_start_write(prev); 916 - vma_end = addr; 917 - adjust = next; 918 - adj_start = -(prev->vm_end - addr); 919 - err = dup_anon_vma(next, prev, &anon_dup); 920 - } else { 921 - /* 922 - * Note that cases 3 and 8 are the ONLY ones where prev 923 - * is permitted to be (but is not necessarily) NULL. 924 - */ 925 - vma = next; /* case 3 */ 926 - vma_start = addr; 927 - vma_end = next->vm_end; 928 - vma_pgoff = next->vm_pgoff - pglen; 929 - if (curr) { /* case 8 */ 930 - vma_pgoff = curr->vm_pgoff; 931 - vma_start_write(curr); 932 - remove = curr; 933 - err = dup_anon_vma(next, curr, &anon_dup); 934 - } 935 - } 936 - } 937 - 938 - /* Error in anon_vma clone. 
*/ 939 - if (err) 940 - goto anon_vma_fail; 941 - 942 - if (vma_start < vma->vm_start || vma_end > vma->vm_end) 943 - vma_expanded = true; 944 - 945 - if (vma_expanded) { 946 - vma_iter_config(vmi, vma_start, vma_end); 947 - } else { 948 - vma_iter_config(vmi, adjust->vm_start + adj_start, 949 - adjust->vm_end); 950 - } 951 - 952 - if (vma_iter_prealloc(vmi, vma)) 953 - goto prealloc_fail; 954 - 955 - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); 956 - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && 957 - vp.anon_vma != adjust->anon_vma); 958 - 959 - vma_prepare(&vp); 960 - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); 961 - vma_set_range(vma, vma_start, vma_end, vma_pgoff); 962 - 963 - if (vma_expanded) 964 - vma_iter_store(vmi, vma); 965 - 966 - if (adj_start) { 967 - adjust->vm_start += adj_start; 968 - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; 969 - if (adj_start < 0) { 970 - WARN_ON(vma_expanded); 971 - vma_iter_store(vmi, next); 972 - } 973 - } 974 - 975 - vma_complete(&vp, vmi, mm); 976 - khugepaged_enter_vma(res, vm_flags); 977 - return res; 978 - 979 - prealloc_fail: 980 - if (anon_dup) 981 - unlink_anon_vmas(anon_dup); 982 - 983 - anon_vma_fail: 984 - vma_iter_set(vmi, addr); 985 - vma_iter_load(vmi); 986 - return NULL; 987 - } 988 - 989 - /* 990 - * Rough compatibility check to quickly see if it's even worth looking 991 - * at sharing an anon_vma. 992 - * 993 - * They need to have the same vm_file, and the flags can only differ 994 - * in things that mprotect may change. 995 - * 996 - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 997 - * we can merge the two vma's. For example, we refuse to merge a vma if 998 - * there is a vm_ops->close() function, because that indicates that the 999 - * driver is doing some kind of reference counting. But that doesn't 1000 - * really matter for the anon_vma sharing case. 
1001 - */ 1002 - static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1003 - { 1004 - return a->vm_end == b->vm_start && 1005 - mpol_equal(vma_policy(a), vma_policy(b)) && 1006 - a->vm_file == b->vm_file && 1007 - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && 1008 - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1009 - } 1010 - 1011 - /* 1012 - * Do some basic sanity checking to see if we can re-use the anon_vma 1013 - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1014 - * the same as 'old', the other will be the new one that is trying 1015 - * to share the anon_vma. 1016 - * 1017 - * NOTE! This runs with mmap_lock held for reading, so it is possible that 1018 - * the anon_vma of 'old' is concurrently in the process of being set up 1019 - * by another page fault trying to merge _that_. But that's ok: if it 1020 - * is being set up, that automatically means that it will be a singleton 1021 - * acceptable for merging, so we can do all of this optimistically. But 1022 - * we do that READ_ONCE() to make sure that we never re-load the pointer. 1023 - * 1024 - * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1025 - * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1026 - * is to return an anon_vma that is "complex" due to having gone through 1027 - * a fork). 1028 - * 1029 - * We also make sure that the two vma's are compatible (adjacent, 1030 - * and with the same memory policies). That's all stable, even with just 1031 - * a read lock on the mmap_lock. 
1032 - */ 1033 - static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1034 - { 1035 - if (anon_vma_compatible(a, b)) { 1036 - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1037 - 1038 - if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1039 - return anon_vma; 1040 - } 1041 - return NULL; 1042 - } 1043 - 1044 - /* 1045 - * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1046 - * neighbouring vmas for a suitable anon_vma, before it goes off 1047 - * to allocate a new anon_vma. It checks because a repetitive 1048 - * sequence of mprotects and faults may otherwise lead to distinct 1049 - * anon_vmas being allocated, preventing vma merge in subsequent 1050 - * mprotect. 1051 - */ 1052 - struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1053 - { 1054 - struct anon_vma *anon_vma = NULL; 1055 - struct vm_area_struct *prev, *next; 1056 - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1057 - 1058 - /* Try next first. */ 1059 - next = vma_iter_load(&vmi); 1060 - if (next) { 1061 - anon_vma = reusable_anon_vma(next, vma, next); 1062 - if (anon_vma) 1063 - return anon_vma; 1064 - } 1065 - 1066 - prev = vma_prev(&vmi); 1067 - VM_BUG_ON_VMA(prev != vma, vma); 1068 - prev = vma_prev(&vmi); 1069 - /* Try prev next. */ 1070 - if (prev) 1071 - anon_vma = reusable_anon_vma(prev, prev, vma); 1072 - 1073 - /* 1074 - * We might reach here with anon_vma == NULL if we can't find 1075 - * any reusable anon_vma. 1076 - * There's no absolute need to look only at touching neighbours: 1077 - * we could search further afield for "compatible" anon_vmas. 1078 - * But it would probably just be a waste of time searching, 1079 - * or lead to too many vmas hanging off the same anon_vma. 1080 - * We're trying to allow mprotect remerging later on, 1081 - * not trying to minimize memory used for anon_vmas. 
1082 - */ 1083 - return anon_vma; 1084 319 } 1085 320 1086 321 /* ··· 575 1548 a.offset >> PAGE_SHIFT); 576 1549 } 577 1550 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 578 - 579 - static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 580 - { 581 - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 582 - } 583 - 584 - static bool vma_is_shared_writable(struct vm_area_struct *vma) 585 - { 586 - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 587 - (VM_WRITE | VM_SHARED); 588 - } 589 - 590 - static bool vma_fs_can_writeback(struct vm_area_struct *vma) 591 - { 592 - /* No managed pages to writeback. */ 593 - if (vma->vm_flags & VM_PFNMAP) 594 - return false; 595 - 596 - return vma->vm_file && vma->vm_file->f_mapping && 597 - mapping_can_writeback(vma->vm_file->f_mapping); 598 - } 599 - 600 - /* 601 - * Does this VMA require the underlying folios to have their dirty state 602 - * tracked? 603 - */ 604 - bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 605 - { 606 - /* Only shared, writable VMAs require dirty tracking. */ 607 - if (!vma_is_shared_writable(vma)) 608 - return false; 609 - 610 - /* Does the filesystem need to be notified? */ 611 - if (vm_ops_needs_writenotify(vma->vm_ops)) 612 - return true; 613 - 614 - /* 615 - * Even if the filesystem doesn't indicate a need for writenotify, if it 616 - * can writeback, dirty tracking is still required. 617 - */ 618 - return vma_fs_can_writeback(vma); 619 - } 620 - 621 - /* 622 - * Some shared mappings will want the pages marked read-only 623 - * to track write events. If so, we'll downgrade vm_page_prot 624 - * to the private version (using protection_map[] without the 625 - * VM_SHARED bit). 
626 - */ 627 - bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 628 - { 629 - /* If it was private or non-writable, the write bit is already clear */ 630 - if (!vma_is_shared_writable(vma)) 631 - return false; 632 - 633 - /* The backer wishes to know when pages are first written to? */ 634 - if (vm_ops_needs_writenotify(vma->vm_ops)) 635 - return true; 636 - 637 - /* The open routine did something to the protections that pgprot_modify 638 - * won't preserve? */ 639 - if (pgprot_val(vm_page_prot) != 640 - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 641 - return false; 642 - 643 - /* 644 - * Do we need to track softdirty? hugetlb does not support softdirty 645 - * tracking yet. 646 - */ 647 - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 648 - return true; 649 - 650 - /* Do we need write faults for uffd-wp tracking? */ 651 - if (userfaultfd_wp(vma)) 652 - return true; 653 - 654 - /* Can the mapping track the dirty pages? */ 655 - return vma_fs_can_writeback(vma); 656 - } 657 1551 658 1552 /* 659 1553 * We account for memory if it's a private writeable mapping, ··· 1341 2393 return vma; 1342 2394 } 1343 2395 1344 - /* 1345 - * Ok - we have the memory areas we should free on a maple tree so release them, 1346 - * and do the vma updates. 1347 - * 1348 - * Called with the mm semaphore held. 
1349 - */ 1350 - static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) 1351 - { 1352 - unsigned long nr_accounted = 0; 1353 - struct vm_area_struct *vma; 1354 - 1355 - /* Update high watermark before we lower total_vm */ 1356 - update_hiwater_vm(mm); 1357 - mas_for_each(mas, vma, ULONG_MAX) { 1358 - long nrpages = vma_pages(vma); 1359 - 1360 - if (vma->vm_flags & VM_ACCOUNT) 1361 - nr_accounted += nrpages; 1362 - vm_stat_account(mm, vma->vm_flags, -nrpages); 1363 - remove_vma(vma, false); 1364 - } 1365 - vm_unacct_memory(nr_accounted); 1366 - } 1367 - 1368 - /* 1369 - * Get rid of page table information in the indicated region. 1370 - * 1371 - * Called with the mm semaphore held. 1372 - */ 1373 - static void unmap_region(struct mm_struct *mm, struct ma_state *mas, 1374 - struct vm_area_struct *vma, struct vm_area_struct *prev, 1375 - struct vm_area_struct *next, unsigned long start, 1376 - unsigned long end, unsigned long tree_end, bool mm_wr_locked) 1377 - { 1378 - struct mmu_gather tlb; 1379 - unsigned long mt_start = mas->index; 1380 - 1381 - lru_add_drain(); 1382 - tlb_gather_mmu(&tlb, mm); 1383 - update_hiwater_rss(mm); 1384 - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); 1385 - mas_set(mas, mt_start); 1386 - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 1387 - next ? next->vm_start : USER_PGTABLES_CEILING, 1388 - mm_wr_locked); 1389 - tlb_finish_mmu(&tlb); 1390 - } 1391 - 1392 - /* 1393 - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it 1394 - * has already been checked or doesn't make sense to fail. 1395 - * VMA Iterator will point to the end VMA. 
1396 - */ 1397 - static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 1398 - unsigned long addr, int new_below) 1399 - { 1400 - struct vma_prepare vp; 1401 - struct vm_area_struct *new; 1402 - int err; 1403 - 1404 - WARN_ON(vma->vm_start >= addr); 1405 - WARN_ON(vma->vm_end <= addr); 1406 - 1407 - if (vma->vm_ops && vma->vm_ops->may_split) { 1408 - err = vma->vm_ops->may_split(vma, addr); 1409 - if (err) 1410 - return err; 1411 - } 1412 - 1413 - new = vm_area_dup(vma); 1414 - if (!new) 1415 - return -ENOMEM; 1416 - 1417 - if (new_below) { 1418 - new->vm_end = addr; 1419 - } else { 1420 - new->vm_start = addr; 1421 - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 1422 - } 1423 - 1424 - err = -ENOMEM; 1425 - vma_iter_config(vmi, new->vm_start, new->vm_end); 1426 - if (vma_iter_prealloc(vmi, new)) 1427 - goto out_free_vma; 1428 - 1429 - err = vma_dup_policy(vma, new); 1430 - if (err) 1431 - goto out_free_vmi; 1432 - 1433 - err = anon_vma_clone(new, vma); 1434 - if (err) 1435 - goto out_free_mpol; 1436 - 1437 - if (new->vm_file) 1438 - get_file(new->vm_file); 1439 - 1440 - if (new->vm_ops && new->vm_ops->open) 1441 - new->vm_ops->open(new); 1442 - 1443 - vma_start_write(vma); 1444 - vma_start_write(new); 1445 - 1446 - init_vma_prep(&vp, vma); 1447 - vp.insert = new; 1448 - vma_prepare(&vp); 1449 - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); 1450 - 1451 - if (new_below) { 1452 - vma->vm_start = addr; 1453 - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; 1454 - } else { 1455 - vma->vm_end = addr; 1456 - } 1457 - 1458 - /* vma_complete stores the new vma */ 1459 - vma_complete(&vp, vmi, vma->vm_mm); 1460 - 1461 - /* Success. 
*/ 1462 - if (new_below) 1463 - vma_next(vmi); 1464 - return 0; 1465 - 1466 - out_free_mpol: 1467 - mpol_put(vma_policy(new)); 1468 - out_free_vmi: 1469 - vma_iter_free(vmi); 1470 - out_free_vma: 1471 - vm_area_free(new); 1472 - return err; 1473 - } 1474 - 1475 - /* 1476 - * Split a vma into two pieces at address 'addr', a new vma is allocated 1477 - * either for the first part or the tail. 1478 - */ 1479 - static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 1480 - unsigned long addr, int new_below) 1481 - { 1482 - if (vma->vm_mm->map_count >= sysctl_max_map_count) 1483 - return -ENOMEM; 1484 - 1485 - return __split_vma(vmi, vma, addr, new_below); 1486 - } 1487 - 1488 - /* 1489 - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd 1490 - * context and anonymous VMA name within the range [start, end). 1491 - * 1492 - * As a result, we might be able to merge the newly modified VMA range with an 1493 - * adjacent VMA with identical properties. 1494 - * 1495 - * If no merge is possible and the range does not span the entirety of the VMA, 1496 - * we then need to split the VMA to accommodate the change. 1497 - * 1498 - * The function returns either the merged VMA, the original VMA if a split was 1499 - * required instead, or an error if the split failed. 
1500 - */ 1501 - struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1502 - struct vm_area_struct *prev, 1503 - struct vm_area_struct *vma, 1504 - unsigned long start, unsigned long end, 1505 - unsigned long vm_flags, 1506 - struct mempolicy *policy, 1507 - struct vm_userfaultfd_ctx uffd_ctx, 1508 - struct anon_vma_name *anon_name) 1509 - { 1510 - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 1511 - struct vm_area_struct *merged; 1512 - 1513 - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, 1514 - pgoff, policy, uffd_ctx, anon_name); 1515 - if (merged) 1516 - return merged; 1517 - 1518 - if (vma->vm_start < start) { 1519 - int err = split_vma(vmi, vma, start, 1); 1520 - 1521 - if (err) 1522 - return ERR_PTR(err); 1523 - } 1524 - 1525 - if (vma->vm_end > end) { 1526 - int err = split_vma(vmi, vma, end, 0); 1527 - 1528 - if (err) 1529 - return ERR_PTR(err); 1530 - } 1531 - 1532 - return vma; 1533 - } 1534 - 1535 - /* 1536 - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller 1537 - * must ensure that [start, end) does not overlap any existing VMA. 1538 - */ 1539 - static struct vm_area_struct 1540 - *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, 1541 - struct vm_area_struct *vma, unsigned long start, 1542 - unsigned long end, pgoff_t pgoff) 1543 - { 1544 - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, 1545 - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1546 - } 1547 - 1548 - /* 1549 - * Expand vma by delta bytes, potentially merging with an immediately adjacent 1550 - * VMA with identical properties. 1551 - */ 1552 - struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1553 - struct vm_area_struct *vma, 1554 - unsigned long delta) 1555 - { 1556 - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); 1557 - 1558 - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ 1559 - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, 1560 - vma->vm_flags, pgoff, vma_policy(vma), 1561 - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1562 - } 1563 - 1564 - /* 1565 - * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 1566 - * @vmi: The vma iterator 1567 - * @vma: The starting vm_area_struct 1568 - * @mm: The mm_struct 1569 - * @start: The aligned start address to munmap. 1570 - * @end: The aligned end address to munmap. 1571 - * @uf: The userfaultfd list_head 1572 - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on 1573 - * success. 1574 - * 1575 - * Return: 0 on success and drops the lock if so directed, error and leaves the 1576 - * lock held otherwise. 1577 - */ 1578 - static int 1579 - do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, 1580 - struct mm_struct *mm, unsigned long start, 1581 - unsigned long end, struct list_head *uf, bool unlock) 1582 - { 1583 - struct vm_area_struct *prev, *next = NULL; 1584 - struct maple_tree mt_detach; 1585 - int count = 0; 1586 - int error = -ENOMEM; 1587 - unsigned long locked_vm = 0; 1588 - MA_STATE(mas_detach, &mt_detach, 0, 0); 1589 - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 1590 - mt_on_stack(mt_detach); 1591 - 1592 - /* 1593 - * If we need to split any vma, do it now to save pain later. 1594 - * 1595 - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 1596 - * unmapped vm_area_struct will remain in use: so lower split_vma 1597 - * places tmp vma above, and higher split_vma places tmp vma below. 1598 - */ 1599 - 1600 - /* Does it split the first one? */ 1601 - if (start > vma->vm_start) { 1602 - 1603 - /* 1604 - * Make sure that map_count on return from munmap() will 1605 - * not exceed its limit; but let map_count go just above 1606 - * its limit temporarily, to help free resources as expected. 
1607 - */ 1608 - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 1609 - goto map_count_exceeded; 1610 - 1611 - error = __split_vma(vmi, vma, start, 1); 1612 - if (error) 1613 - goto start_split_failed; 1614 - } 1615 - 1616 - /* 1617 - * Detach a range of VMAs from the mm. Using next as a temp variable as 1618 - * it is always overwritten. 1619 - */ 1620 - next = vma; 1621 - do { 1622 - /* Does it split the end? */ 1623 - if (next->vm_end > end) { 1624 - error = __split_vma(vmi, next, end, 0); 1625 - if (error) 1626 - goto end_split_failed; 1627 - } 1628 - vma_start_write(next); 1629 - mas_set(&mas_detach, count); 1630 - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); 1631 - if (error) 1632 - goto munmap_gather_failed; 1633 - vma_mark_detached(next, true); 1634 - if (next->vm_flags & VM_LOCKED) 1635 - locked_vm += vma_pages(next); 1636 - 1637 - count++; 1638 - if (unlikely(uf)) { 1639 - /* 1640 - * If userfaultfd_unmap_prep returns an error the vmas 1641 - * will remain split, but userland will get a 1642 - * highly unexpected error anyway. This is no 1643 - * different than the case where the first of the two 1644 - * __split_vma fails, but we don't undo the first 1645 - * split, despite we could. This is unlikely enough 1646 - * failure that it's not worth optimizing it for. 1647 - */ 1648 - error = userfaultfd_unmap_prep(next, start, end, uf); 1649 - 1650 - if (error) 1651 - goto userfaultfd_error; 1652 - } 1653 - #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 1654 - BUG_ON(next->vm_start < start); 1655 - BUG_ON(next->vm_start > end); 1656 - #endif 1657 - } for_each_vma_range(*vmi, next, end); 1658 - 1659 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 1660 - /* Make sure no VMAs are about to be lost. 
*/ 1661 - { 1662 - MA_STATE(test, &mt_detach, 0, 0); 1663 - struct vm_area_struct *vma_mas, *vma_test; 1664 - int test_count = 0; 1665 - 1666 - vma_iter_set(vmi, start); 1667 - rcu_read_lock(); 1668 - vma_test = mas_find(&test, count - 1); 1669 - for_each_vma_range(*vmi, vma_mas, end) { 1670 - BUG_ON(vma_mas != vma_test); 1671 - test_count++; 1672 - vma_test = mas_next(&test, count - 1); 1673 - } 1674 - rcu_read_unlock(); 1675 - BUG_ON(count != test_count); 1676 - } 1677 - #endif 1678 - 1679 - while (vma_iter_addr(vmi) > start) 1680 - vma_iter_prev_range(vmi); 1681 - 1682 - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); 1683 - if (error) 1684 - goto clear_tree_failed; 1685 - 1686 - /* Point of no return */ 1687 - mm->locked_vm -= locked_vm; 1688 - mm->map_count -= count; 1689 - if (unlock) 1690 - mmap_write_downgrade(mm); 1691 - 1692 - prev = vma_iter_prev_range(vmi); 1693 - next = vma_next(vmi); 1694 - if (next) 1695 - vma_iter_prev_range(vmi); 1696 - 1697 - /* 1698 - * We can free page tables without write-locking mmap_lock because VMAs 1699 - * were isolated before we downgraded mmap_lock. 1700 - */ 1701 - mas_set(&mas_detach, 1); 1702 - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, 1703 - !unlock); 1704 - /* Statistics and freeing VMAs */ 1705 - mas_set(&mas_detach, 0); 1706 - remove_mt(mm, &mas_detach); 1707 - validate_mm(mm); 1708 - if (unlock) 1709 - mmap_read_unlock(mm); 1710 - 1711 - __mt_destroy(&mt_detach); 1712 - return 0; 1713 - 1714 - clear_tree_failed: 1715 - userfaultfd_error: 1716 - munmap_gather_failed: 1717 - end_split_failed: 1718 - mas_set(&mas_detach, 0); 1719 - mas_for_each(&mas_detach, next, end) 1720 - vma_mark_detached(next, false); 1721 - 1722 - __mt_destroy(&mt_detach); 1723 - start_split_failed: 1724 - map_count_exceeded: 1725 - validate_mm(mm); 1726 - return error; 1727 - } 1728 - 1729 - /* 1730 - * do_vmi_munmap() - munmap a given range. 
1731 - * @vmi: The vma iterator 1732 - * @mm: The mm_struct 1733 - * @start: The start address to munmap 1734 - * @len: The length of the range to munmap 1735 - * @uf: The userfaultfd list_head 1736 - * @unlock: set to true if the user wants to drop the mmap_lock on success 1737 - * 1738 - * This function takes a @mas that is either pointing to the previous VMA or set 1739 - * to MA_START and sets it up to remove the mapping(s). The @len will be 1740 - * aligned and any arch_unmap work will be preformed. 1741 - * 1742 - * Return: 0 on success and drops the lock if so directed, error and leaves the 1743 - * lock held otherwise. 1744 - */ 1745 - int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 1746 - unsigned long start, size_t len, struct list_head *uf, 1747 - bool unlock) 1748 - { 1749 - unsigned long end; 1750 - struct vm_area_struct *vma; 1751 - 1752 - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) 1753 - return -EINVAL; 1754 - 1755 - end = start + PAGE_ALIGN(len); 1756 - if (end == start) 1757 - return -EINVAL; 1758 - 1759 - /* 1760 - * Check if memory is sealed before arch_unmap. 1761 - * Prevent unmapping a sealed VMA. 1762 - * can_modify_mm assumes we have acquired the lock on MM. 1763 - */ 1764 - if (unlikely(!can_modify_mm(mm, start, end))) 1765 - return -EPERM; 1766 - 1767 - /* arch_unmap() might do unmaps itself. */ 1768 - arch_unmap(mm, start, end); 1769 - 1770 - /* Find the first overlapping VMA */ 1771 - vma = vma_find(vmi, end); 1772 - if (!vma) { 1773 - if (unlock) 1774 - mmap_write_unlock(mm); 1775 - return 0; 1776 - } 1777 - 1778 - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); 1779 - } 1780 - 1781 2396 /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. 
1782 2397 * @mm: The mm_struct 1783 2398 * @start: The start address to munmap ··· 2002 3491 } 2003 3492 2004 3493 /* 2005 - * Copy the vma structure to a new location in the same mm, 2006 - * prior to moving page table entries, to effect an mremap move. 2007 - */ 2008 - struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2009 - unsigned long addr, unsigned long len, pgoff_t pgoff, 2010 - bool *need_rmap_locks) 2011 - { 2012 - struct vm_area_struct *vma = *vmap; 2013 - unsigned long vma_start = vma->vm_start; 2014 - struct mm_struct *mm = vma->vm_mm; 2015 - struct vm_area_struct *new_vma, *prev; 2016 - bool faulted_in_anon_vma = true; 2017 - VMA_ITERATOR(vmi, mm, addr); 2018 - 2019 - /* 2020 - * If anonymous vma has not yet been faulted, update new pgoff 2021 - * to match new location, to increase its chance of merging. 2022 - */ 2023 - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { 2024 - pgoff = addr >> PAGE_SHIFT; 2025 - faulted_in_anon_vma = false; 2026 - } 2027 - 2028 - new_vma = find_vma_prev(mm, addr, &prev); 2029 - if (new_vma && new_vma->vm_start < addr + len) 2030 - return NULL; /* should never get here */ 2031 - 2032 - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); 2033 - if (new_vma) { 2034 - /* 2035 - * Source vma may have been merged into new_vma 2036 - */ 2037 - if (unlikely(vma_start >= new_vma->vm_start && 2038 - vma_start < new_vma->vm_end)) { 2039 - /* 2040 - * The only way we can get a vma_merge with 2041 - * self during an mremap is if the vma hasn't 2042 - * been faulted in yet and we were allowed to 2043 - * reset the dst vma->vm_pgoff to the 2044 - * destination address of the mremap to allow 2045 - * the merge to happen. mremap must change the 2046 - * vm_pgoff linearity between src and dst vmas 2047 - * (in turn preventing a vma_merge) to be 2048 - * safe. It is only safe to keep the vm_pgoff 2049 - * linear if there are no pages mapped yet. 
2050 - */ 2051 - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 2052 - *vmap = vma = new_vma; 2053 - } 2054 - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2055 - } else { 2056 - new_vma = vm_area_dup(vma); 2057 - if (!new_vma) 2058 - goto out; 2059 - vma_set_range(new_vma, addr, addr + len, pgoff); 2060 - if (vma_dup_policy(vma, new_vma)) 2061 - goto out_free_vma; 2062 - if (anon_vma_clone(new_vma, vma)) 2063 - goto out_free_mempol; 2064 - if (new_vma->vm_file) 2065 - get_file(new_vma->vm_file); 2066 - if (new_vma->vm_ops && new_vma->vm_ops->open) 2067 - new_vma->vm_ops->open(new_vma); 2068 - if (vma_link(mm, new_vma)) 2069 - goto out_vma_link; 2070 - *need_rmap_locks = false; 2071 - } 2072 - return new_vma; 2073 - 2074 - out_vma_link: 2075 - if (new_vma->vm_ops && new_vma->vm_ops->close) 2076 - new_vma->vm_ops->close(new_vma); 2077 - 2078 - if (new_vma->vm_file) 2079 - fput(new_vma->vm_file); 2080 - 2081 - unlink_anon_vmas(new_vma); 2082 - out_free_mempol: 2083 - mpol_put(vma_policy(new_vma)); 2084 - out_free_vma: 2085 - vm_area_free(new_vma); 2086 - out: 2087 - return NULL; 2088 - } 2089 - 2090 - /* 2091 3494 * Return true if the calling process may expand its vm space by the passed 2092 3495 * number of pages 2093 3496 */ ··· 2196 3771 &legacy_special_mapping_vmops); 2197 3772 2198 3773 return PTR_ERR_OR_ZERO(vma); 2199 - } 2200 - 2201 - static DEFINE_MUTEX(mm_all_locks_mutex); 2202 - 2203 - static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2204 - { 2205 - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2206 - /* 2207 - * The LSB of head.next can't change from under us 2208 - * because we hold the mm_all_locks_mutex. 2209 - */ 2210 - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 2211 - /* 2212 - * We can safely modify head.next after taking the 2213 - * anon_vma->root->rwsem. If some other vma in this mm shares 2214 - * the same anon_vma we won't take it again. 
2215 - * 2216 - * No need of atomic instructions here, head.next 2217 - * can't change from under us thanks to the 2218 - * anon_vma->root->rwsem. 2219 - */ 2220 - if (__test_and_set_bit(0, (unsigned long *) 2221 - &anon_vma->root->rb_root.rb_root.rb_node)) 2222 - BUG(); 2223 - } 2224 - } 2225 - 2226 - static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 2227 - { 2228 - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2229 - /* 2230 - * AS_MM_ALL_LOCKS can't change from under us because 2231 - * we hold the mm_all_locks_mutex. 2232 - * 2233 - * Operations on ->flags have to be atomic because 2234 - * even if AS_MM_ALL_LOCKS is stable thanks to the 2235 - * mm_all_locks_mutex, there may be other cpus 2236 - * changing other bitflags in parallel to us. 2237 - */ 2238 - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2239 - BUG(); 2240 - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 2241 - } 2242 - } 2243 - 2244 - /* 2245 - * This operation locks against the VM for all pte/vma/mm related 2246 - * operations that could ever happen on a certain mm. This includes 2247 - * vmtruncate, try_to_unmap, and all page faults. 2248 - * 2249 - * The caller must take the mmap_lock in write mode before calling 2250 - * mm_take_all_locks(). The caller isn't allowed to release the 2251 - * mmap_lock until mm_drop_all_locks() returns. 2252 - * 2253 - * mmap_lock in write mode is required in order to block all operations 2254 - * that could modify pagetables and free pages without need of 2255 - * altering the vma layout. It's also needed in write mode to avoid new 2256 - * anon_vmas to be associated with existing vmas. 2257 - * 2258 - * A single task can't take more than one mm_take_all_locks() in a row 2259 - * or it would deadlock. 
2260 - * 2261 - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 2262 - * mapping->flags avoid to take the same lock twice, if more than one 2263 - * vma in this mm is backed by the same anon_vma or address_space. 2264 - * 2265 - * We take locks in following order, accordingly to comment at beginning 2266 - * of mm/rmap.c: 2267 - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for 2268 - * hugetlb mapping); 2269 - * - all vmas marked locked 2270 - * - all i_mmap_rwsem locks; 2271 - * - all anon_vma->rwseml 2272 - * 2273 - * We can take all locks within these types randomly because the VM code 2274 - * doesn't nest them and we protected from parallel mm_take_all_locks() by 2275 - * mm_all_locks_mutex. 2276 - * 2277 - * mm_take_all_locks() and mm_drop_all_locks are expensive operations 2278 - * that may have to take thousand of locks. 2279 - * 2280 - * mm_take_all_locks() can fail if it's interrupted by signals. 2281 - */ 2282 - int mm_take_all_locks(struct mm_struct *mm) 2283 - { 2284 - struct vm_area_struct *vma; 2285 - struct anon_vma_chain *avc; 2286 - VMA_ITERATOR(vmi, mm, 0); 2287 - 2288 - mmap_assert_write_locked(mm); 2289 - 2290 - mutex_lock(&mm_all_locks_mutex); 2291 - 2292 - /* 2293 - * vma_start_write() does not have a complement in mm_drop_all_locks() 2294 - * because vma_start_write() is always asymmetrical; it marks a VMA as 2295 - * being written to until mmap_write_unlock() or mmap_write_downgrade() 2296 - * is reached. 
2297 - */ 2298 - for_each_vma(vmi, vma) { 2299 - if (signal_pending(current)) 2300 - goto out_unlock; 2301 - vma_start_write(vma); 2302 - } 2303 - 2304 - vma_iter_init(&vmi, mm, 0); 2305 - for_each_vma(vmi, vma) { 2306 - if (signal_pending(current)) 2307 - goto out_unlock; 2308 - if (vma->vm_file && vma->vm_file->f_mapping && 2309 - is_vm_hugetlb_page(vma)) 2310 - vm_lock_mapping(mm, vma->vm_file->f_mapping); 2311 - } 2312 - 2313 - vma_iter_init(&vmi, mm, 0); 2314 - for_each_vma(vmi, vma) { 2315 - if (signal_pending(current)) 2316 - goto out_unlock; 2317 - if (vma->vm_file && vma->vm_file->f_mapping && 2318 - !is_vm_hugetlb_page(vma)) 2319 - vm_lock_mapping(mm, vma->vm_file->f_mapping); 2320 - } 2321 - 2322 - vma_iter_init(&vmi, mm, 0); 2323 - for_each_vma(vmi, vma) { 2324 - if (signal_pending(current)) 2325 - goto out_unlock; 2326 - if (vma->anon_vma) 2327 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2328 - vm_lock_anon_vma(mm, avc->anon_vma); 2329 - } 2330 - 2331 - return 0; 2332 - 2333 - out_unlock: 2334 - mm_drop_all_locks(mm); 2335 - return -EINTR; 2336 - } 2337 - 2338 - static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2339 - { 2340 - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2341 - /* 2342 - * The LSB of head.next can't change to 0 from under 2343 - * us because we hold the mm_all_locks_mutex. 2344 - * 2345 - * We must however clear the bitflag before unlocking 2346 - * the vma so the users using the anon_vma->rb_root will 2347 - * never see our bitflag. 2348 - * 2349 - * No need of atomic instructions here, head.next 2350 - * can't change from under us until we release the 2351 - * anon_vma->root->rwsem. 
2352 - */ 2353 - if (!__test_and_clear_bit(0, (unsigned long *) 2354 - &anon_vma->root->rb_root.rb_root.rb_node)) 2355 - BUG(); 2356 - anon_vma_unlock_write(anon_vma); 2357 - } 2358 - } 2359 - 2360 - static void vm_unlock_mapping(struct address_space *mapping) 2361 - { 2362 - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2363 - /* 2364 - * AS_MM_ALL_LOCKS can't change to 0 from under us 2365 - * because we hold the mm_all_locks_mutex. 2366 - */ 2367 - i_mmap_unlock_write(mapping); 2368 - if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2369 - &mapping->flags)) 2370 - BUG(); 2371 - } 2372 - } 2373 - 2374 - /* 2375 - * The mmap_lock cannot be released by the caller until 2376 - * mm_drop_all_locks() returns. 2377 - */ 2378 - void mm_drop_all_locks(struct mm_struct *mm) 2379 - { 2380 - struct vm_area_struct *vma; 2381 - struct anon_vma_chain *avc; 2382 - VMA_ITERATOR(vmi, mm, 0); 2383 - 2384 - mmap_assert_write_locked(mm); 2385 - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2386 - 2387 - for_each_vma(vmi, vma) { 2388 - if (vma->anon_vma) 2389 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2390 - vm_unlock_anon_vma(avc->anon_vma); 2391 - if (vma->vm_file && vma->vm_file->f_mapping) 2392 - vm_unlock_mapping(vma->vm_file->f_mapping); 2393 - } 2394 - 2395 - mutex_unlock(&mm_all_locks_mutex); 2396 3774 } 2397 3775 2398 3776 /*

+2

mm/mmu_notifier.c

··· 19 19 #include <linux/sched/mm.h> 20 20 #include <linux/slab.h> 21 21 22 + #include "vma.h" 23 + 22 24 /* global SRCU for all MMs */ 23 25 DEFINE_STATIC_SRCU(srcu); 24 26

+1766

mm/vma.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + /* 4 + * VMA-specific functions. 5 + */ 6 + 7 + #include "vma_internal.h" 8 + #include "vma.h" 9 + 10 + /* 11 + * If the vma has a ->close operation then the driver probably needs to release 12 + * per-vma resources, so we don't attempt to merge those if the caller indicates 13 + * the current vma may be removed as part of the merge. 14 + */ 15 + static inline bool is_mergeable_vma(struct vm_area_struct *vma, 16 + struct file *file, unsigned long vm_flags, 17 + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 18 + struct anon_vma_name *anon_name, bool may_remove_vma) 19 + { 20 + /* 21 + * VM_SOFTDIRTY should not prevent from VMA merging, if we 22 + * match the flags but dirty bit -- the caller should mark 23 + * merged VMA as dirty. If dirty bit won't be excluded from 24 + * comparison, we increase pressure on the memory system forcing 25 + * the kernel to generate new VMAs when old one could be 26 + * extended instead. 27 + */ 28 + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 29 + return false; 30 + if (vma->vm_file != file) 31 + return false; 32 + if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) 33 + return false; 34 + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) 35 + return false; 36 + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) 37 + return false; 38 + return true; 39 + } 40 + 41 + static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, 42 + struct anon_vma *anon_vma2, struct vm_area_struct *vma) 43 + { 44 + /* 45 + * The list_is_singular() test is to avoid merging VMA cloned from 46 + * parents. This can improve scalability caused by anon_vma lock. 
47 + */ 48 + if ((!anon_vma1 || !anon_vma2) && (!vma || 49 + list_is_singular(&vma->anon_vma_chain))) 50 + return true; 51 + return anon_vma1 == anon_vma2; 52 + } 53 + 54 + /* 55 + * init_multi_vma_prep() - Initializer for struct vma_prepare 56 + * @vp: The vma_prepare struct 57 + * @vma: The vma that will be altered once locked 58 + * @next: The next vma if it is to be adjusted 59 + * @remove: The first vma to be removed 60 + * @remove2: The second vma to be removed 61 + */ 62 + static void init_multi_vma_prep(struct vma_prepare *vp, 63 + struct vm_area_struct *vma, 64 + struct vm_area_struct *next, 65 + struct vm_area_struct *remove, 66 + struct vm_area_struct *remove2) 67 + { 68 + memset(vp, 0, sizeof(struct vma_prepare)); 69 + vp->vma = vma; 70 + vp->anon_vma = vma->anon_vma; 71 + vp->remove = remove; 72 + vp->remove2 = remove2; 73 + vp->adj_next = next; 74 + if (!vp->anon_vma && next) 75 + vp->anon_vma = next->anon_vma; 76 + 77 + vp->file = vma->vm_file; 78 + if (vp->file) 79 + vp->mapping = vma->vm_file->f_mapping; 80 + 81 + } 82 + 83 + /* 84 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 85 + * in front of (at a lower virtual address and file offset than) the vma. 86 + * 87 + * We cannot merge two vmas if they have differently assigned (non-NULL) 88 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 89 + * 90 + * We don't check here for the merged mmap wrapping around the end of pagecache 91 + * indices (16TB on ia32) because do_mmap() does not permit mmap's which 92 + * wrap, nor mmaps which cover the final page at index -1UL. 93 + * 94 + * We assume the vma may be removed as part of the merge. 
95 + */ 96 + bool 97 + can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 98 + struct anon_vma *anon_vma, struct file *file, 99 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 100 + struct anon_vma_name *anon_name) 101 + { 102 + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && 103 + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 104 + if (vma->vm_pgoff == vm_pgoff) 105 + return true; 106 + } 107 + return false; 108 + } 109 + 110 + /* 111 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 112 + * beyond (at a higher virtual address and file offset than) the vma. 113 + * 114 + * We cannot merge two vmas if they have differently assigned (non-NULL) 115 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 116 + * 117 + * We assume that vma is not removed as part of the merge. 118 + */ 119 + bool 120 + can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 121 + struct anon_vma *anon_vma, struct file *file, 122 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 123 + struct anon_vma_name *anon_name) 124 + { 125 + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && 126 + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 127 + pgoff_t vm_pglen; 128 + 129 + vm_pglen = vma_pages(vma); 130 + if (vma->vm_pgoff + vm_pglen == vm_pgoff) 131 + return true; 132 + } 133 + return false; 134 + } 135 + 136 + /* 137 + * Close a vm structure and free it. 138 + */ 139 + void remove_vma(struct vm_area_struct *vma, bool unreachable) 140 + { 141 + might_sleep(); 142 + if (vma->vm_ops && vma->vm_ops->close) 143 + vma->vm_ops->close(vma); 144 + if (vma->vm_file) 145 + fput(vma->vm_file); 146 + mpol_put(vma_policy(vma)); 147 + if (unreachable) 148 + __vm_area_free(vma); 149 + else 150 + vm_area_free(vma); 151 + } 152 + 153 + /* 154 + * Get rid of page table information in the indicated region. 
155 + * 156 + * Called with the mm semaphore held. 157 + */ 158 + void unmap_region(struct mm_struct *mm, struct ma_state *mas, 159 + struct vm_area_struct *vma, struct vm_area_struct *prev, 160 + struct vm_area_struct *next, unsigned long start, 161 + unsigned long end, unsigned long tree_end, bool mm_wr_locked) 162 + { 163 + struct mmu_gather tlb; 164 + unsigned long mt_start = mas->index; 165 + 166 + lru_add_drain(); 167 + tlb_gather_mmu(&tlb, mm); 168 + update_hiwater_rss(mm); 169 + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); 170 + mas_set(mas, mt_start); 171 + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 172 + next ? next->vm_start : USER_PGTABLES_CEILING, 173 + mm_wr_locked); 174 + tlb_finish_mmu(&tlb); 175 + } 176 + 177 + /* 178 + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it 179 + * has already been checked or doesn't make sense to fail. 180 + * VMA Iterator will point to the end VMA. 181 + */ 182 + static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 183 + unsigned long addr, int new_below) 184 + { 185 + struct vma_prepare vp; 186 + struct vm_area_struct *new; 187 + int err; 188 + 189 + WARN_ON(vma->vm_start >= addr); 190 + WARN_ON(vma->vm_end <= addr); 191 + 192 + if (vma->vm_ops && vma->vm_ops->may_split) { 193 + err = vma->vm_ops->may_split(vma, addr); 194 + if (err) 195 + return err; 196 + } 197 + 198 + new = vm_area_dup(vma); 199 + if (!new) 200 + return -ENOMEM; 201 + 202 + if (new_below) { 203 + new->vm_end = addr; 204 + } else { 205 + new->vm_start = addr; 206 + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 207 + } 208 + 209 + err = -ENOMEM; 210 + vma_iter_config(vmi, new->vm_start, new->vm_end); 211 + if (vma_iter_prealloc(vmi, new)) 212 + goto out_free_vma; 213 + 214 + err = vma_dup_policy(vma, new); 215 + if (err) 216 + goto out_free_vmi; 217 + 218 + err = anon_vma_clone(new, vma); 219 + if (err) 220 + goto out_free_mpol; 221 + 222 + 
if (new->vm_file) 223 + get_file(new->vm_file); 224 + 225 + if (new->vm_ops && new->vm_ops->open) 226 + new->vm_ops->open(new); 227 + 228 + vma_start_write(vma); 229 + vma_start_write(new); 230 + 231 + init_vma_prep(&vp, vma); 232 + vp.insert = new; 233 + vma_prepare(&vp); 234 + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); 235 + 236 + if (new_below) { 237 + vma->vm_start = addr; 238 + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; 239 + } else { 240 + vma->vm_end = addr; 241 + } 242 + 243 + /* vma_complete stores the new vma */ 244 + vma_complete(&vp, vmi, vma->vm_mm); 245 + 246 + /* Success. */ 247 + if (new_below) 248 + vma_next(vmi); 249 + return 0; 250 + 251 + out_free_mpol: 252 + mpol_put(vma_policy(new)); 253 + out_free_vmi: 254 + vma_iter_free(vmi); 255 + out_free_vma: 256 + vm_area_free(new); 257 + return err; 258 + } 259 + 260 + /* 261 + * Split a vma into two pieces at address 'addr', a new vma is allocated 262 + * either for the first part or the tail. 263 + */ 264 + static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 265 + unsigned long addr, int new_below) 266 + { 267 + if (vma->vm_mm->map_count >= sysctl_max_map_count) 268 + return -ENOMEM; 269 + 270 + return __split_vma(vmi, vma, addr, new_below); 271 + } 272 + 273 + /* 274 + * Ok - we have the memory areas we should free on a maple tree so release them, 275 + * and do the vma updates. 276 + * 277 + * Called with the mm semaphore held. 
278 + */ 279 + static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) 280 + { 281 + unsigned long nr_accounted = 0; 282 + struct vm_area_struct *vma; 283 + 284 + /* Update high watermark before we lower total_vm */ 285 + update_hiwater_vm(mm); 286 + mas_for_each(mas, vma, ULONG_MAX) { 287 + long nrpages = vma_pages(vma); 288 + 289 + if (vma->vm_flags & VM_ACCOUNT) 290 + nr_accounted += nrpages; 291 + vm_stat_account(mm, vma->vm_flags, -nrpages); 292 + remove_vma(vma, false); 293 + } 294 + vm_unacct_memory(nr_accounted); 295 + } 296 + 297 + /* 298 + * init_vma_prep() - Initializer wrapper for vma_prepare struct 299 + * @vp: The vma_prepare struct 300 + * @vma: The vma that will be altered once locked 301 + */ 302 + void init_vma_prep(struct vma_prepare *vp, 303 + struct vm_area_struct *vma) 304 + { 305 + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); 306 + } 307 + 308 + /* 309 + * Requires inode->i_mapping->i_mmap_rwsem 310 + */ 311 + static void __remove_shared_vm_struct(struct vm_area_struct *vma, 312 + struct address_space *mapping) 313 + { 314 + if (vma_is_shared_maywrite(vma)) 315 + mapping_unmap_writable(mapping); 316 + 317 + flush_dcache_mmap_lock(mapping); 318 + vma_interval_tree_remove(vma, &mapping->i_mmap); 319 + flush_dcache_mmap_unlock(mapping); 320 + } 321 + 322 + /* 323 + * vma has some anon_vma assigned, and is already inserted on that 324 + * anon_vma's interval trees. 325 + * 326 + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 327 + * vma must be removed from the anon_vma's interval trees using 328 + * anon_vma_interval_tree_pre_update_vma(). 329 + * 330 + * After the update, the vma will be reinserted using 331 + * anon_vma_interval_tree_post_update_vma(). 332 + * 333 + * The entire update must be protected by exclusive mmap_lock and by 334 + * the root anon_vma's mutex. 
335 + */ 336 + void 337 + anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 338 + { 339 + struct anon_vma_chain *avc; 340 + 341 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 342 + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 343 + } 344 + 345 + void 346 + anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 347 + { 348 + struct anon_vma_chain *avc; 349 + 350 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 351 + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 352 + } 353 + 354 + static void __vma_link_file(struct vm_area_struct *vma, 355 + struct address_space *mapping) 356 + { 357 + if (vma_is_shared_maywrite(vma)) 358 + mapping_allow_writable(mapping); 359 + 360 + flush_dcache_mmap_lock(mapping); 361 + vma_interval_tree_insert(vma, &mapping->i_mmap); 362 + flush_dcache_mmap_unlock(mapping); 363 + } 364 + 365 + /* 366 + * vma_prepare() - Helper function for handling locking VMAs prior to altering 367 + * @vp: The initialized vma_prepare struct 368 + */ 369 + void vma_prepare(struct vma_prepare *vp) 370 + { 371 + if (vp->file) { 372 + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); 373 + 374 + if (vp->adj_next) 375 + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, 376 + vp->adj_next->vm_end); 377 + 378 + i_mmap_lock_write(vp->mapping); 379 + if (vp->insert && vp->insert->vm_file) { 380 + /* 381 + * Put into interval tree now, so instantiated pages 382 + * are visible to arm/parisc __flush_dcache_page 383 + * throughout; but we cannot insert into address 384 + * space until vma start or end is updated. 
385 + */ 386 + __vma_link_file(vp->insert, 387 + vp->insert->vm_file->f_mapping); 388 + } 389 + } 390 + 391 + if (vp->anon_vma) { 392 + anon_vma_lock_write(vp->anon_vma); 393 + anon_vma_interval_tree_pre_update_vma(vp->vma); 394 + if (vp->adj_next) 395 + anon_vma_interval_tree_pre_update_vma(vp->adj_next); 396 + } 397 + 398 + if (vp->file) { 399 + flush_dcache_mmap_lock(vp->mapping); 400 + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); 401 + if (vp->adj_next) 402 + vma_interval_tree_remove(vp->adj_next, 403 + &vp->mapping->i_mmap); 404 + } 405 + 406 + } 407 + 408 + /* 409 + * dup_anon_vma() - Helper function to duplicate anon_vma 410 + * @dst: The destination VMA 411 + * @src: The source VMA 412 + * @dup: Pointer to the destination VMA when successful. 413 + * 414 + * Returns: 0 on success. 415 + */ 416 + static int dup_anon_vma(struct vm_area_struct *dst, 417 + struct vm_area_struct *src, struct vm_area_struct **dup) 418 + { 419 + /* 420 + * Easily overlooked: when mprotect shifts the boundary, make sure the 421 + * expanding vma has anon_vma set if the shrinking vma had, to cover any 422 + * anon pages imported. 
423 + */ 424 + if (src->anon_vma && !dst->anon_vma) { 425 + int ret; 426 + 427 + vma_assert_write_locked(dst); 428 + dst->anon_vma = src->anon_vma; 429 + ret = anon_vma_clone(dst, src); 430 + if (ret) 431 + return ret; 432 + 433 + *dup = dst; 434 + } 435 + 436 + return 0; 437 + } 438 + 439 + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 440 + void validate_mm(struct mm_struct *mm) 441 + { 442 + int bug = 0; 443 + int i = 0; 444 + struct vm_area_struct *vma; 445 + VMA_ITERATOR(vmi, mm, 0); 446 + 447 + mt_validate(&mm->mm_mt); 448 + for_each_vma(vmi, vma) { 449 + #ifdef CONFIG_DEBUG_VM_RB 450 + struct anon_vma *anon_vma = vma->anon_vma; 451 + struct anon_vma_chain *avc; 452 + #endif 453 + unsigned long vmi_start, vmi_end; 454 + bool warn = 0; 455 + 456 + vmi_start = vma_iter_addr(&vmi); 457 + vmi_end = vma_iter_end(&vmi); 458 + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) 459 + warn = 1; 460 + 461 + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) 462 + warn = 1; 463 + 464 + if (warn) { 465 + pr_emerg("issue in %s\n", current->comm); 466 + dump_stack(); 467 + dump_vma(vma); 468 + pr_emerg("tree range: %px start %lx end %lx\n", vma, 469 + vmi_start, vmi_end - 1); 470 + vma_iter_dump_tree(&vmi); 471 + } 472 + 473 + #ifdef CONFIG_DEBUG_VM_RB 474 + if (anon_vma) { 475 + anon_vma_lock_read(anon_vma); 476 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 477 + anon_vma_interval_tree_verify(avc); 478 + anon_vma_unlock_read(anon_vma); 479 + } 480 + #endif 481 + i++; 482 + } 483 + if (i != mm->map_count) { 484 + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); 485 + bug = 1; 486 + } 487 + VM_BUG_ON_MM(bug, mm); 488 + } 489 + #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ 490 + 491 + /* 492 + * vma_expand - Expand an existing VMA 493 + * 494 + * @vmi: The vma iterator 495 + * @vma: The vma to expand 496 + * @start: The start of the vma 497 + * @end: The exclusive end of the vma 498 + * @pgoff: The page offset of vma 499 + * @next: The current of next vma. 
500 + * 501 + * Expand @vma to @start and @end. Can expand off the start and end. Will 502 + * expand over @next if it's different from @vma and @end == @next->vm_end. 503 + * Checking if the @vma can expand and merge with @next needs to be handled by 504 + * the caller. 505 + * 506 + * Returns: 0 on success 507 + */ 508 + int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 509 + unsigned long start, unsigned long end, pgoff_t pgoff, 510 + struct vm_area_struct *next) 511 + { 512 + struct vm_area_struct *anon_dup = NULL; 513 + bool remove_next = false; 514 + struct vma_prepare vp; 515 + 516 + vma_start_write(vma); 517 + if (next && (vma != next) && (end == next->vm_end)) { 518 + int ret; 519 + 520 + remove_next = true; 521 + vma_start_write(next); 522 + ret = dup_anon_vma(vma, next, &anon_dup); 523 + if (ret) 524 + return ret; 525 + } 526 + 527 + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); 528 + /* Not merging but overwriting any part of next is not handled. 
*/ 529 + VM_WARN_ON(next && !vp.remove && 530 + next != vma && end > next->vm_start); 531 + /* Only handles expanding */ 532 + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); 533 + 534 + /* Note: vma iterator must be pointing to 'start' */ 535 + vma_iter_config(vmi, start, end); 536 + if (vma_iter_prealloc(vmi, vma)) 537 + goto nomem; 538 + 539 + vma_prepare(&vp); 540 + vma_adjust_trans_huge(vma, start, end, 0); 541 + vma_set_range(vma, start, end, pgoff); 542 + vma_iter_store(vmi, vma); 543 + 544 + vma_complete(&vp, vmi, vma->vm_mm); 545 + return 0; 546 + 547 + nomem: 548 + if (anon_dup) 549 + unlink_anon_vmas(anon_dup); 550 + return -ENOMEM; 551 + } 552 + 553 + /* 554 + * vma_shrink() - Reduce an existing VMAs memory area 555 + * @vmi: The vma iterator 556 + * @vma: The VMA to modify 557 + * @start: The new start 558 + * @end: The new end 559 + * 560 + * Returns: 0 on success, -ENOMEM otherwise 561 + */ 562 + int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 563 + unsigned long start, unsigned long end, pgoff_t pgoff) 564 + { 565 + struct vma_prepare vp; 566 + 567 + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); 568 + 569 + if (vma->vm_start < start) 570 + vma_iter_config(vmi, vma->vm_start, start); 571 + else 572 + vma_iter_config(vmi, end, vma->vm_end); 573 + 574 + if (vma_iter_prealloc(vmi, NULL)) 575 + return -ENOMEM; 576 + 577 + vma_start_write(vma); 578 + 579 + init_vma_prep(&vp, vma); 580 + vma_prepare(&vp); 581 + vma_adjust_trans_huge(vma, start, end, 0); 582 + 583 + vma_iter_clear(vmi); 584 + vma_set_range(vma, start, end, pgoff); 585 + vma_complete(&vp, vmi, vma->vm_mm); 586 + return 0; 587 + } 588 + 589 + /* 590 + * vma_complete- Helper function for handling the unlocking after altering VMAs, 591 + * or for inserting a VMA. 
592 + * 593 + * @vp: The vma_prepare struct 594 + * @vmi: The vma iterator 595 + * @mm: The mm_struct 596 + */ 597 + void vma_complete(struct vma_prepare *vp, 598 + struct vma_iterator *vmi, struct mm_struct *mm) 599 + { 600 + if (vp->file) { 601 + if (vp->adj_next) 602 + vma_interval_tree_insert(vp->adj_next, 603 + &vp->mapping->i_mmap); 604 + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); 605 + flush_dcache_mmap_unlock(vp->mapping); 606 + } 607 + 608 + if (vp->remove && vp->file) { 609 + __remove_shared_vm_struct(vp->remove, vp->mapping); 610 + if (vp->remove2) 611 + __remove_shared_vm_struct(vp->remove2, vp->mapping); 612 + } else if (vp->insert) { 613 + /* 614 + * split_vma has split insert from vma, and needs 615 + * us to insert it before dropping the locks 616 + * (it may either follow vma or precede it). 617 + */ 618 + vma_iter_store(vmi, vp->insert); 619 + mm->map_count++; 620 + } 621 + 622 + if (vp->anon_vma) { 623 + anon_vma_interval_tree_post_update_vma(vp->vma); 624 + if (vp->adj_next) 625 + anon_vma_interval_tree_post_update_vma(vp->adj_next); 626 + anon_vma_unlock_write(vp->anon_vma); 627 + } 628 + 629 + if (vp->file) { 630 + i_mmap_unlock_write(vp->mapping); 631 + uprobe_mmap(vp->vma); 632 + 633 + if (vp->adj_next) 634 + uprobe_mmap(vp->adj_next); 635 + } 636 + 637 + if (vp->remove) { 638 + again: 639 + vma_mark_detached(vp->remove, true); 640 + if (vp->file) { 641 + uprobe_munmap(vp->remove, vp->remove->vm_start, 642 + vp->remove->vm_end); 643 + fput(vp->file); 644 + } 645 + if (vp->remove->anon_vma) 646 + anon_vma_merge(vp->vma, vp->remove); 647 + mm->map_count--; 648 + mpol_put(vma_policy(vp->remove)); 649 + if (!vp->remove2) 650 + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); 651 + vm_area_free(vp->remove); 652 + 653 + /* 654 + * In mprotect's case 6 (see comments on vma_merge), 655 + * we are removing both mid and next vmas 656 + */ 657 + if (vp->remove2) { 658 + vp->remove = vp->remove2; 659 + vp->remove2 = NULL; 660 + goto 
again; 661 + } 662 + } 663 + if (vp->insert && vp->file) 664 + uprobe_mmap(vp->insert); 665 + validate_mm(mm); 666 + } 667 + 668 + /* 669 + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 670 + * @vmi: The vma iterator 671 + * @vma: The starting vm_area_struct 672 + * @mm: The mm_struct 673 + * @start: The aligned start address to munmap. 674 + * @end: The aligned end address to munmap. 675 + * @uf: The userfaultfd list_head 676 + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on 677 + * success. 678 + * 679 + * Return: 0 on success and drops the lock if so directed, error and leaves the 680 + * lock held otherwise. 681 + */ 682 + int 683 + do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, 684 + struct mm_struct *mm, unsigned long start, 685 + unsigned long end, struct list_head *uf, bool unlock) 686 + { 687 + struct vm_area_struct *prev, *next = NULL; 688 + struct maple_tree mt_detach; 689 + int count = 0; 690 + int error = -ENOMEM; 691 + unsigned long locked_vm = 0; 692 + MA_STATE(mas_detach, &mt_detach, 0, 0); 693 + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 694 + mt_on_stack(mt_detach); 695 + 696 + /* 697 + * If we need to split any vma, do it now to save pain later. 698 + * 699 + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 700 + * unmapped vm_area_struct will remain in use: so lower split_vma 701 + * places tmp vma above, and higher split_vma places tmp vma below. 702 + */ 703 + 704 + /* Does it split the first one? */ 705 + if (start > vma->vm_start) { 706 + 707 + /* 708 + * Make sure that map_count on return from munmap() will 709 + * not exceed its limit; but let map_count go just above 710 + * its limit temporarily, to help free resources as expected. 
711 + */ 712 + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 713 + goto map_count_exceeded; 714 + 715 + error = __split_vma(vmi, vma, start, 1); 716 + if (error) 717 + goto start_split_failed; 718 + } 719 + 720 + /* 721 + * Detach a range of VMAs from the mm. Using next as a temp variable as 722 + * it is always overwritten. 723 + */ 724 + next = vma; 725 + do { 726 + /* Does it split the end? */ 727 + if (next->vm_end > end) { 728 + error = __split_vma(vmi, next, end, 0); 729 + if (error) 730 + goto end_split_failed; 731 + } 732 + vma_start_write(next); 733 + mas_set(&mas_detach, count); 734 + error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); 735 + if (error) 736 + goto munmap_gather_failed; 737 + vma_mark_detached(next, true); 738 + if (next->vm_flags & VM_LOCKED) 739 + locked_vm += vma_pages(next); 740 + 741 + count++; 742 + if (unlikely(uf)) { 743 + /* 744 + * If userfaultfd_unmap_prep returns an error the vmas 745 + * will remain split, but userland will get a 746 + * highly unexpected error anyway. This is no 747 + * different than the case where the first of the two 748 + * __split_vma fails, but we don't undo the first 749 + * split, despite we could. This is unlikely enough 750 + * failure that it's not worth optimizing it for. 751 + */ 752 + error = userfaultfd_unmap_prep(next, start, end, uf); 753 + 754 + if (error) 755 + goto userfaultfd_error; 756 + } 757 + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 758 + BUG_ON(next->vm_start < start); 759 + BUG_ON(next->vm_start > end); 760 + #endif 761 + } for_each_vma_range(*vmi, next, end); 762 + 763 + #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 764 + /* Make sure no VMAs are about to be lost. 
*/ 765 + { 766 + MA_STATE(test, &mt_detach, 0, 0); 767 + struct vm_area_struct *vma_mas, *vma_test; 768 + int test_count = 0; 769 + 770 + vma_iter_set(vmi, start); 771 + rcu_read_lock(); 772 + vma_test = mas_find(&test, count - 1); 773 + for_each_vma_range(*vmi, vma_mas, end) { 774 + BUG_ON(vma_mas != vma_test); 775 + test_count++; 776 + vma_test = mas_next(&test, count - 1); 777 + } 778 + rcu_read_unlock(); 779 + BUG_ON(count != test_count); 780 + } 781 + #endif 782 + 783 + while (vma_iter_addr(vmi) > start) 784 + vma_iter_prev_range(vmi); 785 + 786 + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); 787 + if (error) 788 + goto clear_tree_failed; 789 + 790 + /* Point of no return */ 791 + mm->locked_vm -= locked_vm; 792 + mm->map_count -= count; 793 + if (unlock) 794 + mmap_write_downgrade(mm); 795 + 796 + prev = vma_iter_prev_range(vmi); 797 + next = vma_next(vmi); 798 + if (next) 799 + vma_iter_prev_range(vmi); 800 + 801 + /* 802 + * We can free page tables without write-locking mmap_lock because VMAs 803 + * were isolated before we downgraded mmap_lock. 804 + */ 805 + mas_set(&mas_detach, 1); 806 + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, 807 + !unlock); 808 + /* Statistics and freeing VMAs */ 809 + mas_set(&mas_detach, 0); 810 + remove_mt(mm, &mas_detach); 811 + validate_mm(mm); 812 + if (unlock) 813 + mmap_read_unlock(mm); 814 + 815 + __mt_destroy(&mt_detach); 816 + return 0; 817 + 818 + clear_tree_failed: 819 + userfaultfd_error: 820 + munmap_gather_failed: 821 + end_split_failed: 822 + mas_set(&mas_detach, 0); 823 + mas_for_each(&mas_detach, next, end) 824 + vma_mark_detached(next, false); 825 + 826 + __mt_destroy(&mt_detach); 827 + start_split_failed: 828 + map_count_exceeded: 829 + validate_mm(mm); 830 + return error; 831 + } 832 + 833 + /* 834 + * do_vmi_munmap() - munmap a given range. 
835 + * @vmi: The vma iterator 836 + * @mm: The mm_struct 837 + * @start: The start address to munmap 838 + * @len: The length of the range to munmap 839 + * @uf: The userfaultfd list_head 840 + * @unlock: set to true if the user wants to drop the mmap_lock on success 841 + * 842 + * This function takes a @vmi that is either pointing to the previous VMA or set 843 + * to MA_START and sets it up to remove the mapping(s). The @len will be 844 + * aligned and any arch_unmap work will be performed. 845 + * 846 + * Return: 0 on success and drops the lock if so directed, error and leaves the 847 + * lock held otherwise. 848 + */ 849 + int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 850 + unsigned long start, size_t len, struct list_head *uf, 851 + bool unlock) 852 + { 853 + unsigned long end; 854 + struct vm_area_struct *vma; 855 + 856 + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) 857 + return -EINVAL; 858 + 859 + end = start + PAGE_ALIGN(len); 860 + if (end == start) 861 + return -EINVAL; 862 + 863 + /* 864 + * Check if memory is sealed before arch_unmap. 865 + * Prevent unmapping a sealed VMA. 866 + * can_modify_mm assumes we have acquired the lock on MM. 867 + */ 868 + if (unlikely(!can_modify_mm(mm, start, end))) 869 + return -EPERM; 870 + 871 + /* arch_unmap() might do unmaps itself. */ 872 + arch_unmap(mm, start, end); 873 + 874 + /* Find the first overlapping VMA */ 875 + vma = vma_find(vmi, end); 876 + if (!vma) { 877 + if (unlock) 878 + mmap_write_unlock(mm); 879 + return 0; 880 + } 881 + 882 + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); 883 + } 884 + 885 + /* 886 + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), 887 + * figure out whether that can be merged with its predecessor or its 888 + * successor. Or both (it neatly fills a hole). 
889 + * 890 + * In most cases - when called for mmap, brk or mremap - [addr,end) is 891 + * certain not to be mapped by the time vma_merge is called; but when 892 + * called for mprotect, it is certain to be already mapped (either at 893 + * an offset within prev, or at the start of next), and the flags of 894 + * this area are about to be changed to vm_flags - and the no-change 895 + * case has already been eliminated. 896 + * 897 + * The following mprotect cases have to be considered, where **** is 898 + * the area passed down from mprotect_fixup, never extending beyond one 899 + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts 900 + * at the same address as **** and is of the same or larger span, and 901 + * NNNN is the next vma after ****: 902 + * 903 + * **** **** **** 904 + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC 905 + * cannot merge might become might become 906 + * PPNNNNNNNNNN PPPPPPPPPPCC 907 + * mmap, brk or case 4 below case 5 below 908 + * mremap move: 909 + * **** **** 910 + * PPPP NNNN PPPPCCCCNNNN 911 + * might become might become 912 + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or 913 + * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or 914 + * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 915 + * 916 + * It is important for case 8 that the vma CCCC overlapping the 917 + * region **** is never going to be extended over NNNN. Instead NNNN must 918 + * be extended in region **** and CCCC must be removed. This way in 919 + * all cases where vma_merge succeeds, the moment vma_merge drops the 920 + * rmap_locks, the properties of the merged vma will be already 921 + * correct for the whole merged range. Some of those properties like 922 + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must 923 + * be correct for the whole merged range immediately after the 924 + * rmap_locks are released. 
Otherwise if NNNN would be removed and 925 + * CCCC would be extended over the NNNN range, remove_migration_ptes 926 + * or other rmap walkers (if working on addresses beyond the "end" 927 + * parameter) may establish ptes with the wrong permissions of CCCC 928 + * instead of the right permissions of NNNN. 929 + * 930 + * In the code below: 931 + * PPPP is represented by *prev 932 + * CCCC is represented by *curr or not represented at all (NULL) 933 + * NNNN is represented by *next or not represented at all (NULL) 934 + * **** is not represented - it will be merged and the vma containing the 935 + * area is returned, or the function will return NULL 936 + */ 937 + static struct vm_area_struct 938 + *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, 939 + struct vm_area_struct *src, unsigned long addr, unsigned long end, 940 + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, 941 + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 942 + struct anon_vma_name *anon_name) 943 + { 944 + struct mm_struct *mm = src->vm_mm; 945 + struct anon_vma *anon_vma = src->anon_vma; 946 + struct file *file = src->vm_file; 947 + struct vm_area_struct *curr, *next, *res; 948 + struct vm_area_struct *vma, *adjust, *remove, *remove2; 949 + struct vm_area_struct *anon_dup = NULL; 950 + struct vma_prepare vp; 951 + pgoff_t vma_pgoff; 952 + int err = 0; 953 + bool merge_prev = false; 954 + bool merge_next = false; 955 + bool vma_expanded = false; 956 + unsigned long vma_start = addr; 957 + unsigned long vma_end = end; 958 + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 959 + long adj_start = 0; 960 + 961 + /* 962 + * We later require that vma->vm_flags == vm_flags, 963 + * so this tests vma->vm_flags & VM_SPECIAL, too. 964 + */ 965 + if (vm_flags & VM_SPECIAL) 966 + return NULL; 967 + 968 + /* Does the input range span an existing VMA? (cases 5 - 8) */ 969 + curr = find_vma_intersection(mm, prev ? 
prev->vm_end : 0, end); 970 + 971 + if (!curr || /* cases 1 - 4 */ 972 + end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ 973 + next = vma_lookup(mm, end); 974 + else 975 + next = NULL; /* case 5 */ 976 + 977 + if (prev) { 978 + vma_start = prev->vm_start; 979 + vma_pgoff = prev->vm_pgoff; 980 + 981 + /* Can we merge the predecessor? */ 982 + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) 983 + && can_vma_merge_after(prev, vm_flags, anon_vma, file, 984 + pgoff, vm_userfaultfd_ctx, anon_name)) { 985 + merge_prev = true; 986 + vma_prev(vmi); 987 + } 988 + } 989 + 990 + /* Can we merge the successor? */ 991 + if (next && mpol_equal(policy, vma_policy(next)) && 992 + can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, 993 + vm_userfaultfd_ctx, anon_name)) { 994 + merge_next = true; 995 + } 996 + 997 + /* Verify some invariant that must be enforced by the caller. */ 998 + VM_WARN_ON(prev && addr <= prev->vm_start); 999 + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); 1000 + VM_WARN_ON(addr >= end); 1001 + 1002 + if (!merge_prev && !merge_next) 1003 + return NULL; /* Not mergeable. */ 1004 + 1005 + if (merge_prev) 1006 + vma_start_write(prev); 1007 + 1008 + res = vma = prev; 1009 + remove = remove2 = adjust = NULL; 1010 + 1011 + /* Can we merge both the predecessor and the successor? */ 1012 + if (merge_prev && merge_next && 1013 + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { 1014 + vma_start_write(next); 1015 + remove = next; /* case 1 */ 1016 + vma_end = next->vm_end; 1017 + err = dup_anon_vma(prev, next, &anon_dup); 1018 + if (curr) { /* case 6 */ 1019 + vma_start_write(curr); 1020 + remove = curr; 1021 + remove2 = next; 1022 + /* 1023 + * Note that the dup_anon_vma below cannot overwrite err 1024 + * since the first caller would do nothing unless next 1025 + * has an anon_vma. 
1026 + */ 1027 + if (!next->anon_vma) 1028 + err = dup_anon_vma(prev, curr, &anon_dup); 1029 + } 1030 + } else if (merge_prev) { /* case 2 */ 1031 + if (curr) { 1032 + vma_start_write(curr); 1033 + if (end == curr->vm_end) { /* case 7 */ 1034 + /* 1035 + * can_vma_merge_after() assumed we would not be 1036 + * removing prev vma, so it skipped the check 1037 + * for vm_ops->close, but we are removing curr 1038 + */ 1039 + if (curr->vm_ops && curr->vm_ops->close) 1040 + err = -EINVAL; 1041 + remove = curr; 1042 + } else { /* case 5 */ 1043 + adjust = curr; 1044 + adj_start = (end - curr->vm_start); 1045 + } 1046 + if (!err) 1047 + err = dup_anon_vma(prev, curr, &anon_dup); 1048 + } 1049 + } else { /* merge_next */ 1050 + vma_start_write(next); 1051 + res = next; 1052 + if (prev && addr < prev->vm_end) { /* case 4 */ 1053 + vma_start_write(prev); 1054 + vma_end = addr; 1055 + adjust = next; 1056 + adj_start = -(prev->vm_end - addr); 1057 + err = dup_anon_vma(next, prev, &anon_dup); 1058 + } else { 1059 + /* 1060 + * Note that cases 3 and 8 are the ONLY ones where prev 1061 + * is permitted to be (but is not necessarily) NULL. 1062 + */ 1063 + vma = next; /* case 3 */ 1064 + vma_start = addr; 1065 + vma_end = next->vm_end; 1066 + vma_pgoff = next->vm_pgoff - pglen; 1067 + if (curr) { /* case 8 */ 1068 + vma_pgoff = curr->vm_pgoff; 1069 + vma_start_write(curr); 1070 + remove = curr; 1071 + err = dup_anon_vma(next, curr, &anon_dup); 1072 + } 1073 + } 1074 + } 1075 + 1076 + /* Error in anon_vma clone. 
*/ 1077 + if (err) 1078 + goto anon_vma_fail; 1079 + 1080 + if (vma_start < vma->vm_start || vma_end > vma->vm_end) 1081 + vma_expanded = true; 1082 + 1083 + if (vma_expanded) { 1084 + vma_iter_config(vmi, vma_start, vma_end); 1085 + } else { 1086 + vma_iter_config(vmi, adjust->vm_start + adj_start, 1087 + adjust->vm_end); 1088 + } 1089 + 1090 + if (vma_iter_prealloc(vmi, vma)) 1091 + goto prealloc_fail; 1092 + 1093 + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); 1094 + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && 1095 + vp.anon_vma != adjust->anon_vma); 1096 + 1097 + vma_prepare(&vp); 1098 + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); 1099 + vma_set_range(vma, vma_start, vma_end, vma_pgoff); 1100 + 1101 + if (vma_expanded) 1102 + vma_iter_store(vmi, vma); 1103 + 1104 + if (adj_start) { 1105 + adjust->vm_start += adj_start; 1106 + adjust->vm_pgoff += adj_start >> PAGE_SHIFT; 1107 + if (adj_start < 0) { 1108 + WARN_ON(vma_expanded); 1109 + vma_iter_store(vmi, next); 1110 + } 1111 + } 1112 + 1113 + vma_complete(&vp, vmi, mm); 1114 + khugepaged_enter_vma(res, vm_flags); 1115 + return res; 1116 + 1117 + prealloc_fail: 1118 + if (anon_dup) 1119 + unlink_anon_vmas(anon_dup); 1120 + 1121 + anon_vma_fail: 1122 + vma_iter_set(vmi, addr); 1123 + vma_iter_load(vmi); 1124 + return NULL; 1125 + } 1126 + 1127 + /* 1128 + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd 1129 + * context and anonymous VMA name within the range [start, end). 1130 + * 1131 + * As a result, we might be able to merge the newly modified VMA range with an 1132 + * adjacent VMA with identical properties. 1133 + * 1134 + * If no merge is possible and the range does not span the entirety of the VMA, 1135 + * we then need to split the VMA to accommodate the change. 1136 + * 1137 + * The function returns either the merged VMA, the original VMA if a split was 1138 + * required instead, or an error if the split failed. 
1139 + */ 1140 + struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1141 + struct vm_area_struct *prev, 1142 + struct vm_area_struct *vma, 1143 + unsigned long start, unsigned long end, 1144 + unsigned long vm_flags, 1145 + struct mempolicy *policy, 1146 + struct vm_userfaultfd_ctx uffd_ctx, 1147 + struct anon_vma_name *anon_name) 1148 + { 1149 + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 1150 + struct vm_area_struct *merged; 1151 + 1152 + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, 1153 + pgoff, policy, uffd_ctx, anon_name); 1154 + if (merged) 1155 + return merged; 1156 + 1157 + if (vma->vm_start < start) { 1158 + int err = split_vma(vmi, vma, start, 1); 1159 + 1160 + if (err) 1161 + return ERR_PTR(err); 1162 + } 1163 + 1164 + if (vma->vm_end > end) { 1165 + int err = split_vma(vmi, vma, end, 0); 1166 + 1167 + if (err) 1168 + return ERR_PTR(err); 1169 + } 1170 + 1171 + return vma; 1172 + } 1173 + 1174 + /* 1175 + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller 1176 + * must ensure that [start, end) does not overlap any existing VMA. 1177 + */ 1178 + struct vm_area_struct 1179 + *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, 1180 + struct vm_area_struct *vma, unsigned long start, 1181 + unsigned long end, pgoff_t pgoff) 1182 + { 1183 + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, 1184 + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1185 + } 1186 + 1187 + /* 1188 + * Expand vma by delta bytes, potentially merging with an immediately adjacent 1189 + * VMA with identical properties. 1190 + */ 1191 + struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1192 + struct vm_area_struct *vma, 1193 + unsigned long delta) 1194 + { 1195 + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); 1196 + 1197 + /* vma is specified as prev, so case 1 or 2 will apply. 
*/ 1198 + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, 1199 + vma->vm_flags, pgoff, vma_policy(vma), 1200 + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1201 + } 1202 + 1203 + void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) 1204 + { 1205 + vb->count = 0; 1206 + } 1207 + 1208 + static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) 1209 + { 1210 + struct address_space *mapping; 1211 + int i; 1212 + 1213 + mapping = vb->vmas[0]->vm_file->f_mapping; 1214 + i_mmap_lock_write(mapping); 1215 + for (i = 0; i < vb->count; i++) { 1216 + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); 1217 + __remove_shared_vm_struct(vb->vmas[i], mapping); 1218 + } 1219 + i_mmap_unlock_write(mapping); 1220 + 1221 + unlink_file_vma_batch_init(vb); 1222 + } 1223 + 1224 + void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 1225 + struct vm_area_struct *vma) 1226 + { 1227 + if (vma->vm_file == NULL) 1228 + return; 1229 + 1230 + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || 1231 + vb->count == ARRAY_SIZE(vb->vmas)) 1232 + unlink_file_vma_batch_process(vb); 1233 + 1234 + vb->vmas[vb->count] = vma; 1235 + vb->count++; 1236 + } 1237 + 1238 + void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) 1239 + { 1240 + if (vb->count > 0) 1241 + unlink_file_vma_batch_process(vb); 1242 + } 1243 + 1244 + /* 1245 + * Unlink a file-based vm structure from its interval tree, to hide 1246 + * vma from rmap and vmtruncate before freeing its page tables. 
1247 + */ 1248 + void unlink_file_vma(struct vm_area_struct *vma) 1249 + { 1250 + struct file *file = vma->vm_file; 1251 + 1252 + if (file) { 1253 + struct address_space *mapping = file->f_mapping; 1254 + 1255 + i_mmap_lock_write(mapping); 1256 + __remove_shared_vm_struct(vma, mapping); 1257 + i_mmap_unlock_write(mapping); 1258 + } 1259 + } 1260 + 1261 + void vma_link_file(struct vm_area_struct *vma) 1262 + { 1263 + struct file *file = vma->vm_file; 1264 + struct address_space *mapping; 1265 + 1266 + if (file) { 1267 + mapping = file->f_mapping; 1268 + i_mmap_lock_write(mapping); 1269 + __vma_link_file(vma, mapping); 1270 + i_mmap_unlock_write(mapping); 1271 + } 1272 + } 1273 + 1274 + int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) 1275 + { 1276 + VMA_ITERATOR(vmi, mm, 0); 1277 + 1278 + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); 1279 + if (vma_iter_prealloc(&vmi, vma)) 1280 + return -ENOMEM; 1281 + 1282 + vma_start_write(vma); 1283 + vma_iter_store(&vmi, vma); 1284 + vma_link_file(vma); 1285 + mm->map_count++; 1286 + validate_mm(mm); 1287 + return 0; 1288 + } 1289 + 1290 + /* 1291 + * Copy the vma structure to a new location in the same mm, 1292 + * prior to moving page table entries, to effect an mremap move. 1293 + */ 1294 + struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 1295 + unsigned long addr, unsigned long len, pgoff_t pgoff, 1296 + bool *need_rmap_locks) 1297 + { 1298 + struct vm_area_struct *vma = *vmap; 1299 + unsigned long vma_start = vma->vm_start; 1300 + struct mm_struct *mm = vma->vm_mm; 1301 + struct vm_area_struct *new_vma, *prev; 1302 + bool faulted_in_anon_vma = true; 1303 + VMA_ITERATOR(vmi, mm, addr); 1304 + 1305 + /* 1306 + * If anonymous vma has not yet been faulted, update new pgoff 1307 + * to match new location, to increase its chance of merging. 
1308 + */ 1309 + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { 1310 + pgoff = addr >> PAGE_SHIFT; 1311 + faulted_in_anon_vma = false; 1312 + } 1313 + 1314 + new_vma = find_vma_prev(mm, addr, &prev); 1315 + if (new_vma && new_vma->vm_start < addr + len) 1316 + return NULL; /* should never get here */ 1317 + 1318 + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); 1319 + if (new_vma) { 1320 + /* 1321 + * Source vma may have been merged into new_vma 1322 + */ 1323 + if (unlikely(vma_start >= new_vma->vm_start && 1324 + vma_start < new_vma->vm_end)) { 1325 + /* 1326 + * The only way we can get a vma_merge with 1327 + * self during an mremap is if the vma hasn't 1328 + * been faulted in yet and we were allowed to 1329 + * reset the dst vma->vm_pgoff to the 1330 + * destination address of the mremap to allow 1331 + * the merge to happen. mremap must change the 1332 + * vm_pgoff linearity between src and dst vmas 1333 + * (in turn preventing a vma_merge) to be 1334 + * safe. It is only safe to keep the vm_pgoff 1335 + * linear if there are no pages mapped yet. 
1336 + */ 1337 + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 1338 + *vmap = vma = new_vma; 1339 + } 1340 + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 1341 + } else { 1342 + new_vma = vm_area_dup(vma); 1343 + if (!new_vma) 1344 + goto out; 1345 + vma_set_range(new_vma, addr, addr + len, pgoff); 1346 + if (vma_dup_policy(vma, new_vma)) 1347 + goto out_free_vma; 1348 + if (anon_vma_clone(new_vma, vma)) 1349 + goto out_free_mempol; 1350 + if (new_vma->vm_file) 1351 + get_file(new_vma->vm_file); 1352 + if (new_vma->vm_ops && new_vma->vm_ops->open) 1353 + new_vma->vm_ops->open(new_vma); 1354 + if (vma_link(mm, new_vma)) 1355 + goto out_vma_link; 1356 + *need_rmap_locks = false; 1357 + } 1358 + return new_vma; 1359 + 1360 + out_vma_link: 1361 + if (new_vma->vm_ops && new_vma->vm_ops->close) 1362 + new_vma->vm_ops->close(new_vma); 1363 + 1364 + if (new_vma->vm_file) 1365 + fput(new_vma->vm_file); 1366 + 1367 + unlink_anon_vmas(new_vma); 1368 + out_free_mempol: 1369 + mpol_put(vma_policy(new_vma)); 1370 + out_free_vma: 1371 + vm_area_free(new_vma); 1372 + out: 1373 + return NULL; 1374 + } 1375 + 1376 + /* 1377 + * Rough compatibility check to quickly see if it's even worth looking 1378 + * at sharing an anon_vma. 1379 + * 1380 + * They need to have the same vm_file, and the flags can only differ 1381 + * in things that mprotect may change. 1382 + * 1383 + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1384 + * we can merge the two vma's. For example, we refuse to merge a vma if 1385 + * there is a vm_ops->close() function, because that indicates that the 1386 + * driver is doing some kind of reference counting. But that doesn't 1387 + * really matter for the anon_vma sharing case. 
1388 + */ 1389 + static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1390 + { 1391 + return a->vm_end == b->vm_start && 1392 + mpol_equal(vma_policy(a), vma_policy(b)) && 1393 + a->vm_file == b->vm_file && 1394 + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && 1395 + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1396 + } 1397 + 1398 + /* 1399 + * Do some basic sanity checking to see if we can re-use the anon_vma 1400 + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1401 + * the same as 'old', the other will be the new one that is trying 1402 + * to share the anon_vma. 1403 + * 1404 + * NOTE! This runs with mmap_lock held for reading, so it is possible that 1405 + * the anon_vma of 'old' is concurrently in the process of being set up 1406 + * by another page fault trying to merge _that_. But that's ok: if it 1407 + * is being set up, that automatically means that it will be a singleton 1408 + * acceptable for merging, so we can do all of this optimistically. But 1409 + * we do that READ_ONCE() to make sure that we never re-load the pointer. 1410 + * 1411 + * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1412 + * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1413 + * is to return an anon_vma that is "complex" due to having gone through 1414 + * a fork). 1415 + * 1416 + * We also make sure that the two vma's are compatible (adjacent, 1417 + * and with the same memory policies). That's all stable, even with just 1418 + * a read lock on the mmap_lock. 
1419 + */ 1420 + static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, 1421 + struct vm_area_struct *a, 1422 + struct vm_area_struct *b) 1423 + { 1424 + if (anon_vma_compatible(a, b)) { 1425 + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1426 + 1427 + if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1428 + return anon_vma; 1429 + } 1430 + return NULL; 1431 + } 1432 + 1433 + /* 1434 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1435 + * neighbouring vmas for a suitable anon_vma, before it goes off 1436 + * to allocate a new anon_vma. It checks because a repetitive 1437 + * sequence of mprotects and faults may otherwise lead to distinct 1438 + * anon_vmas being allocated, preventing vma merge in subsequent 1439 + * mprotect. 1440 + */ 1441 + struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1442 + { 1443 + struct anon_vma *anon_vma = NULL; 1444 + struct vm_area_struct *prev, *next; 1445 + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1446 + 1447 + /* Try next first. */ 1448 + next = vma_iter_load(&vmi); 1449 + if (next) { 1450 + anon_vma = reusable_anon_vma(next, vma, next); 1451 + if (anon_vma) 1452 + return anon_vma; 1453 + } 1454 + 1455 + prev = vma_prev(&vmi); 1456 + VM_BUG_ON_VMA(prev != vma, vma); 1457 + prev = vma_prev(&vmi); 1458 + /* Try prev next. */ 1459 + if (prev) 1460 + anon_vma = reusable_anon_vma(prev, prev, vma); 1461 + 1462 + /* 1463 + * We might reach here with anon_vma == NULL if we can't find 1464 + * any reusable anon_vma. 1465 + * There's no absolute need to look only at touching neighbours: 1466 + * we could search further afield for "compatible" anon_vmas. 1467 + * But it would probably just be a waste of time searching, 1468 + * or lead to too many vmas hanging off the same anon_vma. 1469 + * We're trying to allow mprotect remerging later on, 1470 + * not trying to minimize memory used for anon_vmas. 
1471 + */ 1472 + return anon_vma; 1473 + } 1474 + 1475 + static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 1476 + { 1477 + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 1478 + } 1479 + 1480 + static bool vma_is_shared_writable(struct vm_area_struct *vma) 1481 + { 1482 + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 1483 + (VM_WRITE | VM_SHARED); 1484 + } 1485 + 1486 + static bool vma_fs_can_writeback(struct vm_area_struct *vma) 1487 + { 1488 + /* No managed pages to writeback. */ 1489 + if (vma->vm_flags & VM_PFNMAP) 1490 + return false; 1491 + 1492 + return vma->vm_file && vma->vm_file->f_mapping && 1493 + mapping_can_writeback(vma->vm_file->f_mapping); 1494 + } 1495 + 1496 + /* 1497 + * Does this VMA require the underlying folios to have their dirty state 1498 + * tracked? 1499 + */ 1500 + bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 1501 + { 1502 + /* Only shared, writable VMAs require dirty tracking. */ 1503 + if (!vma_is_shared_writable(vma)) 1504 + return false; 1505 + 1506 + /* Does the filesystem need to be notified? */ 1507 + if (vm_ops_needs_writenotify(vma->vm_ops)) 1508 + return true; 1509 + 1510 + /* 1511 + * Even if the filesystem doesn't indicate a need for writenotify, if it 1512 + * can writeback, dirty tracking is still required. 1513 + */ 1514 + return vma_fs_can_writeback(vma); 1515 + } 1516 + 1517 + /* 1518 + * Some shared mappings will want the pages marked read-only 1519 + * to track write events. If so, we'll downgrade vm_page_prot 1520 + * to the private version (using protection_map[] without the 1521 + * VM_SHARED bit). 1522 + */ 1523 + bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 1524 + { 1525 + /* If it was private or non-writable, the write bit is already clear */ 1526 + if (!vma_is_shared_writable(vma)) 1527 + return false; 1528 + 1529 + /* The backer wishes to know when pages are first written to? 
*/ 1530 + if (vm_ops_needs_writenotify(vma->vm_ops)) 1531 + return true; 1532 + 1533 + /* The open routine did something to the protections that pgprot_modify 1534 + * won't preserve? */ 1535 + if (pgprot_val(vm_page_prot) != 1536 + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 1537 + return false; 1538 + 1539 + /* 1540 + * Do we need to track softdirty? hugetlb does not support softdirty 1541 + * tracking yet. 1542 + */ 1543 + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 1544 + return true; 1545 + 1546 + /* Do we need write faults for uffd-wp tracking? */ 1547 + if (userfaultfd_wp(vma)) 1548 + return true; 1549 + 1550 + /* Can the mapping track the dirty pages? */ 1551 + return vma_fs_can_writeback(vma); 1552 + } 1553 + 1554 + unsigned long count_vma_pages_range(struct mm_struct *mm, 1555 + unsigned long addr, unsigned long end) 1556 + { 1557 + VMA_ITERATOR(vmi, mm, addr); 1558 + struct vm_area_struct *vma; 1559 + unsigned long nr_pages = 0; 1560 + 1561 + for_each_vma_range(vmi, vma, end) { 1562 + unsigned long vm_start = max(addr, vma->vm_start); 1563 + unsigned long vm_end = min(end, vma->vm_end); 1564 + 1565 + nr_pages += PHYS_PFN(vm_end - vm_start); 1566 + } 1567 + 1568 + return nr_pages; 1569 + } 1570 + 1571 + static DEFINE_MUTEX(mm_all_locks_mutex); 1572 + 1573 + static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 1574 + { 1575 + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 1576 + /* 1577 + * The LSB of head.next can't change from under us 1578 + * because we hold the mm_all_locks_mutex. 1579 + */ 1580 + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 1581 + /* 1582 + * We can safely modify head.next after taking the 1583 + * anon_vma->root->rwsem. If some other vma in this mm shares 1584 + * the same anon_vma we won't take it again. 
1585 + * 1586 + * No need of atomic instructions here, head.next 1587 + * can't change from under us thanks to the 1588 + * anon_vma->root->rwsem. 1589 + */ 1590 + if (__test_and_set_bit(0, (unsigned long *) 1591 + &anon_vma->root->rb_root.rb_root.rb_node)) 1592 + BUG(); 1593 + } 1594 + } 1595 + 1596 + static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 1597 + { 1598 + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 1599 + /* 1600 + * AS_MM_ALL_LOCKS can't change from under us because 1601 + * we hold the mm_all_locks_mutex. 1602 + * 1603 + * Operations on ->flags have to be atomic because 1604 + * even if AS_MM_ALL_LOCKS is stable thanks to the 1605 + * mm_all_locks_mutex, there may be other cpus 1606 + * changing other bitflags in parallel to us. 1607 + */ 1608 + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 1609 + BUG(); 1610 + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 1611 + } 1612 + } 1613 + 1614 + /* 1615 + * This operation locks against the VM for all pte/vma/mm related 1616 + * operations that could ever happen on a certain mm. This includes 1617 + * vmtruncate, try_to_unmap, and all page faults. 1618 + * 1619 + * The caller must take the mmap_lock in write mode before calling 1620 + * mm_take_all_locks(). The caller isn't allowed to release the 1621 + * mmap_lock until mm_drop_all_locks() returns. 1622 + * 1623 + * mmap_lock in write mode is required in order to block all operations 1624 + * that could modify pagetables and free pages without need of 1625 + * altering the vma layout. It's also needed in write mode to avoid new 1626 + * anon_vmas to be associated with existing vmas. 1627 + * 1628 + * A single task can't take more than one mm_take_all_locks() in a row 1629 + * or it would deadlock. 
1630 + * 1631 + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 1632 + * mapping->flags avoid taking the same lock twice, if more than one 1633 + * vma in this mm is backed by the same anon_vma or address_space. 1634 + * 1635 + * We take locks in the following order, according to the comment at the beginning 1636 + * of mm/rmap.c: 1637 + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for 1638 + * hugetlb mapping); 1639 + * - all vmas marked locked 1640 + * - all i_mmap_rwsem locks; 1641 + * - all anon_vma->rwsem 1642 + * 1643 + * We can take all locks within these types randomly because the VM code 1644 + * doesn't nest them and we are protected from parallel mm_take_all_locks() by 1645 + * mm_all_locks_mutex. 1646 + * 1647 + * mm_take_all_locks() and mm_drop_all_locks() are expensive operations 1648 + * that may have to take thousands of locks. 1649 + * 1650 + * mm_take_all_locks() can fail if it's interrupted by signals. 1651 + */ 1652 + int mm_take_all_locks(struct mm_struct *mm) 1653 + { 1654 + struct vm_area_struct *vma; 1655 + struct anon_vma_chain *avc; 1656 + VMA_ITERATOR(vmi, mm, 0); 1657 + 1658 + mmap_assert_write_locked(mm); 1659 + 1660 + mutex_lock(&mm_all_locks_mutex); 1661 + 1662 + /* 1663 + * vma_start_write() does not have a complement in mm_drop_all_locks() 1664 + * because vma_start_write() is always asymmetrical; it marks a VMA as 1665 + * being written to until mmap_write_unlock() or mmap_write_downgrade() 1666 + * is reached. 
1667 + */ 1668 + for_each_vma(vmi, vma) { 1669 + if (signal_pending(current)) 1670 + goto out_unlock; 1671 + vma_start_write(vma); 1672 + } 1673 + 1674 + vma_iter_init(&vmi, mm, 0); 1675 + for_each_vma(vmi, vma) { 1676 + if (signal_pending(current)) 1677 + goto out_unlock; 1678 + if (vma->vm_file && vma->vm_file->f_mapping && 1679 + is_vm_hugetlb_page(vma)) 1680 + vm_lock_mapping(mm, vma->vm_file->f_mapping); 1681 + } 1682 + 1683 + vma_iter_init(&vmi, mm, 0); 1684 + for_each_vma(vmi, vma) { 1685 + if (signal_pending(current)) 1686 + goto out_unlock; 1687 + if (vma->vm_file && vma->vm_file->f_mapping && 1688 + !is_vm_hugetlb_page(vma)) 1689 + vm_lock_mapping(mm, vma->vm_file->f_mapping); 1690 + } 1691 + 1692 + vma_iter_init(&vmi, mm, 0); 1693 + for_each_vma(vmi, vma) { 1694 + if (signal_pending(current)) 1695 + goto out_unlock; 1696 + if (vma->anon_vma) 1697 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 1698 + vm_lock_anon_vma(mm, avc->anon_vma); 1699 + } 1700 + 1701 + return 0; 1702 + 1703 + out_unlock: 1704 + mm_drop_all_locks(mm); 1705 + return -EINTR; 1706 + } 1707 + 1708 + static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 1709 + { 1710 + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 1711 + /* 1712 + * The LSB of head.next can't change to 0 from under 1713 + * us because we hold the mm_all_locks_mutex. 1714 + * 1715 + * We must however clear the bitflag before unlocking 1716 + * the vma so the users using the anon_vma->rb_root will 1717 + * never see our bitflag. 1718 + * 1719 + * No need of atomic instructions here, head.next 1720 + * can't change from under us until we release the 1721 + * anon_vma->root->rwsem. 
1722 + */ 1723 + if (!__test_and_clear_bit(0, (unsigned long *) 1724 + &anon_vma->root->rb_root.rb_root.rb_node)) 1725 + BUG(); 1726 + anon_vma_unlock_write(anon_vma); 1727 + } 1728 + } 1729 + 1730 + static void vm_unlock_mapping(struct address_space *mapping) 1731 + { 1732 + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 1733 + /* 1734 + * AS_MM_ALL_LOCKS can't change to 0 from under us 1735 + * because we hold the mm_all_locks_mutex. 1736 + */ 1737 + i_mmap_unlock_write(mapping); 1738 + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 1739 + &mapping->flags)) 1740 + BUG(); 1741 + } 1742 + } 1743 + 1744 + /* 1745 + * The mmap_lock cannot be released by the caller until 1746 + * mm_drop_all_locks() returns. 1747 + */ 1748 + void mm_drop_all_locks(struct mm_struct *mm) 1749 + { 1750 + struct vm_area_struct *vma; 1751 + struct anon_vma_chain *avc; 1752 + VMA_ITERATOR(vmi, mm, 0); 1753 + 1754 + mmap_assert_write_locked(mm); 1755 + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 1756 + 1757 + for_each_vma(vmi, vma) { 1758 + if (vma->anon_vma) 1759 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 1760 + vm_unlock_anon_vma(avc->anon_vma); 1761 + if (vma->vm_file && vma->vm_file->f_mapping) 1762 + vm_unlock_mapping(vma->vm_file->f_mapping); 1763 + } 1764 + 1765 + mutex_unlock(&mm_all_locks_mutex); 1766 + }

+364

mm/vma.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * vma.h 4 + * 5 + * Core VMA manipulation API implemented in vma.c. 6 + */ 7 + #ifndef __MM_VMA_H 8 + #define __MM_VMA_H 9 + 10 + /* 11 + * VMA lock generalization 12 + */ 13 + struct vma_prepare { 14 + struct vm_area_struct *vma; 15 + struct vm_area_struct *adj_next; 16 + struct file *file; 17 + struct address_space *mapping; 18 + struct anon_vma *anon_vma; 19 + struct vm_area_struct *insert; 20 + struct vm_area_struct *remove; 21 + struct vm_area_struct *remove2; 22 + }; 23 + 24 + struct unlink_vma_file_batch { 25 + int count; 26 + struct vm_area_struct *vmas[8]; 27 + }; 28 + 29 + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 30 + void validate_mm(struct mm_struct *mm); 31 + #else 32 + #define validate_mm(mm) do { } while (0) 33 + #endif 34 + 35 + /* Required for expand_downwards(). */ 36 + void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); 37 + 38 + /* Required for expand_downwards(). */ 39 + void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); 40 + 41 + /* Required for do_brk_flags(). */ 42 + void vma_prepare(struct vma_prepare *vp); 43 + 44 + /* Required for do_brk_flags(). */ 45 + void init_vma_prep(struct vma_prepare *vp, 46 + struct vm_area_struct *vma); 47 + 48 + /* Required for do_brk_flags(). 
*/ 49 + void vma_complete(struct vma_prepare *vp, 50 + struct vma_iterator *vmi, struct mm_struct *mm); 51 + 52 + int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 53 + unsigned long start, unsigned long end, pgoff_t pgoff, 54 + struct vm_area_struct *next); 55 + 56 + int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 57 + unsigned long start, unsigned long end, pgoff_t pgoff); 58 + 59 + int 60 + do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, 61 + struct mm_struct *mm, unsigned long start, 62 + unsigned long end, struct list_head *uf, bool unlock); 63 + 64 + int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 65 + unsigned long start, size_t len, struct list_head *uf, 66 + bool unlock); 67 + 68 + void remove_vma(struct vm_area_struct *vma, bool unreachable); 69 + 70 + void unmap_region(struct mm_struct *mm, struct ma_state *mas, 71 + struct vm_area_struct *vma, struct vm_area_struct *prev, 72 + struct vm_area_struct *next, unsigned long start, 73 + unsigned long end, unsigned long tree_end, bool mm_wr_locked); 74 + 75 + /* Required by mmap_region(). */ 76 + bool 77 + can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 78 + struct anon_vma *anon_vma, struct file *file, 79 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 80 + struct anon_vma_name *anon_name); 81 + 82 + /* Required by mmap_region() and do_brk_flags(). 
*/ 83 + bool 84 + can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 85 + struct anon_vma *anon_vma, struct file *file, 86 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 87 + struct anon_vma_name *anon_name); 88 + 89 + struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 90 + struct vm_area_struct *prev, 91 + struct vm_area_struct *vma, 92 + unsigned long start, unsigned long end, 93 + unsigned long vm_flags, 94 + struct mempolicy *policy, 95 + struct vm_userfaultfd_ctx uffd_ctx, 96 + struct anon_vma_name *anon_name); 97 + 98 + /* We are about to modify the VMA's flags; the policy, uffd context and anon name are passed through from vma. */ 99 + static inline struct vm_area_struct 100 + *vma_modify_flags(struct vma_iterator *vmi, 101 + struct vm_area_struct *prev, 102 + struct vm_area_struct *vma, 103 + unsigned long start, unsigned long end, 104 + unsigned long new_flags) 105 + { 106 + return vma_modify(vmi, prev, vma, start, end, new_flags, 107 + vma_policy(vma), vma->vm_userfaultfd_ctx, 108 + anon_vma_name(vma)); 109 + } 110 + 111 + /* We are about to modify the VMA's flags and/or anon_name; the policy and uffd context are passed through from vma. */ 112 + static inline struct vm_area_struct 113 + *vma_modify_flags_name(struct vma_iterator *vmi, 114 + struct vm_area_struct *prev, 115 + struct vm_area_struct *vma, 116 + unsigned long start, 117 + unsigned long end, 118 + unsigned long new_flags, 119 + struct anon_vma_name *new_name) 120 + { 121 + return vma_modify(vmi, prev, vma, start, end, new_flags, 122 + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); 123 + } 124 + 125 + /* We are about to modify the VMA's memory policy. The flags, uffd context and anon name are passed through from vma. 
*/ 126 + static inline struct vm_area_struct 127 + *vma_modify_policy(struct vma_iterator *vmi, 128 + struct vm_area_struct *prev, 129 + struct vm_area_struct *vma, 130 + unsigned long start, unsigned long end, 131 + struct mempolicy *new_pol) 132 + { 133 + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, 134 + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 135 + } 136 + 137 + /* We are about to modify the VMA's flags and/or uffd context; the policy and anon name are passed through from vma. */ 138 + static inline struct vm_area_struct 139 + *vma_modify_flags_uffd(struct vma_iterator *vmi, 140 + struct vm_area_struct *prev, 141 + struct vm_area_struct *vma, 142 + unsigned long start, unsigned long end, 143 + unsigned long new_flags, 144 + struct vm_userfaultfd_ctx new_ctx) 145 + { 146 + return vma_modify(vmi, prev, vma, start, end, new_flags, 147 + vma_policy(vma), new_ctx, anon_vma_name(vma)); 148 + } 149 + 150 + struct vm_area_struct 151 + *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, 152 + struct vm_area_struct *vma, unsigned long start, 153 + unsigned long end, pgoff_t pgoff); 154 + 155 + struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 156 + struct vm_area_struct *vma, 157 + unsigned long delta); 158 + 159 + void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); 160 + 161 + void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); 162 + 163 + void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 164 + struct vm_area_struct *vma); 165 + 166 + void unlink_file_vma(struct vm_area_struct *vma); 167 + 168 + void vma_link_file(struct vm_area_struct *vma); 169 + 170 + int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); 171 + 172 + struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 173 + unsigned long addr, unsigned long len, pgoff_t pgoff, 174 + bool *need_rmap_locks); 175 + 176 + struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); 177 + 178 + bool vma_needs_dirty_tracking(struct 
vm_area_struct *vma); 179 + bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); 180 + 181 + int mm_take_all_locks(struct mm_struct *mm); 182 + void mm_drop_all_locks(struct mm_struct *mm); 183 + unsigned long count_vma_pages_range(struct mm_struct *mm, 184 + unsigned long addr, unsigned long end); 185 + 186 + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) 187 + { 188 + /* 189 + * We want to check manually if we can change individual PTEs writable 190 + * if we can't do that automatically for all PTEs in a mapping. For 191 + * private mappings, that's always the case when we have write 192 + * permissions as we properly have to handle COW. 193 + */ 194 + if (vma->vm_flags & VM_SHARED) 195 + return vma_wants_writenotify(vma, vma->vm_page_prot); 196 + return !!(vma->vm_flags & VM_WRITE); 197 + } 198 + 199 + #ifdef CONFIG_MMU 200 + static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) 201 + { 202 + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); 203 + } 204 + #endif 205 + 206 + static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, 207 + unsigned long min) 208 + { 209 + return mas_prev(&vmi->mas, min); 210 + } 211 + 212 + static inline int vma_iter_store_gfp(struct vma_iterator *vmi, 213 + struct vm_area_struct *vma, gfp_t gfp) 214 + { 215 + if (vmi->mas.status != ma_start && 216 + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 217 + vma_iter_invalidate(vmi); 218 + 219 + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 220 + mas_store_gfp(&vmi->mas, vma, gfp); 221 + if (unlikely(mas_is_err(&vmi->mas))) 222 + return -ENOMEM; 223 + 224 + return 0; 225 + } 226 + 227 + 228 + /* 229 + * These three helpers classify VMAs for virtual memory accounting. 
230 + */ 231 + 232 + /* 233 + * Executable code area - executable, not writable, not stack 234 + */ 235 + static inline bool is_exec_mapping(vm_flags_t flags) 236 + { 237 + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; 238 + } 239 + 240 + /* 241 + * Stack area (including shadow stacks) 242 + * 243 + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: 244 + * do_mmap() forbids all other combinations. 245 + */ 246 + static inline bool is_stack_mapping(vm_flags_t flags) 247 + { 248 + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); 249 + } 250 + 251 + /* 252 + * Data area - private, writable, not stack 253 + */ 254 + static inline bool is_data_mapping(vm_flags_t flags) 255 + { 256 + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; 257 + } 258 + 259 + 260 + static inline void vma_iter_config(struct vma_iterator *vmi, 261 + unsigned long index, unsigned long last) 262 + { 263 + __mas_set_range(&vmi->mas, index, last - 1); 264 + } 265 + 266 + static inline void vma_iter_reset(struct vma_iterator *vmi) 267 + { 268 + mas_reset(&vmi->mas); 269 + } 270 + 271 + static inline 272 + struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) 273 + { 274 + return mas_prev_range(&vmi->mas, min); 275 + } 276 + 277 + static inline 278 + struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) 279 + { 280 + return mas_next_range(&vmi->mas, max); 281 + } 282 + 283 + static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, 284 + unsigned long max, unsigned long size) 285 + { 286 + return mas_empty_area(&vmi->mas, min, max - 1, size); 287 + } 288 + 289 + static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, 290 + unsigned long max, unsigned long size) 291 + { 292 + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); 293 + } 294 + 295 + /* 296 + * VMA Iterator functions shared between nommu 
and mmap 297 + */ 298 + static inline int vma_iter_prealloc(struct vma_iterator *vmi, 299 + struct vm_area_struct *vma) 300 + { 301 + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); 302 + } 303 + 304 + static inline void vma_iter_clear(struct vma_iterator *vmi) 305 + { 306 + mas_store_prealloc(&vmi->mas, NULL); 307 + } 308 + 309 + static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) 310 + { 311 + return mas_walk(&vmi->mas); 312 + } 313 + 314 + /* Store a VMA with preallocated memory. If the iterator is not in the ma_start state and its current range does not contain vma->vm_start, it is invalidated first; the range is then set to [vm_start, vm_end - 1] before the store. */ 315 + static inline void vma_iter_store(struct vma_iterator *vmi, 316 + struct vm_area_struct *vma) 317 + { 318 + 319 + #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 320 + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 321 + vmi->mas.index > vma->vm_start)) { 322 + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", 323 + vmi->mas.index, vma->vm_start, vma->vm_start, 324 + vma->vm_end, vmi->mas.index, vmi->mas.last); 325 + } 326 + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 327 + vmi->mas.last < vma->vm_start)) { 328 + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", 329 + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, 330 + vmi->mas.index, vmi->mas.last); 331 + } 332 + #endif 333 + 334 + if (vmi->mas.status != ma_start && 335 + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 336 + vma_iter_invalidate(vmi); 337 + 338 + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 339 + mas_store_prealloc(&vmi->mas, vma); 340 + } 341 + 342 + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) 343 + { 344 + return vmi->mas.index; 345 + } 346 + 347 + static inline unsigned long vma_iter_end(struct vma_iterator *vmi) 348 + { 349 + return vmi->mas.last + 1; 350 + } 351 + 352 + static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, 353 + unsigned long count) 354 + { 355 + return mas_expected_entries(&vmi->mas, count); 356 + } 357 + 358 + static inline 359 + struct 
vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) 360 + { 361 + return mas_prev_range(&vmi->mas, 0); 362 + } 363 + 364 + #endif /* __MM_VMA_H */

+50

mm/vma_internal.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * vma_internal.h 4 + * 5 + * Headers required by vma.c, which can be substituted accordingly when testing 6 + * VMA functionality. 7 + */ 8 + 9 + #ifndef __MM_VMA_INTERNAL_H 10 + #define __MM_VMA_INTERNAL_H 11 + 12 + #include <linux/backing-dev.h> 13 + #include <linux/bitops.h> 14 + #include <linux/bug.h> 15 + #include <linux/bug.h> 16 + #include <linux/cacheflush.h> 17 + #include <linux/err.h> 18 + #include <linux/file.h> 19 + #include <linux/fs.h> 20 + #include <linux/huge_mm.h> 21 + #include <linux/hugetlb_inline.h> 22 + #include <linux/kernel.h> 23 + #include <linux/khugepaged.h> 24 + #include <linux/list.h> 25 + #include <linux/maple_tree.h> 26 + #include <linux/mempolicy.h> 27 + #include <linux/mm.h> 28 + #include <linux/mm_inline.h> 29 + #include <linux/mm_types.h> 30 + #include <linux/mman.h> 31 + #include <linux/mmap_lock.h> 32 + #include <linux/mmdebug.h> 33 + #include <linux/mmu_context.h> 34 + #include <linux/mutex.h> 35 + #include <linux/pagemap.h> 36 + #include <linux/pfn.h> 37 + #include <linux/rcupdate.h> 38 + #include <linux/rmap.h> 39 + #include <linux/rwsem.h> 40 + #include <linux/sched/signal.h> 41 + #include <linux/swap.h> 42 + #include <linux/uprobes.h> 43 + #include <linux/userfaultfd_k.h> 44 + 45 + #include <asm/current.h> 46 + #include <asm/tlb.h> 47 + 48 + #include "internal.h" 49 + 50 + #endif /* __MM_VMA_INTERNAL_H */

Configure Feed

Configure Feed