Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: move internal core VMA manipulation functions to own file

This patch introduces vma.c and moves internal core VMA manipulation
functions to this file from mmap.c.

This allows us to isolate VMA functionality in a single place such that we
can create userspace testing code that invokes this functionality in an
environment where we can implement simple unit tests of core
functionality.

This patch ensures that core VMA functionality is explicitly marked as
such by its presence in mm/vma.h.

It also places the header includes required by vma.c in vma_internal.h,
which is simply included by vma.c. This makes the VMA functionality
testable, as userland testing code can simply stub out functionality as
required.

Link: https://lkml.kernel.org/r/c77a6aafb4c42aaadb8e7271a853658cbdca2e22.1722251717.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Gow <davidgow@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kees Cook <kees@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rae Moar <rmoar@google.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
49b1b8d6 d61f0d59

+2188 -2039
-35
include/linux/mm.h
··· 1005 1005 return mas_prev(&vmi->mas, 0); 1006 1006 } 1007 1007 1008 - static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) 1009 - { 1010 - return vmi->mas.index; 1011 - } 1012 - 1013 - static inline unsigned long vma_iter_end(struct vma_iterator *vmi) 1014 - { 1015 - return vmi->mas.last + 1; 1016 - } 1017 - static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, 1018 - unsigned long count) 1019 - { 1020 - return mas_expected_entries(&vmi->mas, count); 1021 - } 1022 - 1023 1008 static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, 1024 1009 unsigned long start, unsigned long end, gfp_t gfp) 1025 1010 { ··· 2519 2534 #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ 2520 2535 MM_CP_UFFD_WP_RESOLVE) 2521 2536 2522 - bool vma_needs_dirty_tracking(struct vm_area_struct *vma); 2523 - bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); 2524 - static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) 2525 - { 2526 - /* 2527 - * We want to check manually if we can change individual PTEs writable 2528 - * if we can't do that automatically for all PTEs in a mapping. For 2529 - * private mappings, that's always the case when we have write 2530 - * permissions as we properly have to handle COW. 
2531 - */ 2532 - if (vma->vm_flags & VM_SHARED) 2533 - return vma_wants_writenotify(vma, vma->vm_page_prot); 2534 - return !!(vma->vm_flags & VM_WRITE); 2535 - 2536 - } 2537 2537 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, 2538 2538 pte_t pte); 2539 2539 extern long change_protection(struct mmu_gather *tlb, ··· 3226 3256 3227 3257 /* mmap.c */ 3228 3258 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); 3229 - extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 3230 3259 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 3231 - extern void unlink_file_vma(struct vm_area_struct *); 3232 - extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 3233 - unsigned long addr, unsigned long len, pgoff_t pgoff, 3234 - bool *need_rmap_locks); 3235 3260 extern void exit_mmap(struct mm_struct *); 3236 3261 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); 3237 3262
+1 -1
mm/Makefile
··· 37 37 mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ 38 38 mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ 39 39 msync.o page_vma_mapped.o pagewalk.o \ 40 - pgtable-generic.o rmap.o vmalloc.o 40 + pgtable-generic.o rmap.o vmalloc.o vma.o 41 41 42 42 43 43 ifdef CONFIG_CROSS_MEMORY_ATTACH
+5 -231
mm/internal.h
··· 8 8 #define __MM_INTERNAL_H 9 9 10 10 #include <linux/fs.h> 11 + #include <linux/khugepaged.h> 11 12 #include <linux/mm.h> 13 + #include <linux/mm_inline.h> 12 14 #include <linux/pagemap.h> 13 15 #include <linux/rmap.h> 14 16 #include <linux/swap.h> 15 17 #include <linux/swapops.h> 16 18 #include <linux/tracepoint-defs.h> 19 + 20 + /* Internal core VMA manipulation functions. */ 21 + #include "vma.h" 17 22 18 23 struct folio_batch; 19 24 ··· 783 778 return list_empty(&area->free_list[migratetype]); 784 779 } 785 780 786 - /* 787 - * These three helpers classifies VMAs for virtual memory accounting. 788 - */ 789 - 790 - /* 791 - * Executable code area - executable, not writable, not stack 792 - */ 793 - static inline bool is_exec_mapping(vm_flags_t flags) 794 - { 795 - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; 796 - } 797 - 798 - /* 799 - * Stack area (including shadow stacks) 800 - * 801 - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: 802 - * do_mmap() forbids all other combinations. 
803 - */ 804 - static inline bool is_stack_mapping(vm_flags_t flags) 805 - { 806 - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); 807 - } 808 - 809 - /* 810 - * Data area - private, writable, not stack 811 - */ 812 - static inline bool is_data_mapping(vm_flags_t flags) 813 - { 814 - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; 815 - } 816 - 817 781 /* mm/util.c */ 818 782 struct anon_vma *folio_anon_vma(struct folio *folio); 819 783 ··· 1211 1237 void touch_pmd(struct vm_area_struct *vma, unsigned long addr, 1212 1238 pmd_t *pmd, bool write); 1213 1239 1214 - /* 1215 - * mm/mmap.c 1216 - */ 1217 - struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1218 - struct vm_area_struct *vma, 1219 - unsigned long delta); 1220 - 1221 - struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1222 - struct vm_area_struct *prev, 1223 - struct vm_area_struct *vma, 1224 - unsigned long start, unsigned long end, 1225 - unsigned long vm_flags, 1226 - struct mempolicy *policy, 1227 - struct vm_userfaultfd_ctx uffd_ctx, 1228 - struct anon_vma_name *anon_name); 1229 - 1230 - /* We are about to modify the VMA's flags. */ 1231 - static inline struct vm_area_struct 1232 - *vma_modify_flags(struct vma_iterator *vmi, 1233 - struct vm_area_struct *prev, 1234 - struct vm_area_struct *vma, 1235 - unsigned long start, unsigned long end, 1236 - unsigned long new_flags) 1237 - { 1238 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1239 - vma_policy(vma), vma->vm_userfaultfd_ctx, 1240 - anon_vma_name(vma)); 1241 - } 1242 - 1243 - /* We are about to modify the VMA's flags and/or anon_name. 
*/ 1244 - static inline struct vm_area_struct 1245 - *vma_modify_flags_name(struct vma_iterator *vmi, 1246 - struct vm_area_struct *prev, 1247 - struct vm_area_struct *vma, 1248 - unsigned long start, 1249 - unsigned long end, 1250 - unsigned long new_flags, 1251 - struct anon_vma_name *new_name) 1252 - { 1253 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1254 - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); 1255 - } 1256 - 1257 - /* We are about to modify the VMA's memory policy. */ 1258 - static inline struct vm_area_struct 1259 - *vma_modify_policy(struct vma_iterator *vmi, 1260 - struct vm_area_struct *prev, 1261 - struct vm_area_struct *vma, 1262 - unsigned long start, unsigned long end, 1263 - struct mempolicy *new_pol) 1264 - { 1265 - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, 1266 - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1267 - } 1268 - 1269 - /* We are about to modify the VMA's flags and/or uffd context. */ 1270 - static inline struct vm_area_struct 1271 - *vma_modify_flags_uffd(struct vma_iterator *vmi, 1272 - struct vm_area_struct *prev, 1273 - struct vm_area_struct *vma, 1274 - unsigned long start, unsigned long end, 1275 - unsigned long new_flags, 1276 - struct vm_userfaultfd_ctx new_ctx) 1277 - { 1278 - return vma_modify(vmi, prev, vma, start, end, new_flags, 1279 - vma_policy(vma), new_ctx, anon_vma_name(vma)); 1280 - } 1281 - 1282 - int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 1283 - unsigned long start, unsigned long end, pgoff_t pgoff, 1284 - struct vm_area_struct *next); 1285 - int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 1286 - unsigned long start, unsigned long end, pgoff_t pgoff); 1287 - 1288 1240 enum { 1289 1241 /* mark page accessed */ 1290 1242 FOLL_TOUCH = 1 << 16, ··· 1337 1437 return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); 1338 1438 } 1339 1439 1340 - static inline void vma_iter_config(struct vma_iterator *vmi, 1341 - 
unsigned long index, unsigned long last) 1342 - { 1343 - __mas_set_range(&vmi->mas, index, last - 1); 1344 - } 1345 - 1346 - static inline void vma_iter_reset(struct vma_iterator *vmi) 1347 - { 1348 - mas_reset(&vmi->mas); 1349 - } 1350 - 1351 - static inline 1352 - struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) 1353 - { 1354 - return mas_prev_range(&vmi->mas, min); 1355 - } 1356 - 1357 - static inline 1358 - struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) 1359 - { 1360 - return mas_next_range(&vmi->mas, max); 1361 - } 1362 - 1363 - static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, 1364 - unsigned long max, unsigned long size) 1365 - { 1366 - return mas_empty_area(&vmi->mas, min, max - 1, size); 1367 - } 1368 - 1369 - static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, 1370 - unsigned long max, unsigned long size) 1371 - { 1372 - return mas_empty_area_rev(&vmi->mas, min, max - 1, size); 1373 - } 1374 - 1375 - /* 1376 - * VMA Iterator functions shared between nommu and mmap 1377 - */ 1378 - static inline int vma_iter_prealloc(struct vma_iterator *vmi, 1379 - struct vm_area_struct *vma) 1380 - { 1381 - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); 1382 - } 1383 - 1384 - static inline void vma_iter_clear(struct vma_iterator *vmi) 1385 - { 1386 - mas_store_prealloc(&vmi->mas, NULL); 1387 - } 1388 - 1389 - static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) 1390 - { 1391 - return mas_walk(&vmi->mas); 1392 - } 1393 - 1394 - /* Store a VMA with preallocated memory */ 1395 - static inline void vma_iter_store(struct vma_iterator *vmi, 1396 - struct vm_area_struct *vma) 1397 - { 1398 - 1399 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 1400 - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 1401 - vmi->mas.index > vma->vm_start)) { 1402 - pr_warn("%lx > %lx\n store vma %lx-%lx\n 
into slot %lx-%lx\n", 1403 - vmi->mas.index, vma->vm_start, vma->vm_start, 1404 - vma->vm_end, vmi->mas.index, vmi->mas.last); 1405 - } 1406 - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && 1407 - vmi->mas.last < vma->vm_start)) { 1408 - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", 1409 - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, 1410 - vmi->mas.index, vmi->mas.last); 1411 - } 1412 - #endif 1413 - 1414 - if (vmi->mas.status != ma_start && 1415 - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 1416 - vma_iter_invalidate(vmi); 1417 - 1418 - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 1419 - mas_store_prealloc(&vmi->mas, vma); 1420 - } 1421 - 1422 - static inline int vma_iter_store_gfp(struct vma_iterator *vmi, 1423 - struct vm_area_struct *vma, gfp_t gfp) 1424 - { 1425 - if (vmi->mas.status != ma_start && 1426 - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) 1427 - vma_iter_invalidate(vmi); 1428 - 1429 - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); 1430 - mas_store_gfp(&vmi->mas, vma, gfp); 1431 - if (unlikely(mas_is_err(&vmi->mas))) 1432 - return -ENOMEM; 1433 - 1434 - return 0; 1435 - } 1436 - 1437 - static inline 1438 - struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) 1439 - { 1440 - return mas_prev_range(&vmi->mas, 0); 1441 - } 1442 - 1443 - /* 1444 - * VMA lock generalization 1445 - */ 1446 - struct vma_prepare { 1447 - struct vm_area_struct *vma; 1448 - struct vm_area_struct *adj_next; 1449 - struct file *file; 1450 - struct address_space *mapping; 1451 - struct anon_vma *anon_vma; 1452 - struct vm_area_struct *insert; 1453 - struct vm_area_struct *remove; 1454 - struct vm_area_struct *remove2; 1455 - }; 1456 - 1457 1440 void __meminit __init_single_page(struct page *page, unsigned long pfn, 1458 1441 unsigned long zone, int nid); 1459 1442 ··· 1424 1641 /* Only track the nodes of mappings with shadow entries */ 1425 1642 
void workingset_update_node(struct xa_node *node); 1426 1643 extern struct list_lru shadow_nodes; 1427 - 1428 - struct unlink_vma_file_batch { 1429 - int count; 1430 - struct vm_area_struct *vmas[8]; 1431 - }; 1432 - 1433 - void unlink_file_vma_batch_init(struct unlink_vma_file_batch *); 1434 - void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *); 1435 - void unlink_file_vma_batch_final(struct unlink_vma_file_batch *); 1436 1644 1437 1645 /* mremap.c */ 1438 1646 unsigned long move_page_tables(struct vm_area_struct *vma,
-1772
mm/mmap.c
··· 76 76 static bool ignore_rlimit_data; 77 77 core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); 78 78 79 - static void unmap_region(struct mm_struct *mm, struct ma_state *mas, 80 - struct vm_area_struct *vma, struct vm_area_struct *prev, 81 - struct vm_area_struct *next, unsigned long start, 82 - unsigned long end, unsigned long tree_end, bool mm_wr_locked); 83 - 84 - static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) 85 - { 86 - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); 87 - } 88 - 89 79 /* Update vma->vm_page_prot to reflect vma->vm_flags. */ 90 80 void vma_set_page_prot(struct vm_area_struct *vma) 91 81 { ··· 89 99 } 90 100 /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ 91 101 WRITE_ONCE(vma->vm_page_prot, vm_page_prot); 92 - } 93 - 94 - /* 95 - * Requires inode->i_mapping->i_mmap_rwsem 96 - */ 97 - static void __remove_shared_vm_struct(struct vm_area_struct *vma, 98 - struct address_space *mapping) 99 - { 100 - if (vma_is_shared_maywrite(vma)) 101 - mapping_unmap_writable(mapping); 102 - 103 - flush_dcache_mmap_lock(mapping); 104 - vma_interval_tree_remove(vma, &mapping->i_mmap); 105 - flush_dcache_mmap_unlock(mapping); 106 - } 107 - 108 - /* 109 - * Unlink a file-based vm structure from its interval tree, to hide 110 - * vma from rmap and vmtruncate before freeing its page tables. 
111 - */ 112 - void unlink_file_vma(struct vm_area_struct *vma) 113 - { 114 - struct file *file = vma->vm_file; 115 - 116 - if (file) { 117 - struct address_space *mapping = file->f_mapping; 118 - i_mmap_lock_write(mapping); 119 - __remove_shared_vm_struct(vma, mapping); 120 - i_mmap_unlock_write(mapping); 121 - } 122 - } 123 - 124 - void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) 125 - { 126 - vb->count = 0; 127 - } 128 - 129 - static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) 130 - { 131 - struct address_space *mapping; 132 - int i; 133 - 134 - mapping = vb->vmas[0]->vm_file->f_mapping; 135 - i_mmap_lock_write(mapping); 136 - for (i = 0; i < vb->count; i++) { 137 - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); 138 - __remove_shared_vm_struct(vb->vmas[i], mapping); 139 - } 140 - i_mmap_unlock_write(mapping); 141 - 142 - unlink_file_vma_batch_init(vb); 143 - } 144 - 145 - void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 146 - struct vm_area_struct *vma) 147 - { 148 - if (vma->vm_file == NULL) 149 - return; 150 - 151 - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || 152 - vb->count == ARRAY_SIZE(vb->vmas)) 153 - unlink_file_vma_batch_process(vb); 154 - 155 - vb->vmas[vb->count] = vma; 156 - vb->count++; 157 - } 158 - 159 - void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) 160 - { 161 - if (vb->count > 0) 162 - unlink_file_vma_batch_process(vb); 163 - } 164 - 165 - /* 166 - * Close a vm structure and free it. 
167 - */ 168 - static void remove_vma(struct vm_area_struct *vma, bool unreachable) 169 - { 170 - might_sleep(); 171 - if (vma->vm_ops && vma->vm_ops->close) 172 - vma->vm_ops->close(vma); 173 - if (vma->vm_file) 174 - fput(vma->vm_file); 175 - mpol_put(vma_policy(vma)); 176 - if (unreachable) 177 - __vm_area_free(vma); 178 - else 179 - vm_area_free(vma); 180 - } 181 - 182 - static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, 183 - unsigned long min) 184 - { 185 - return mas_prev(&vmi->mas, min); 186 102 } 187 103 188 104 /* ··· 212 316 mm->brk = origbrk; 213 317 mmap_write_unlock(mm); 214 318 return origbrk; 215 - } 216 - 217 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 218 - static void validate_mm(struct mm_struct *mm) 219 - { 220 - int bug = 0; 221 - int i = 0; 222 - struct vm_area_struct *vma; 223 - VMA_ITERATOR(vmi, mm, 0); 224 - 225 - mt_validate(&mm->mm_mt); 226 - for_each_vma(vmi, vma) { 227 - #ifdef CONFIG_DEBUG_VM_RB 228 - struct anon_vma *anon_vma = vma->anon_vma; 229 - struct anon_vma_chain *avc; 230 - #endif 231 - unsigned long vmi_start, vmi_end; 232 - bool warn = 0; 233 - 234 - vmi_start = vma_iter_addr(&vmi); 235 - vmi_end = vma_iter_end(&vmi); 236 - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) 237 - warn = 1; 238 - 239 - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) 240 - warn = 1; 241 - 242 - if (warn) { 243 - pr_emerg("issue in %s\n", current->comm); 244 - dump_stack(); 245 - dump_vma(vma); 246 - pr_emerg("tree range: %px start %lx end %lx\n", vma, 247 - vmi_start, vmi_end - 1); 248 - vma_iter_dump_tree(&vmi); 249 - } 250 - 251 - #ifdef CONFIG_DEBUG_VM_RB 252 - if (anon_vma) { 253 - anon_vma_lock_read(anon_vma); 254 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 255 - anon_vma_interval_tree_verify(avc); 256 - anon_vma_unlock_read(anon_vma); 257 - } 258 - #endif 259 - i++; 260 - } 261 - if (i != mm->map_count) { 262 - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); 263 - bug = 1; 
264 - } 265 - VM_BUG_ON_MM(bug, mm); 266 - } 267 - 268 - #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ 269 - #define validate_mm(mm) do { } while (0) 270 - #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ 271 - 272 - /* 273 - * vma has some anon_vma assigned, and is already inserted on that 274 - * anon_vma's interval trees. 275 - * 276 - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 277 - * vma must be removed from the anon_vma's interval trees using 278 - * anon_vma_interval_tree_pre_update_vma(). 279 - * 280 - * After the update, the vma will be reinserted using 281 - * anon_vma_interval_tree_post_update_vma(). 282 - * 283 - * The entire update must be protected by exclusive mmap_lock and by 284 - * the root anon_vma's mutex. 285 - */ 286 - static inline void 287 - anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 288 - { 289 - struct anon_vma_chain *avc; 290 - 291 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 292 - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 293 - } 294 - 295 - static inline void 296 - anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 297 - { 298 - struct anon_vma_chain *avc; 299 - 300 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 301 - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 302 - } 303 - 304 - static unsigned long count_vma_pages_range(struct mm_struct *mm, 305 - unsigned long addr, unsigned long end) 306 - { 307 - VMA_ITERATOR(vmi, mm, addr); 308 - struct vm_area_struct *vma; 309 - unsigned long nr_pages = 0; 310 - 311 - for_each_vma_range(vmi, vma, end) { 312 - unsigned long vm_start = max(addr, vma->vm_start); 313 - unsigned long vm_end = min(end, vma->vm_end); 314 - 315 - nr_pages += PHYS_PFN(vm_end - vm_start); 316 - } 317 - 318 - return nr_pages; 319 - } 320 - 321 - static void __vma_link_file(struct vm_area_struct *vma, 322 - struct address_space *mapping) 323 - { 324 - if (vma_is_shared_maywrite(vma)) 325 - 
mapping_allow_writable(mapping); 326 - 327 - flush_dcache_mmap_lock(mapping); 328 - vma_interval_tree_insert(vma, &mapping->i_mmap); 329 - flush_dcache_mmap_unlock(mapping); 330 - } 331 - 332 - static void vma_link_file(struct vm_area_struct *vma) 333 - { 334 - struct file *file = vma->vm_file; 335 - struct address_space *mapping; 336 - 337 - if (file) { 338 - mapping = file->f_mapping; 339 - i_mmap_lock_write(mapping); 340 - __vma_link_file(vma, mapping); 341 - i_mmap_unlock_write(mapping); 342 - } 343 - } 344 - 345 - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) 346 - { 347 - VMA_ITERATOR(vmi, mm, 0); 348 - 349 - vma_iter_config(&vmi, vma->vm_start, vma->vm_end); 350 - if (vma_iter_prealloc(&vmi, vma)) 351 - return -ENOMEM; 352 - 353 - vma_start_write(vma); 354 - vma_iter_store(&vmi, vma); 355 - vma_link_file(vma); 356 - mm->map_count++; 357 - validate_mm(mm); 358 - return 0; 359 - } 360 - 361 - /* 362 - * init_multi_vma_prep() - Initializer for struct vma_prepare 363 - * @vp: The vma_prepare struct 364 - * @vma: The vma that will be altered once locked 365 - * @next: The next vma if it is to be adjusted 366 - * @remove: The first vma to be removed 367 - * @remove2: The second vma to be removed 368 - */ 369 - static inline void init_multi_vma_prep(struct vma_prepare *vp, 370 - struct vm_area_struct *vma, struct vm_area_struct *next, 371 - struct vm_area_struct *remove, struct vm_area_struct *remove2) 372 - { 373 - memset(vp, 0, sizeof(struct vma_prepare)); 374 - vp->vma = vma; 375 - vp->anon_vma = vma->anon_vma; 376 - vp->remove = remove; 377 - vp->remove2 = remove2; 378 - vp->adj_next = next; 379 - if (!vp->anon_vma && next) 380 - vp->anon_vma = next->anon_vma; 381 - 382 - vp->file = vma->vm_file; 383 - if (vp->file) 384 - vp->mapping = vma->vm_file->f_mapping; 385 - 386 - } 387 - 388 - /* 389 - * init_vma_prep() - Initializer wrapper for vma_prepare struct 390 - * @vp: The vma_prepare struct 391 - * @vma: The vma that will be altered 
once locked 392 - */ 393 - static inline void init_vma_prep(struct vma_prepare *vp, 394 - struct vm_area_struct *vma) 395 - { 396 - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); 397 - } 398 - 399 - 400 - /* 401 - * vma_prepare() - Helper function for handling locking VMAs prior to altering 402 - * @vp: The initialized vma_prepare struct 403 - */ 404 - static inline void vma_prepare(struct vma_prepare *vp) 405 - { 406 - if (vp->file) { 407 - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); 408 - 409 - if (vp->adj_next) 410 - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, 411 - vp->adj_next->vm_end); 412 - 413 - i_mmap_lock_write(vp->mapping); 414 - if (vp->insert && vp->insert->vm_file) { 415 - /* 416 - * Put into interval tree now, so instantiated pages 417 - * are visible to arm/parisc __flush_dcache_page 418 - * throughout; but we cannot insert into address 419 - * space until vma start or end is updated. 420 - */ 421 - __vma_link_file(vp->insert, 422 - vp->insert->vm_file->f_mapping); 423 - } 424 - } 425 - 426 - if (vp->anon_vma) { 427 - anon_vma_lock_write(vp->anon_vma); 428 - anon_vma_interval_tree_pre_update_vma(vp->vma); 429 - if (vp->adj_next) 430 - anon_vma_interval_tree_pre_update_vma(vp->adj_next); 431 - } 432 - 433 - if (vp->file) { 434 - flush_dcache_mmap_lock(vp->mapping); 435 - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); 436 - if (vp->adj_next) 437 - vma_interval_tree_remove(vp->adj_next, 438 - &vp->mapping->i_mmap); 439 - } 440 - 441 - } 442 - 443 - /* 444 - * vma_complete- Helper function for handling the unlocking after altering VMAs, 445 - * or for inserting a VMA. 
446 - * 447 - * @vp: The vma_prepare struct 448 - * @vmi: The vma iterator 449 - * @mm: The mm_struct 450 - */ 451 - static inline void vma_complete(struct vma_prepare *vp, 452 - struct vma_iterator *vmi, struct mm_struct *mm) 453 - { 454 - if (vp->file) { 455 - if (vp->adj_next) 456 - vma_interval_tree_insert(vp->adj_next, 457 - &vp->mapping->i_mmap); 458 - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); 459 - flush_dcache_mmap_unlock(vp->mapping); 460 - } 461 - 462 - if (vp->remove && vp->file) { 463 - __remove_shared_vm_struct(vp->remove, vp->mapping); 464 - if (vp->remove2) 465 - __remove_shared_vm_struct(vp->remove2, vp->mapping); 466 - } else if (vp->insert) { 467 - /* 468 - * split_vma has split insert from vma, and needs 469 - * us to insert it before dropping the locks 470 - * (it may either follow vma or precede it). 471 - */ 472 - vma_iter_store(vmi, vp->insert); 473 - mm->map_count++; 474 - } 475 - 476 - if (vp->anon_vma) { 477 - anon_vma_interval_tree_post_update_vma(vp->vma); 478 - if (vp->adj_next) 479 - anon_vma_interval_tree_post_update_vma(vp->adj_next); 480 - anon_vma_unlock_write(vp->anon_vma); 481 - } 482 - 483 - if (vp->file) { 484 - i_mmap_unlock_write(vp->mapping); 485 - uprobe_mmap(vp->vma); 486 - 487 - if (vp->adj_next) 488 - uprobe_mmap(vp->adj_next); 489 - } 490 - 491 - if (vp->remove) { 492 - again: 493 - vma_mark_detached(vp->remove, true); 494 - if (vp->file) { 495 - uprobe_munmap(vp->remove, vp->remove->vm_start, 496 - vp->remove->vm_end); 497 - fput(vp->file); 498 - } 499 - if (vp->remove->anon_vma) 500 - anon_vma_merge(vp->vma, vp->remove); 501 - mm->map_count--; 502 - mpol_put(vma_policy(vp->remove)); 503 - if (!vp->remove2) 504 - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); 505 - vm_area_free(vp->remove); 506 - 507 - /* 508 - * In mprotect's case 6 (see comments on vma_merge), 509 - * we are removing both mid and next vmas 510 - */ 511 - if (vp->remove2) { 512 - vp->remove = vp->remove2; 513 - vp->remove2 = NULL; 
514 - goto again; 515 - } 516 - } 517 - if (vp->insert && vp->file) 518 - uprobe_mmap(vp->insert); 519 - validate_mm(mm); 520 - } 521 - 522 - /* 523 - * dup_anon_vma() - Helper function to duplicate anon_vma 524 - * @dst: The destination VMA 525 - * @src: The source VMA 526 - * @dup: Pointer to the destination VMA when successful. 527 - * 528 - * Returns: 0 on success. 529 - */ 530 - static inline int dup_anon_vma(struct vm_area_struct *dst, 531 - struct vm_area_struct *src, struct vm_area_struct **dup) 532 - { 533 - /* 534 - * Easily overlooked: when mprotect shifts the boundary, make sure the 535 - * expanding vma has anon_vma set if the shrinking vma had, to cover any 536 - * anon pages imported. 537 - */ 538 - if (src->anon_vma && !dst->anon_vma) { 539 - int ret; 540 - 541 - vma_assert_write_locked(dst); 542 - dst->anon_vma = src->anon_vma; 543 - ret = anon_vma_clone(dst, src); 544 - if (ret) 545 - return ret; 546 - 547 - *dup = dst; 548 - } 549 - 550 - return 0; 551 - } 552 - 553 - /* 554 - * vma_expand - Expand an existing VMA 555 - * 556 - * @vmi: The vma iterator 557 - * @vma: The vma to expand 558 - * @start: The start of the vma 559 - * @end: The exclusive end of the vma 560 - * @pgoff: The page offset of vma 561 - * @next: The current of next vma. 562 - * 563 - * Expand @vma to @start and @end. Can expand off the start and end. Will 564 - * expand over @next if it's different from @vma and @end == @next->vm_end. 565 - * Checking if the @vma can expand and merge with @next needs to be handled by 566 - * the caller. 
567 - * 568 - * Returns: 0 on success 569 - */ 570 - int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 571 - unsigned long start, unsigned long end, pgoff_t pgoff, 572 - struct vm_area_struct *next) 573 - { 574 - struct vm_area_struct *anon_dup = NULL; 575 - bool remove_next = false; 576 - struct vma_prepare vp; 577 - 578 - vma_start_write(vma); 579 - if (next && (vma != next) && (end == next->vm_end)) { 580 - int ret; 581 - 582 - remove_next = true; 583 - vma_start_write(next); 584 - ret = dup_anon_vma(vma, next, &anon_dup); 585 - if (ret) 586 - return ret; 587 - } 588 - 589 - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); 590 - /* Not merging but overwriting any part of next is not handled. */ 591 - VM_WARN_ON(next && !vp.remove && 592 - next != vma && end > next->vm_start); 593 - /* Only handles expanding */ 594 - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); 595 - 596 - /* Note: vma iterator must be pointing to 'start' */ 597 - vma_iter_config(vmi, start, end); 598 - if (vma_iter_prealloc(vmi, vma)) 599 - goto nomem; 600 - 601 - vma_prepare(&vp); 602 - vma_adjust_trans_huge(vma, start, end, 0); 603 - vma_set_range(vma, start, end, pgoff); 604 - vma_iter_store(vmi, vma); 605 - 606 - vma_complete(&vp, vmi, vma->vm_mm); 607 - return 0; 608 - 609 - nomem: 610 - if (anon_dup) 611 - unlink_anon_vmas(anon_dup); 612 - return -ENOMEM; 613 - } 614 - 615 - /* 616 - * vma_shrink() - Reduce an existing VMAs memory area 617 - * @vmi: The vma iterator 618 - * @vma: The VMA to modify 619 - * @start: The new start 620 - * @end: The new end 621 - * 622 - * Returns: 0 on success, -ENOMEM otherwise 623 - */ 624 - int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 625 - unsigned long start, unsigned long end, pgoff_t pgoff) 626 - { 627 - struct vma_prepare vp; 628 - 629 - WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); 630 - 631 - if (vma->vm_start < start) 632 - vma_iter_config(vmi, vma->vm_start, 
start); 633 - else 634 - vma_iter_config(vmi, end, vma->vm_end); 635 - 636 - if (vma_iter_prealloc(vmi, NULL)) 637 - return -ENOMEM; 638 - 639 - vma_start_write(vma); 640 - 641 - init_vma_prep(&vp, vma); 642 - vma_prepare(&vp); 643 - vma_adjust_trans_huge(vma, start, end, 0); 644 - 645 - vma_iter_clear(vmi); 646 - vma_set_range(vma, start, end, pgoff); 647 - vma_complete(&vp, vmi, vma->vm_mm); 648 - return 0; 649 - } 650 - 651 - /* 652 - * If the vma has a ->close operation then the driver probably needs to release 653 - * per-vma resources, so we don't attempt to merge those if the caller indicates 654 - * the current vma may be removed as part of the merge. 655 - */ 656 - static inline bool is_mergeable_vma(struct vm_area_struct *vma, 657 - struct file *file, unsigned long vm_flags, 658 - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 659 - struct anon_vma_name *anon_name, bool may_remove_vma) 660 - { 661 - /* 662 - * VM_SOFTDIRTY should not prevent from VMA merging, if we 663 - * match the flags but dirty bit -- the caller should mark 664 - * merged VMA as dirty. If dirty bit won't be excluded from 665 - * comparison, we increase pressure on the memory system forcing 666 - * the kernel to generate new VMAs when old one could be 667 - * extended instead. 668 - */ 669 - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 670 - return false; 671 - if (vma->vm_file != file) 672 - return false; 673 - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) 674 - return false; 675 - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) 676 - return false; 677 - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) 678 - return false; 679 - return true; 680 - } 681 - 682 - static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, 683 - struct anon_vma *anon_vma2, struct vm_area_struct *vma) 684 - { 685 - /* 686 - * The list_is_singular() test is to avoid merging VMA cloned from 687 - * parents. This can improve scalability caused by anon_vma lock. 
688 - */ 689 - if ((!anon_vma1 || !anon_vma2) && (!vma || 690 - list_is_singular(&vma->anon_vma_chain))) 691 - return true; 692 - return anon_vma1 == anon_vma2; 693 - } 694 - 695 - /* 696 - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 697 - * in front of (at a lower virtual address and file offset than) the vma. 698 - * 699 - * We cannot merge two vmas if they have differently assigned (non-NULL) 700 - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 701 - * 702 - * We don't check here for the merged mmap wrapping around the end of pagecache 703 - * indices (16TB on ia32) because do_mmap() does not permit mmap's which 704 - * wrap, nor mmaps which cover the final page at index -1UL. 705 - * 706 - * We assume the vma may be removed as part of the merge. 707 - */ 708 - static bool 709 - can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 710 - struct anon_vma *anon_vma, struct file *file, 711 - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 712 - struct anon_vma_name *anon_name) 713 - { 714 - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && 715 - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 716 - if (vma->vm_pgoff == vm_pgoff) 717 - return true; 718 - } 719 - return false; 720 - } 721 - 722 - /* 723 - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 724 - * beyond (at a higher virtual address and file offset than) the vma. 725 - * 726 - * We cannot merge two vmas if they have differently assigned (non-NULL) 727 - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 728 - * 729 - * We assume that vma is not removed as part of the merge. 
730 - */ 731 - static bool 732 - can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 733 - struct anon_vma *anon_vma, struct file *file, 734 - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 735 - struct anon_vma_name *anon_name) 736 - { 737 - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && 738 - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 739 - pgoff_t vm_pglen; 740 - vm_pglen = vma_pages(vma); 741 - if (vma->vm_pgoff + vm_pglen == vm_pgoff) 742 - return true; 743 - } 744 - return false; 745 - } 746 - 747 - /* 748 - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), 749 - * figure out whether that can be merged with its predecessor or its 750 - * successor. Or both (it neatly fills a hole). 751 - * 752 - * In most cases - when called for mmap, brk or mremap - [addr,end) is 753 - * certain not to be mapped by the time vma_merge is called; but when 754 - * called for mprotect, it is certain to be already mapped (either at 755 - * an offset within prev, or at the start of next), and the flags of 756 - * this area are about to be changed to vm_flags - and the no-change 757 - * case has already been eliminated. 
758 - * 759 - * The following mprotect cases have to be considered, where **** is 760 - * the area passed down from mprotect_fixup, never extending beyond one 761 - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts 762 - * at the same address as **** and is of the same or larger span, and 763 - * NNNN the next vma after ****: 764 - * 765 - * **** **** **** 766 - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC 767 - * cannot merge might become might become 768 - * PPNNNNNNNNNN PPPPPPPPPPCC 769 - * mmap, brk or case 4 below case 5 below 770 - * mremap move: 771 - * **** **** 772 - * PPPP NNNN PPPPCCCCNNNN 773 - * might become might become 774 - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or 775 - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or 776 - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 777 - * 778 - * It is important for case 8 that the vma CCCC overlapping the 779 - * region **** is never going to extended over NNNN. Instead NNNN must 780 - * be extended in region **** and CCCC must be removed. This way in 781 - * all cases where vma_merge succeeds, the moment vma_merge drops the 782 - * rmap_locks, the properties of the merged vma will be already 783 - * correct for the whole merged range. Some of those properties like 784 - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must 785 - * be correct for the whole merged range immediately after the 786 - * rmap_locks are released. Otherwise if NNNN would be removed and 787 - * CCCC would be extended over the NNNN range, remove_migration_ptes 788 - * or other rmap walkers (if working on addresses beyond the "end" 789 - * parameter) may establish ptes with the wrong permissions of CCCC 790 - * instead of the right permissions of NNNN. 
791 - * 792 - * In the code below: 793 - * PPPP is represented by *prev 794 - * CCCC is represented by *curr or not represented at all (NULL) 795 - * NNNN is represented by *next or not represented at all (NULL) 796 - * **** is not represented - it will be merged and the vma containing the 797 - * area is returned, or the function will return NULL 798 - */ 799 - static struct vm_area_struct 800 - *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, 801 - struct vm_area_struct *src, unsigned long addr, unsigned long end, 802 - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, 803 - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 804 - struct anon_vma_name *anon_name) 805 - { 806 - struct mm_struct *mm = src->vm_mm; 807 - struct anon_vma *anon_vma = src->anon_vma; 808 - struct file *file = src->vm_file; 809 - struct vm_area_struct *curr, *next, *res; 810 - struct vm_area_struct *vma, *adjust, *remove, *remove2; 811 - struct vm_area_struct *anon_dup = NULL; 812 - struct vma_prepare vp; 813 - pgoff_t vma_pgoff; 814 - int err = 0; 815 - bool merge_prev = false; 816 - bool merge_next = false; 817 - bool vma_expanded = false; 818 - unsigned long vma_start = addr; 819 - unsigned long vma_end = end; 820 - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 821 - long adj_start = 0; 822 - 823 - /* 824 - * We later require that vma->vm_flags == vm_flags, 825 - * so this tests vma->vm_flags & VM_SPECIAL, too. 826 - */ 827 - if (vm_flags & VM_SPECIAL) 828 - return NULL; 829 - 830 - /* Does the input range span an existing VMA? (cases 5 - 8) */ 831 - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); 832 - 833 - if (!curr || /* cases 1 - 4 */ 834 - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ 835 - next = vma_lookup(mm, end); 836 - else 837 - next = NULL; /* case 5 */ 838 - 839 - if (prev) { 840 - vma_start = prev->vm_start; 841 - vma_pgoff = prev->vm_pgoff; 842 - 843 - /* Can we merge the predecessor? 
*/ 844 - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) 845 - && can_vma_merge_after(prev, vm_flags, anon_vma, file, 846 - pgoff, vm_userfaultfd_ctx, anon_name)) { 847 - merge_prev = true; 848 - vma_prev(vmi); 849 - } 850 - } 851 - 852 - /* Can we merge the successor? */ 853 - if (next && mpol_equal(policy, vma_policy(next)) && 854 - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, 855 - vm_userfaultfd_ctx, anon_name)) { 856 - merge_next = true; 857 - } 858 - 859 - /* Verify some invariant that must be enforced by the caller. */ 860 - VM_WARN_ON(prev && addr <= prev->vm_start); 861 - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); 862 - VM_WARN_ON(addr >= end); 863 - 864 - if (!merge_prev && !merge_next) 865 - return NULL; /* Not mergeable. */ 866 - 867 - if (merge_prev) 868 - vma_start_write(prev); 869 - 870 - res = vma = prev; 871 - remove = remove2 = adjust = NULL; 872 - 873 - /* Can we merge both the predecessor and the successor? */ 874 - if (merge_prev && merge_next && 875 - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { 876 - vma_start_write(next); 877 - remove = next; /* case 1 */ 878 - vma_end = next->vm_end; 879 - err = dup_anon_vma(prev, next, &anon_dup); 880 - if (curr) { /* case 6 */ 881 - vma_start_write(curr); 882 - remove = curr; 883 - remove2 = next; 884 - /* 885 - * Note that the dup_anon_vma below cannot overwrite err 886 - * since the first caller would do nothing unless next 887 - * has an anon_vma. 
888 - */ 889 - if (!next->anon_vma) 890 - err = dup_anon_vma(prev, curr, &anon_dup); 891 - } 892 - } else if (merge_prev) { /* case 2 */ 893 - if (curr) { 894 - vma_start_write(curr); 895 - if (end == curr->vm_end) { /* case 7 */ 896 - /* 897 - * can_vma_merge_after() assumed we would not be 898 - * removing prev vma, so it skipped the check 899 - * for vm_ops->close, but we are removing curr 900 - */ 901 - if (curr->vm_ops && curr->vm_ops->close) 902 - err = -EINVAL; 903 - remove = curr; 904 - } else { /* case 5 */ 905 - adjust = curr; 906 - adj_start = (end - curr->vm_start); 907 - } 908 - if (!err) 909 - err = dup_anon_vma(prev, curr, &anon_dup); 910 - } 911 - } else { /* merge_next */ 912 - vma_start_write(next); 913 - res = next; 914 - if (prev && addr < prev->vm_end) { /* case 4 */ 915 - vma_start_write(prev); 916 - vma_end = addr; 917 - adjust = next; 918 - adj_start = -(prev->vm_end - addr); 919 - err = dup_anon_vma(next, prev, &anon_dup); 920 - } else { 921 - /* 922 - * Note that cases 3 and 8 are the ONLY ones where prev 923 - * is permitted to be (but is not necessarily) NULL. 924 - */ 925 - vma = next; /* case 3 */ 926 - vma_start = addr; 927 - vma_end = next->vm_end; 928 - vma_pgoff = next->vm_pgoff - pglen; 929 - if (curr) { /* case 8 */ 930 - vma_pgoff = curr->vm_pgoff; 931 - vma_start_write(curr); 932 - remove = curr; 933 - err = dup_anon_vma(next, curr, &anon_dup); 934 - } 935 - } 936 - } 937 - 938 - /* Error in anon_vma clone. 
*/ 939 - if (err) 940 - goto anon_vma_fail; 941 - 942 - if (vma_start < vma->vm_start || vma_end > vma->vm_end) 943 - vma_expanded = true; 944 - 945 - if (vma_expanded) { 946 - vma_iter_config(vmi, vma_start, vma_end); 947 - } else { 948 - vma_iter_config(vmi, adjust->vm_start + adj_start, 949 - adjust->vm_end); 950 - } 951 - 952 - if (vma_iter_prealloc(vmi, vma)) 953 - goto prealloc_fail; 954 - 955 - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); 956 - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && 957 - vp.anon_vma != adjust->anon_vma); 958 - 959 - vma_prepare(&vp); 960 - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); 961 - vma_set_range(vma, vma_start, vma_end, vma_pgoff); 962 - 963 - if (vma_expanded) 964 - vma_iter_store(vmi, vma); 965 - 966 - if (adj_start) { 967 - adjust->vm_start += adj_start; 968 - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; 969 - if (adj_start < 0) { 970 - WARN_ON(vma_expanded); 971 - vma_iter_store(vmi, next); 972 - } 973 - } 974 - 975 - vma_complete(&vp, vmi, mm); 976 - khugepaged_enter_vma(res, vm_flags); 977 - return res; 978 - 979 - prealloc_fail: 980 - if (anon_dup) 981 - unlink_anon_vmas(anon_dup); 982 - 983 - anon_vma_fail: 984 - vma_iter_set(vmi, addr); 985 - vma_iter_load(vmi); 986 - return NULL; 987 - } 988 - 989 - /* 990 - * Rough compatibility check to quickly see if it's even worth looking 991 - * at sharing an anon_vma. 992 - * 993 - * They need to have the same vm_file, and the flags can only differ 994 - * in things that mprotect may change. 995 - * 996 - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 997 - * we can merge the two vma's. For example, we refuse to merge a vma if 998 - * there is a vm_ops->close() function, because that indicates that the 999 - * driver is doing some kind of reference counting. But that doesn't 1000 - * really matter for the anon_vma sharing case. 
1001 - */ 1002 - static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1003 - { 1004 - return a->vm_end == b->vm_start && 1005 - mpol_equal(vma_policy(a), vma_policy(b)) && 1006 - a->vm_file == b->vm_file && 1007 - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && 1008 - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1009 - } 1010 - 1011 - /* 1012 - * Do some basic sanity checking to see if we can re-use the anon_vma 1013 - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1014 - * the same as 'old', the other will be the new one that is trying 1015 - * to share the anon_vma. 1016 - * 1017 - * NOTE! This runs with mmap_lock held for reading, so it is possible that 1018 - * the anon_vma of 'old' is concurrently in the process of being set up 1019 - * by another page fault trying to merge _that_. But that's ok: if it 1020 - * is being set up, that automatically means that it will be a singleton 1021 - * acceptable for merging, so we can do all of this optimistically. But 1022 - * we do that READ_ONCE() to make sure that we never re-load the pointer. 1023 - * 1024 - * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1025 - * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1026 - * is to return an anon_vma that is "complex" due to having gone through 1027 - * a fork). 1028 - * 1029 - * We also make sure that the two vma's are compatible (adjacent, 1030 - * and with the same memory policies). That's all stable, even with just 1031 - * a read lock on the mmap_lock. 
1032 - */ 1033 - static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1034 - { 1035 - if (anon_vma_compatible(a, b)) { 1036 - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1037 - 1038 - if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1039 - return anon_vma; 1040 - } 1041 - return NULL; 1042 - } 1043 - 1044 - /* 1045 - * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1046 - * neighbouring vmas for a suitable anon_vma, before it goes off 1047 - * to allocate a new anon_vma. It checks because a repetitive 1048 - * sequence of mprotects and faults may otherwise lead to distinct 1049 - * anon_vmas being allocated, preventing vma merge in subsequent 1050 - * mprotect. 1051 - */ 1052 - struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1053 - { 1054 - struct anon_vma *anon_vma = NULL; 1055 - struct vm_area_struct *prev, *next; 1056 - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1057 - 1058 - /* Try next first. */ 1059 - next = vma_iter_load(&vmi); 1060 - if (next) { 1061 - anon_vma = reusable_anon_vma(next, vma, next); 1062 - if (anon_vma) 1063 - return anon_vma; 1064 - } 1065 - 1066 - prev = vma_prev(&vmi); 1067 - VM_BUG_ON_VMA(prev != vma, vma); 1068 - prev = vma_prev(&vmi); 1069 - /* Try prev next. */ 1070 - if (prev) 1071 - anon_vma = reusable_anon_vma(prev, prev, vma); 1072 - 1073 - /* 1074 - * We might reach here with anon_vma == NULL if we can't find 1075 - * any reusable anon_vma. 1076 - * There's no absolute need to look only at touching neighbours: 1077 - * we could search further afield for "compatible" anon_vmas. 1078 - * But it would probably just be a waste of time searching, 1079 - * or lead to too many vmas hanging off the same anon_vma. 1080 - * We're trying to allow mprotect remerging later on, 1081 - * not trying to minimize memory used for anon_vmas. 
1082 - */ 1083 - return anon_vma; 1084 319 } 1085 320 1086 321 /* ··· 575 1548 a.offset >> PAGE_SHIFT); 576 1549 } 577 1550 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 578 - 579 - static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 580 - { 581 - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 582 - } 583 - 584 - static bool vma_is_shared_writable(struct vm_area_struct *vma) 585 - { 586 - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 587 - (VM_WRITE | VM_SHARED); 588 - } 589 - 590 - static bool vma_fs_can_writeback(struct vm_area_struct *vma) 591 - { 592 - /* No managed pages to writeback. */ 593 - if (vma->vm_flags & VM_PFNMAP) 594 - return false; 595 - 596 - return vma->vm_file && vma->vm_file->f_mapping && 597 - mapping_can_writeback(vma->vm_file->f_mapping); 598 - } 599 - 600 - /* 601 - * Does this VMA require the underlying folios to have their dirty state 602 - * tracked? 603 - */ 604 - bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 605 - { 606 - /* Only shared, writable VMAs require dirty tracking. */ 607 - if (!vma_is_shared_writable(vma)) 608 - return false; 609 - 610 - /* Does the filesystem need to be notified? */ 611 - if (vm_ops_needs_writenotify(vma->vm_ops)) 612 - return true; 613 - 614 - /* 615 - * Even if the filesystem doesn't indicate a need for writenotify, if it 616 - * can writeback, dirty tracking is still required. 617 - */ 618 - return vma_fs_can_writeback(vma); 619 - } 620 - 621 - /* 622 - * Some shared mappings will want the pages marked read-only 623 - * to track write events. If so, we'll downgrade vm_page_prot 624 - * to the private version (using protection_map[] without the 625 - * VM_SHARED bit). 
626 - */ 627 - bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 628 - { 629 - /* If it was private or non-writable, the write bit is already clear */ 630 - if (!vma_is_shared_writable(vma)) 631 - return false; 632 - 633 - /* The backer wishes to know when pages are first written to? */ 634 - if (vm_ops_needs_writenotify(vma->vm_ops)) 635 - return true; 636 - 637 - /* The open routine did something to the protections that pgprot_modify 638 - * won't preserve? */ 639 - if (pgprot_val(vm_page_prot) != 640 - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 641 - return false; 642 - 643 - /* 644 - * Do we need to track softdirty? hugetlb does not support softdirty 645 - * tracking yet. 646 - */ 647 - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 648 - return true; 649 - 650 - /* Do we need write faults for uffd-wp tracking? */ 651 - if (userfaultfd_wp(vma)) 652 - return true; 653 - 654 - /* Can the mapping track the dirty pages? */ 655 - return vma_fs_can_writeback(vma); 656 - } 657 1551 658 1552 /* 659 1553 * We account for memory if it's a private writeable mapping, ··· 1341 2393 return vma; 1342 2394 } 1343 2395 1344 - /* 1345 - * Ok - we have the memory areas we should free on a maple tree so release them, 1346 - * and do the vma updates. 1347 - * 1348 - * Called with the mm semaphore held. 
1349 - */ 1350 - static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) 1351 - { 1352 - unsigned long nr_accounted = 0; 1353 - struct vm_area_struct *vma; 1354 - 1355 - /* Update high watermark before we lower total_vm */ 1356 - update_hiwater_vm(mm); 1357 - mas_for_each(mas, vma, ULONG_MAX) { 1358 - long nrpages = vma_pages(vma); 1359 - 1360 - if (vma->vm_flags & VM_ACCOUNT) 1361 - nr_accounted += nrpages; 1362 - vm_stat_account(mm, vma->vm_flags, -nrpages); 1363 - remove_vma(vma, false); 1364 - } 1365 - vm_unacct_memory(nr_accounted); 1366 - } 1367 - 1368 - /* 1369 - * Get rid of page table information in the indicated region. 1370 - * 1371 - * Called with the mm semaphore held. 1372 - */ 1373 - static void unmap_region(struct mm_struct *mm, struct ma_state *mas, 1374 - struct vm_area_struct *vma, struct vm_area_struct *prev, 1375 - struct vm_area_struct *next, unsigned long start, 1376 - unsigned long end, unsigned long tree_end, bool mm_wr_locked) 1377 - { 1378 - struct mmu_gather tlb; 1379 - unsigned long mt_start = mas->index; 1380 - 1381 - lru_add_drain(); 1382 - tlb_gather_mmu(&tlb, mm); 1383 - update_hiwater_rss(mm); 1384 - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); 1385 - mas_set(mas, mt_start); 1386 - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 1387 - next ? next->vm_start : USER_PGTABLES_CEILING, 1388 - mm_wr_locked); 1389 - tlb_finish_mmu(&tlb); 1390 - } 1391 - 1392 - /* 1393 - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it 1394 - * has already been checked or doesn't make sense to fail. 1395 - * VMA Iterator will point to the end VMA. 
1396 - */ 1397 - static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 1398 - unsigned long addr, int new_below) 1399 - { 1400 - struct vma_prepare vp; 1401 - struct vm_area_struct *new; 1402 - int err; 1403 - 1404 - WARN_ON(vma->vm_start >= addr); 1405 - WARN_ON(vma->vm_end <= addr); 1406 - 1407 - if (vma->vm_ops && vma->vm_ops->may_split) { 1408 - err = vma->vm_ops->may_split(vma, addr); 1409 - if (err) 1410 - return err; 1411 - } 1412 - 1413 - new = vm_area_dup(vma); 1414 - if (!new) 1415 - return -ENOMEM; 1416 - 1417 - if (new_below) { 1418 - new->vm_end = addr; 1419 - } else { 1420 - new->vm_start = addr; 1421 - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 1422 - } 1423 - 1424 - err = -ENOMEM; 1425 - vma_iter_config(vmi, new->vm_start, new->vm_end); 1426 - if (vma_iter_prealloc(vmi, new)) 1427 - goto out_free_vma; 1428 - 1429 - err = vma_dup_policy(vma, new); 1430 - if (err) 1431 - goto out_free_vmi; 1432 - 1433 - err = anon_vma_clone(new, vma); 1434 - if (err) 1435 - goto out_free_mpol; 1436 - 1437 - if (new->vm_file) 1438 - get_file(new->vm_file); 1439 - 1440 - if (new->vm_ops && new->vm_ops->open) 1441 - new->vm_ops->open(new); 1442 - 1443 - vma_start_write(vma); 1444 - vma_start_write(new); 1445 - 1446 - init_vma_prep(&vp, vma); 1447 - vp.insert = new; 1448 - vma_prepare(&vp); 1449 - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); 1450 - 1451 - if (new_below) { 1452 - vma->vm_start = addr; 1453 - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; 1454 - } else { 1455 - vma->vm_end = addr; 1456 - } 1457 - 1458 - /* vma_complete stores the new vma */ 1459 - vma_complete(&vp, vmi, vma->vm_mm); 1460 - 1461 - /* Success. 
*/ 1462 - if (new_below) 1463 - vma_next(vmi); 1464 - return 0; 1465 - 1466 - out_free_mpol: 1467 - mpol_put(vma_policy(new)); 1468 - out_free_vmi: 1469 - vma_iter_free(vmi); 1470 - out_free_vma: 1471 - vm_area_free(new); 1472 - return err; 1473 - } 1474 - 1475 - /* 1476 - * Split a vma into two pieces at address 'addr', a new vma is allocated 1477 - * either for the first part or the tail. 1478 - */ 1479 - static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 1480 - unsigned long addr, int new_below) 1481 - { 1482 - if (vma->vm_mm->map_count >= sysctl_max_map_count) 1483 - return -ENOMEM; 1484 - 1485 - return __split_vma(vmi, vma, addr, new_below); 1486 - } 1487 - 1488 - /* 1489 - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd 1490 - * context and anonymous VMA name within the range [start, end). 1491 - * 1492 - * As a result, we might be able to merge the newly modified VMA range with an 1493 - * adjacent VMA with identical properties. 1494 - * 1495 - * If no merge is possible and the range does not span the entirety of the VMA, 1496 - * we then need to split the VMA to accommodate the change. 1497 - * 1498 - * The function returns either the merged VMA, the original VMA if a split was 1499 - * required instead, or an error if the split failed. 
1500 - */ 1501 - struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1502 - struct vm_area_struct *prev, 1503 - struct vm_area_struct *vma, 1504 - unsigned long start, unsigned long end, 1505 - unsigned long vm_flags, 1506 - struct mempolicy *policy, 1507 - struct vm_userfaultfd_ctx uffd_ctx, 1508 - struct anon_vma_name *anon_name) 1509 - { 1510 - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 1511 - struct vm_area_struct *merged; 1512 - 1513 - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, 1514 - pgoff, policy, uffd_ctx, anon_name); 1515 - if (merged) 1516 - return merged; 1517 - 1518 - if (vma->vm_start < start) { 1519 - int err = split_vma(vmi, vma, start, 1); 1520 - 1521 - if (err) 1522 - return ERR_PTR(err); 1523 - } 1524 - 1525 - if (vma->vm_end > end) { 1526 - int err = split_vma(vmi, vma, end, 0); 1527 - 1528 - if (err) 1529 - return ERR_PTR(err); 1530 - } 1531 - 1532 - return vma; 1533 - } 1534 - 1535 - /* 1536 - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller 1537 - * must ensure that [start, end) does not overlap any existing VMA. 1538 - */ 1539 - static struct vm_area_struct 1540 - *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, 1541 - struct vm_area_struct *vma, unsigned long start, 1542 - unsigned long end, pgoff_t pgoff) 1543 - { 1544 - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, 1545 - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1546 - } 1547 - 1548 - /* 1549 - * Expand vma by delta bytes, potentially merging with an immediately adjacent 1550 - * VMA with identical properties. 1551 - */ 1552 - struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1553 - struct vm_area_struct *vma, 1554 - unsigned long delta) 1555 - { 1556 - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); 1557 - 1558 - /* vma is specified as prev, so case 1 or 2 will apply. 
*/ 1559 - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, 1560 - vma->vm_flags, pgoff, vma_policy(vma), 1561 - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1562 - } 1563 - 1564 - /* 1565 - * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 1566 - * @vmi: The vma iterator 1567 - * @vma: The starting vm_area_struct 1568 - * @mm: The mm_struct 1569 - * @start: The aligned start address to munmap. 1570 - * @end: The aligned end address to munmap. 1571 - * @uf: The userfaultfd list_head 1572 - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on 1573 - * success. 1574 - * 1575 - * Return: 0 on success and drops the lock if so directed, error and leaves the 1576 - * lock held otherwise. 1577 - */ 1578 - static int 1579 - do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, 1580 - struct mm_struct *mm, unsigned long start, 1581 - unsigned long end, struct list_head *uf, bool unlock) 1582 - { 1583 - struct vm_area_struct *prev, *next = NULL; 1584 - struct maple_tree mt_detach; 1585 - int count = 0; 1586 - int error = -ENOMEM; 1587 - unsigned long locked_vm = 0; 1588 - MA_STATE(mas_detach, &mt_detach, 0, 0); 1589 - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 1590 - mt_on_stack(mt_detach); 1591 - 1592 - /* 1593 - * If we need to split any vma, do it now to save pain later. 1594 - * 1595 - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 1596 - * unmapped vm_area_struct will remain in use: so lower split_vma 1597 - * places tmp vma above, and higher split_vma places tmp vma below. 1598 - */ 1599 - 1600 - /* Does it split the first one? */ 1601 - if (start > vma->vm_start) { 1602 - 1603 - /* 1604 - * Make sure that map_count on return from munmap() will 1605 - * not exceed its limit; but let map_count go just above 1606 - * its limit temporarily, to help free resources as expected. 
1607 - */ 1608 - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 1609 - goto map_count_exceeded; 1610 - 1611 - error = __split_vma(vmi, vma, start, 1); 1612 - if (error) 1613 - goto start_split_failed; 1614 - } 1615 - 1616 - /* 1617 - * Detach a range of VMAs from the mm. Using next as a temp variable as 1618 - * it is always overwritten. 1619 - */ 1620 - next = vma; 1621 - do { 1622 - /* Does it split the end? */ 1623 - if (next->vm_end > end) { 1624 - error = __split_vma(vmi, next, end, 0); 1625 - if (error) 1626 - goto end_split_failed; 1627 - } 1628 - vma_start_write(next); 1629 - mas_set(&mas_detach, count); 1630 - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); 1631 - if (error) 1632 - goto munmap_gather_failed; 1633 - vma_mark_detached(next, true); 1634 - if (next->vm_flags & VM_LOCKED) 1635 - locked_vm += vma_pages(next); 1636 - 1637 - count++; 1638 - if (unlikely(uf)) { 1639 - /* 1640 - * If userfaultfd_unmap_prep returns an error the vmas 1641 - * will remain split, but userland will get a 1642 - * highly unexpected error anyway. This is no 1643 - * different than the case where the first of the two 1644 - * __split_vma fails, but we don't undo the first 1645 - * split, despite we could. This is unlikely enough 1646 - * failure that it's not worth optimizing it for. 1647 - */ 1648 - error = userfaultfd_unmap_prep(next, start, end, uf); 1649 - 1650 - if (error) 1651 - goto userfaultfd_error; 1652 - } 1653 - #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 1654 - BUG_ON(next->vm_start < start); 1655 - BUG_ON(next->vm_start > end); 1656 - #endif 1657 - } for_each_vma_range(*vmi, next, end); 1658 - 1659 - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 1660 - /* Make sure no VMAs are about to be lost. 
*/ 1661 - { 1662 - MA_STATE(test, &mt_detach, 0, 0); 1663 - struct vm_area_struct *vma_mas, *vma_test; 1664 - int test_count = 0; 1665 - 1666 - vma_iter_set(vmi, start); 1667 - rcu_read_lock(); 1668 - vma_test = mas_find(&test, count - 1); 1669 - for_each_vma_range(*vmi, vma_mas, end) { 1670 - BUG_ON(vma_mas != vma_test); 1671 - test_count++; 1672 - vma_test = mas_next(&test, count - 1); 1673 - } 1674 - rcu_read_unlock(); 1675 - BUG_ON(count != test_count); 1676 - } 1677 - #endif 1678 - 1679 - while (vma_iter_addr(vmi) > start) 1680 - vma_iter_prev_range(vmi); 1681 - 1682 - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); 1683 - if (error) 1684 - goto clear_tree_failed; 1685 - 1686 - /* Point of no return */ 1687 - mm->locked_vm -= locked_vm; 1688 - mm->map_count -= count; 1689 - if (unlock) 1690 - mmap_write_downgrade(mm); 1691 - 1692 - prev = vma_iter_prev_range(vmi); 1693 - next = vma_next(vmi); 1694 - if (next) 1695 - vma_iter_prev_range(vmi); 1696 - 1697 - /* 1698 - * We can free page tables without write-locking mmap_lock because VMAs 1699 - * were isolated before we downgraded mmap_lock. 1700 - */ 1701 - mas_set(&mas_detach, 1); 1702 - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, 1703 - !unlock); 1704 - /* Statistics and freeing VMAs */ 1705 - mas_set(&mas_detach, 0); 1706 - remove_mt(mm, &mas_detach); 1707 - validate_mm(mm); 1708 - if (unlock) 1709 - mmap_read_unlock(mm); 1710 - 1711 - __mt_destroy(&mt_detach); 1712 - return 0; 1713 - 1714 - clear_tree_failed: 1715 - userfaultfd_error: 1716 - munmap_gather_failed: 1717 - end_split_failed: 1718 - mas_set(&mas_detach, 0); 1719 - mas_for_each(&mas_detach, next, end) 1720 - vma_mark_detached(next, false); 1721 - 1722 - __mt_destroy(&mt_detach); 1723 - start_split_failed: 1724 - map_count_exceeded: 1725 - validate_mm(mm); 1726 - return error; 1727 - } 1728 - 1729 - /* 1730 - * do_vmi_munmap() - munmap a given range. 
1731 - * @vmi: The vma iterator 1732 - * @mm: The mm_struct 1733 - * @start: The start address to munmap 1734 - * @len: The length of the range to munmap 1735 - * @uf: The userfaultfd list_head 1736 - * @unlock: set to true if the user wants to drop the mmap_lock on success 1737 - * 1738 - * This function takes a @mas that is either pointing to the previous VMA or set 1739 - * to MA_START and sets it up to remove the mapping(s). The @len will be 1740 - * aligned and any arch_unmap work will be preformed. 1741 - * 1742 - * Return: 0 on success and drops the lock if so directed, error and leaves the 1743 - * lock held otherwise. 1744 - */ 1745 - int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 1746 - unsigned long start, size_t len, struct list_head *uf, 1747 - bool unlock) 1748 - { 1749 - unsigned long end; 1750 - struct vm_area_struct *vma; 1751 - 1752 - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) 1753 - return -EINVAL; 1754 - 1755 - end = start + PAGE_ALIGN(len); 1756 - if (end == start) 1757 - return -EINVAL; 1758 - 1759 - /* 1760 - * Check if memory is sealed before arch_unmap. 1761 - * Prevent unmapping a sealed VMA. 1762 - * can_modify_mm assumes we have acquired the lock on MM. 1763 - */ 1764 - if (unlikely(!can_modify_mm(mm, start, end))) 1765 - return -EPERM; 1766 - 1767 - /* arch_unmap() might do unmaps itself. */ 1768 - arch_unmap(mm, start, end); 1769 - 1770 - /* Find the first overlapping VMA */ 1771 - vma = vma_find(vmi, end); 1772 - if (!vma) { 1773 - if (unlock) 1774 - mmap_write_unlock(mm); 1775 - return 0; 1776 - } 1777 - 1778 - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); 1779 - } 1780 - 1781 2396 /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. 
1782 2397 * @mm: The mm_struct 1783 2398 * @start: The start address to munmap ··· 2002 3491 } 2003 3492 2004 3493 /* 2005 - * Copy the vma structure to a new location in the same mm, 2006 - * prior to moving page table entries, to effect an mremap move. 2007 - */ 2008 - struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2009 - unsigned long addr, unsigned long len, pgoff_t pgoff, 2010 - bool *need_rmap_locks) 2011 - { 2012 - struct vm_area_struct *vma = *vmap; 2013 - unsigned long vma_start = vma->vm_start; 2014 - struct mm_struct *mm = vma->vm_mm; 2015 - struct vm_area_struct *new_vma, *prev; 2016 - bool faulted_in_anon_vma = true; 2017 - VMA_ITERATOR(vmi, mm, addr); 2018 - 2019 - /* 2020 - * If anonymous vma has not yet been faulted, update new pgoff 2021 - * to match new location, to increase its chance of merging. 2022 - */ 2023 - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { 2024 - pgoff = addr >> PAGE_SHIFT; 2025 - faulted_in_anon_vma = false; 2026 - } 2027 - 2028 - new_vma = find_vma_prev(mm, addr, &prev); 2029 - if (new_vma && new_vma->vm_start < addr + len) 2030 - return NULL; /* should never get here */ 2031 - 2032 - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); 2033 - if (new_vma) { 2034 - /* 2035 - * Source vma may have been merged into new_vma 2036 - */ 2037 - if (unlikely(vma_start >= new_vma->vm_start && 2038 - vma_start < new_vma->vm_end)) { 2039 - /* 2040 - * The only way we can get a vma_merge with 2041 - * self during an mremap is if the vma hasn't 2042 - * been faulted in yet and we were allowed to 2043 - * reset the dst vma->vm_pgoff to the 2044 - * destination address of the mremap to allow 2045 - * the merge to happen. mremap must change the 2046 - * vm_pgoff linearity between src and dst vmas 2047 - * (in turn preventing a vma_merge) to be 2048 - * safe. It is only safe to keep the vm_pgoff 2049 - * linear if there are no pages mapped yet. 
2050 - */ 2051 - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 2052 - *vmap = vma = new_vma; 2053 - } 2054 - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2055 - } else { 2056 - new_vma = vm_area_dup(vma); 2057 - if (!new_vma) 2058 - goto out; 2059 - vma_set_range(new_vma, addr, addr + len, pgoff); 2060 - if (vma_dup_policy(vma, new_vma)) 2061 - goto out_free_vma; 2062 - if (anon_vma_clone(new_vma, vma)) 2063 - goto out_free_mempol; 2064 - if (new_vma->vm_file) 2065 - get_file(new_vma->vm_file); 2066 - if (new_vma->vm_ops && new_vma->vm_ops->open) 2067 - new_vma->vm_ops->open(new_vma); 2068 - if (vma_link(mm, new_vma)) 2069 - goto out_vma_link; 2070 - *need_rmap_locks = false; 2071 - } 2072 - return new_vma; 2073 - 2074 - out_vma_link: 2075 - if (new_vma->vm_ops && new_vma->vm_ops->close) 2076 - new_vma->vm_ops->close(new_vma); 2077 - 2078 - if (new_vma->vm_file) 2079 - fput(new_vma->vm_file); 2080 - 2081 - unlink_anon_vmas(new_vma); 2082 - out_free_mempol: 2083 - mpol_put(vma_policy(new_vma)); 2084 - out_free_vma: 2085 - vm_area_free(new_vma); 2086 - out: 2087 - return NULL; 2088 - } 2089 - 2090 - /* 2091 3494 * Return true if the calling process may expand its vm space by the passed 2092 3495 * number of pages 2093 3496 */ ··· 2196 3771 &legacy_special_mapping_vmops); 2197 3772 2198 3773 return PTR_ERR_OR_ZERO(vma); 2199 - } 2200 - 2201 - static DEFINE_MUTEX(mm_all_locks_mutex); 2202 - 2203 - static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2204 - { 2205 - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2206 - /* 2207 - * The LSB of head.next can't change from under us 2208 - * because we hold the mm_all_locks_mutex. 2209 - */ 2210 - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 2211 - /* 2212 - * We can safely modify head.next after taking the 2213 - * anon_vma->root->rwsem. If some other vma in this mm shares 2214 - * the same anon_vma we won't take it again. 
2215 - * 2216 - * No need of atomic instructions here, head.next 2217 - * can't change from under us thanks to the 2218 - * anon_vma->root->rwsem. 2219 - */ 2220 - if (__test_and_set_bit(0, (unsigned long *) 2221 - &anon_vma->root->rb_root.rb_root.rb_node)) 2222 - BUG(); 2223 - } 2224 - } 2225 - 2226 - static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 2227 - { 2228 - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2229 - /* 2230 - * AS_MM_ALL_LOCKS can't change from under us because 2231 - * we hold the mm_all_locks_mutex. 2232 - * 2233 - * Operations on ->flags have to be atomic because 2234 - * even if AS_MM_ALL_LOCKS is stable thanks to the 2235 - * mm_all_locks_mutex, there may be other cpus 2236 - * changing other bitflags in parallel to us. 2237 - */ 2238 - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2239 - BUG(); 2240 - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 2241 - } 2242 - } 2243 - 2244 - /* 2245 - * This operation locks against the VM for all pte/vma/mm related 2246 - * operations that could ever happen on a certain mm. This includes 2247 - * vmtruncate, try_to_unmap, and all page faults. 2248 - * 2249 - * The caller must take the mmap_lock in write mode before calling 2250 - * mm_take_all_locks(). The caller isn't allowed to release the 2251 - * mmap_lock until mm_drop_all_locks() returns. 2252 - * 2253 - * mmap_lock in write mode is required in order to block all operations 2254 - * that could modify pagetables and free pages without need of 2255 - * altering the vma layout. It's also needed in write mode to avoid new 2256 - * anon_vmas to be associated with existing vmas. 2257 - * 2258 - * A single task can't take more than one mm_take_all_locks() in a row 2259 - * or it would deadlock. 
2260 - * 2261 - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 2262 - * mapping->flags avoid to take the same lock twice, if more than one 2263 - * vma in this mm is backed by the same anon_vma or address_space. 2264 - * 2265 - * We take locks in following order, accordingly to comment at beginning 2266 - * of mm/rmap.c: 2267 - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for 2268 - * hugetlb mapping); 2269 - * - all vmas marked locked 2270 - * - all i_mmap_rwsem locks; 2271 - * - all anon_vma->rwseml 2272 - * 2273 - * We can take all locks within these types randomly because the VM code 2274 - * doesn't nest them and we protected from parallel mm_take_all_locks() by 2275 - * mm_all_locks_mutex. 2276 - * 2277 - * mm_take_all_locks() and mm_drop_all_locks are expensive operations 2278 - * that may have to take thousand of locks. 2279 - * 2280 - * mm_take_all_locks() can fail if it's interrupted by signals. 2281 - */ 2282 - int mm_take_all_locks(struct mm_struct *mm) 2283 - { 2284 - struct vm_area_struct *vma; 2285 - struct anon_vma_chain *avc; 2286 - VMA_ITERATOR(vmi, mm, 0); 2287 - 2288 - mmap_assert_write_locked(mm); 2289 - 2290 - mutex_lock(&mm_all_locks_mutex); 2291 - 2292 - /* 2293 - * vma_start_write() does not have a complement in mm_drop_all_locks() 2294 - * because vma_start_write() is always asymmetrical; it marks a VMA as 2295 - * being written to until mmap_write_unlock() or mmap_write_downgrade() 2296 - * is reached. 
2297 - */ 2298 - for_each_vma(vmi, vma) { 2299 - if (signal_pending(current)) 2300 - goto out_unlock; 2301 - vma_start_write(vma); 2302 - } 2303 - 2304 - vma_iter_init(&vmi, mm, 0); 2305 - for_each_vma(vmi, vma) { 2306 - if (signal_pending(current)) 2307 - goto out_unlock; 2308 - if (vma->vm_file && vma->vm_file->f_mapping && 2309 - is_vm_hugetlb_page(vma)) 2310 - vm_lock_mapping(mm, vma->vm_file->f_mapping); 2311 - } 2312 - 2313 - vma_iter_init(&vmi, mm, 0); 2314 - for_each_vma(vmi, vma) { 2315 - if (signal_pending(current)) 2316 - goto out_unlock; 2317 - if (vma->vm_file && vma->vm_file->f_mapping && 2318 - !is_vm_hugetlb_page(vma)) 2319 - vm_lock_mapping(mm, vma->vm_file->f_mapping); 2320 - } 2321 - 2322 - vma_iter_init(&vmi, mm, 0); 2323 - for_each_vma(vmi, vma) { 2324 - if (signal_pending(current)) 2325 - goto out_unlock; 2326 - if (vma->anon_vma) 2327 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2328 - vm_lock_anon_vma(mm, avc->anon_vma); 2329 - } 2330 - 2331 - return 0; 2332 - 2333 - out_unlock: 2334 - mm_drop_all_locks(mm); 2335 - return -EINTR; 2336 - } 2337 - 2338 - static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2339 - { 2340 - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 2341 - /* 2342 - * The LSB of head.next can't change to 0 from under 2343 - * us because we hold the mm_all_locks_mutex. 2344 - * 2345 - * We must however clear the bitflag before unlocking 2346 - * the vma so the users using the anon_vma->rb_root will 2347 - * never see our bitflag. 2348 - * 2349 - * No need of atomic instructions here, head.next 2350 - * can't change from under us until we release the 2351 - * anon_vma->root->rwsem. 
2352 - */ 2353 - if (!__test_and_clear_bit(0, (unsigned long *) 2354 - &anon_vma->root->rb_root.rb_root.rb_node)) 2355 - BUG(); 2356 - anon_vma_unlock_write(anon_vma); 2357 - } 2358 - } 2359 - 2360 - static void vm_unlock_mapping(struct address_space *mapping) 2361 - { 2362 - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2363 - /* 2364 - * AS_MM_ALL_LOCKS can't change to 0 from under us 2365 - * because we hold the mm_all_locks_mutex. 2366 - */ 2367 - i_mmap_unlock_write(mapping); 2368 - if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2369 - &mapping->flags)) 2370 - BUG(); 2371 - } 2372 - } 2373 - 2374 - /* 2375 - * The mmap_lock cannot be released by the caller until 2376 - * mm_drop_all_locks() returns. 2377 - */ 2378 - void mm_drop_all_locks(struct mm_struct *mm) 2379 - { 2380 - struct vm_area_struct *vma; 2381 - struct anon_vma_chain *avc; 2382 - VMA_ITERATOR(vmi, mm, 0); 2383 - 2384 - mmap_assert_write_locked(mm); 2385 - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2386 - 2387 - for_each_vma(vmi, vma) { 2388 - if (vma->anon_vma) 2389 - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 2390 - vm_unlock_anon_vma(avc->anon_vma); 2391 - if (vma->vm_file && vma->vm_file->f_mapping) 2392 - vm_unlock_mapping(vma->vm_file->f_mapping); 2393 - } 2394 - 2395 - mutex_unlock(&mm_all_locks_mutex); 2396 3774 } 2397 3775 2398 3776 /*
+2
mm/mmu_notifier.c
··· 19 19 #include <linux/sched/mm.h> 20 20 #include <linux/slab.h> 21 21 22 + #include "vma.h" 23 + 22 24 /* global SRCU for all MMs */ 23 25 DEFINE_STATIC_SRCU(srcu); 24 26
+1766
mm/vma.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + /* 4 + * VMA-specific functions. 5 + */ 6 + 7 + #include "vma_internal.h" 8 + #include "vma.h" 9 + 10 + /* 11 + * If the vma has a ->close operation then the driver probably needs to release 12 + * per-vma resources, so we don't attempt to merge those if the caller indicates 13 + * the current vma may be removed as part of the merge. 14 + */ 15 + static inline bool is_mergeable_vma(struct vm_area_struct *vma, 16 + struct file *file, unsigned long vm_flags, 17 + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 18 + struct anon_vma_name *anon_name, bool may_remove_vma) 19 + { 20 + /* 21 + * VM_SOFTDIRTY should not prevent from VMA merging, if we 22 + * match the flags but dirty bit -- the caller should mark 23 + * merged VMA as dirty. If dirty bit won't be excluded from 24 + * comparison, we increase pressure on the memory system forcing 25 + * the kernel to generate new VMAs when old one could be 26 + * extended instead. 27 + */ 28 + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) 29 + return false; 30 + if (vma->vm_file != file) 31 + return false; 32 + if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) 33 + return false; 34 + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) 35 + return false; 36 + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) 37 + return false; 38 + return true; 39 + } 40 + 41 + static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, 42 + struct anon_vma *anon_vma2, struct vm_area_struct *vma) 43 + { 44 + /* 45 + * The list_is_singular() test is to avoid merging VMA cloned from 46 + * parents. This can improve scalability caused by anon_vma lock. 
47 + */ 48 + if ((!anon_vma1 || !anon_vma2) && (!vma || 49 + list_is_singular(&vma->anon_vma_chain))) 50 + return true; 51 + return anon_vma1 == anon_vma2; 52 + } 53 + 54 + /* 55 + * init_multi_vma_prep() - Initializer for struct vma_prepare 56 + * @vp: The vma_prepare struct 57 + * @vma: The vma that will be altered once locked 58 + * @next: The next vma if it is to be adjusted 59 + * @remove: The first vma to be removed 60 + * @remove2: The second vma to be removed 61 + */ 62 + static void init_multi_vma_prep(struct vma_prepare *vp, 63 + struct vm_area_struct *vma, 64 + struct vm_area_struct *next, 65 + struct vm_area_struct *remove, 66 + struct vm_area_struct *remove2) 67 + { 68 + memset(vp, 0, sizeof(struct vma_prepare)); 69 + vp->vma = vma; 70 + vp->anon_vma = vma->anon_vma; 71 + vp->remove = remove; 72 + vp->remove2 = remove2; 73 + vp->adj_next = next; 74 + if (!vp->anon_vma && next) 75 + vp->anon_vma = next->anon_vma; 76 + 77 + vp->file = vma->vm_file; 78 + if (vp->file) 79 + vp->mapping = vma->vm_file->f_mapping; 80 + 81 + } 82 + 83 + /* 84 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 85 + * in front of (at a lower virtual address and file offset than) the vma. 86 + * 87 + * We cannot merge two vmas if they have differently assigned (non-NULL) 88 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 89 + * 90 + * We don't check here for the merged mmap wrapping around the end of pagecache 91 + * indices (16TB on ia32) because do_mmap() does not permit mmap's which 92 + * wrap, nor mmaps which cover the final page at index -1UL. 93 + * 94 + * We assume the vma may be removed as part of the merge. 
95 + */ 96 + bool 97 + can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 98 + struct anon_vma *anon_vma, struct file *file, 99 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 100 + struct anon_vma_name *anon_name) 101 + { 102 + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && 103 + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 104 + if (vma->vm_pgoff == vm_pgoff) 105 + return true; 106 + } 107 + return false; 108 + } 109 + 110 + /* 111 + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 112 + * beyond (at a higher virtual address and file offset than) the vma. 113 + * 114 + * We cannot merge two vmas if they have differently assigned (non-NULL) 115 + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 116 + * 117 + * We assume that vma is not removed as part of the merge. 118 + */ 119 + bool 120 + can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 121 + struct anon_vma *anon_vma, struct file *file, 122 + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 123 + struct anon_vma_name *anon_name) 124 + { 125 + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && 126 + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 127 + pgoff_t vm_pglen; 128 + 129 + vm_pglen = vma_pages(vma); 130 + if (vma->vm_pgoff + vm_pglen == vm_pgoff) 131 + return true; 132 + } 133 + return false; 134 + } 135 + 136 + /* 137 + * Close a vm structure and free it. 138 + */ 139 + void remove_vma(struct vm_area_struct *vma, bool unreachable) 140 + { 141 + might_sleep(); 142 + if (vma->vm_ops && vma->vm_ops->close) 143 + vma->vm_ops->close(vma); 144 + if (vma->vm_file) 145 + fput(vma->vm_file); 146 + mpol_put(vma_policy(vma)); 147 + if (unreachable) 148 + __vm_area_free(vma); 149 + else 150 + vm_area_free(vma); 151 + } 152 + 153 + /* 154 + * Get rid of page table information in the indicated region. 
155 + * 156 + * Called with the mm semaphore held. 157 + */ 158 + void unmap_region(struct mm_struct *mm, struct ma_state *mas, 159 + struct vm_area_struct *vma, struct vm_area_struct *prev, 160 + struct vm_area_struct *next, unsigned long start, 161 + unsigned long end, unsigned long tree_end, bool mm_wr_locked) 162 + { 163 + struct mmu_gather tlb; 164 + unsigned long mt_start = mas->index; 165 + 166 + lru_add_drain(); 167 + tlb_gather_mmu(&tlb, mm); 168 + update_hiwater_rss(mm); 169 + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); 170 + mas_set(mas, mt_start); 171 + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 172 + next ? next->vm_start : USER_PGTABLES_CEILING, 173 + mm_wr_locked); 174 + tlb_finish_mmu(&tlb); 175 + } 176 + 177 + /* 178 + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it 179 + * has already been checked or doesn't make sense to fail. 180 + * VMA Iterator will point to the end VMA. 181 + */ 182 + static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 183 + unsigned long addr, int new_below) 184 + { 185 + struct vma_prepare vp; 186 + struct vm_area_struct *new; 187 + int err; 188 + 189 + WARN_ON(vma->vm_start >= addr); 190 + WARN_ON(vma->vm_end <= addr); 191 + 192 + if (vma->vm_ops && vma->vm_ops->may_split) { 193 + err = vma->vm_ops->may_split(vma, addr); 194 + if (err) 195 + return err; 196 + } 197 + 198 + new = vm_area_dup(vma); 199 + if (!new) 200 + return -ENOMEM; 201 + 202 + if (new_below) { 203 + new->vm_end = addr; 204 + } else { 205 + new->vm_start = addr; 206 + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 207 + } 208 + 209 + err = -ENOMEM; 210 + vma_iter_config(vmi, new->vm_start, new->vm_end); 211 + if (vma_iter_prealloc(vmi, new)) 212 + goto out_free_vma; 213 + 214 + err = vma_dup_policy(vma, new); 215 + if (err) 216 + goto out_free_vmi; 217 + 218 + err = anon_vma_clone(new, vma); 219 + if (err) 220 + goto out_free_mpol; 221 + 222 + 
if (new->vm_file) 223 + get_file(new->vm_file); 224 + 225 + if (new->vm_ops && new->vm_ops->open) 226 + new->vm_ops->open(new); 227 + 228 + vma_start_write(vma); 229 + vma_start_write(new); 230 + 231 + init_vma_prep(&vp, vma); 232 + vp.insert = new; 233 + vma_prepare(&vp); 234 + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); 235 + 236 + if (new_below) { 237 + vma->vm_start = addr; 238 + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; 239 + } else { 240 + vma->vm_end = addr; 241 + } 242 + 243 + /* vma_complete stores the new vma */ 244 + vma_complete(&vp, vmi, vma->vm_mm); 245 + 246 + /* Success. */ 247 + if (new_below) 248 + vma_next(vmi); 249 + return 0; 250 + 251 + out_free_mpol: 252 + mpol_put(vma_policy(new)); 253 + out_free_vmi: 254 + vma_iter_free(vmi); 255 + out_free_vma: 256 + vm_area_free(new); 257 + return err; 258 + } 259 + 260 + /* 261 + * Split a vma into two pieces at address 'addr', a new vma is allocated 262 + * either for the first part or the tail. 263 + */ 264 + static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, 265 + unsigned long addr, int new_below) 266 + { 267 + if (vma->vm_mm->map_count >= sysctl_max_map_count) 268 + return -ENOMEM; 269 + 270 + return __split_vma(vmi, vma, addr, new_below); 271 + } 272 + 273 + /* 274 + * Ok - we have the memory areas we should free on a maple tree so release them, 275 + * and do the vma updates. 276 + * 277 + * Called with the mm semaphore held. 
278 + */ 279 + static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) 280 + { 281 + unsigned long nr_accounted = 0; 282 + struct vm_area_struct *vma; 283 + 284 + /* Update high watermark before we lower total_vm */ 285 + update_hiwater_vm(mm); 286 + mas_for_each(mas, vma, ULONG_MAX) { 287 + long nrpages = vma_pages(vma); 288 + 289 + if (vma->vm_flags & VM_ACCOUNT) 290 + nr_accounted += nrpages; 291 + vm_stat_account(mm, vma->vm_flags, -nrpages); 292 + remove_vma(vma, false); 293 + } 294 + vm_unacct_memory(nr_accounted); 295 + } 296 + 297 + /* 298 + * init_vma_prep() - Initializer wrapper for vma_prepare struct 299 + * @vp: The vma_prepare struct 300 + * @vma: The vma that will be altered once locked 301 + */ 302 + void init_vma_prep(struct vma_prepare *vp, 303 + struct vm_area_struct *vma) 304 + { 305 + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); 306 + } 307 + 308 + /* 309 + * Requires inode->i_mapping->i_mmap_rwsem 310 + */ 311 + static void __remove_shared_vm_struct(struct vm_area_struct *vma, 312 + struct address_space *mapping) 313 + { 314 + if (vma_is_shared_maywrite(vma)) 315 + mapping_unmap_writable(mapping); 316 + 317 + flush_dcache_mmap_lock(mapping); 318 + vma_interval_tree_remove(vma, &mapping->i_mmap); 319 + flush_dcache_mmap_unlock(mapping); 320 + } 321 + 322 + /* 323 + * vma has some anon_vma assigned, and is already inserted on that 324 + * anon_vma's interval trees. 325 + * 326 + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 327 + * vma must be removed from the anon_vma's interval trees using 328 + * anon_vma_interval_tree_pre_update_vma(). 329 + * 330 + * After the update, the vma will be reinserted using 331 + * anon_vma_interval_tree_post_update_vma(). 332 + * 333 + * The entire update must be protected by exclusive mmap_lock and by 334 + * the root anon_vma's mutex. 
335 + */ 336 + void 337 + anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 338 + { 339 + struct anon_vma_chain *avc; 340 + 341 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 342 + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 343 + } 344 + 345 + void 346 + anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 347 + { 348 + struct anon_vma_chain *avc; 349 + 350 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 351 + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 352 + } 353 + 354 + static void __vma_link_file(struct vm_area_struct *vma, 355 + struct address_space *mapping) 356 + { 357 + if (vma_is_shared_maywrite(vma)) 358 + mapping_allow_writable(mapping); 359 + 360 + flush_dcache_mmap_lock(mapping); 361 + vma_interval_tree_insert(vma, &mapping->i_mmap); 362 + flush_dcache_mmap_unlock(mapping); 363 + } 364 + 365 + /* 366 + * vma_prepare() - Helper function for handling locking VMAs prior to altering 367 + * @vp: The initialized vma_prepare struct 368 + */ 369 + void vma_prepare(struct vma_prepare *vp) 370 + { 371 + if (vp->file) { 372 + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); 373 + 374 + if (vp->adj_next) 375 + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, 376 + vp->adj_next->vm_end); 377 + 378 + i_mmap_lock_write(vp->mapping); 379 + if (vp->insert && vp->insert->vm_file) { 380 + /* 381 + * Put into interval tree now, so instantiated pages 382 + * are visible to arm/parisc __flush_dcache_page 383 + * throughout; but we cannot insert into address 384 + * space until vma start or end is updated. 
385 + */ 386 + __vma_link_file(vp->insert, 387 + vp->insert->vm_file->f_mapping); 388 + } 389 + } 390 + 391 + if (vp->anon_vma) { 392 + anon_vma_lock_write(vp->anon_vma); 393 + anon_vma_interval_tree_pre_update_vma(vp->vma); 394 + if (vp->adj_next) 395 + anon_vma_interval_tree_pre_update_vma(vp->adj_next); 396 + } 397 + 398 + if (vp->file) { 399 + flush_dcache_mmap_lock(vp->mapping); 400 + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); 401 + if (vp->adj_next) 402 + vma_interval_tree_remove(vp->adj_next, 403 + &vp->mapping->i_mmap); 404 + } 405 + 406 + } 407 + 408 + /* 409 + * dup_anon_vma() - Helper function to duplicate anon_vma 410 + * @dst: The destination VMA 411 + * @src: The source VMA 412 + * @dup: Pointer to the destination VMA when successful. 413 + * 414 + * Returns: 0 on success. 415 + */ 416 + static int dup_anon_vma(struct vm_area_struct *dst, 417 + struct vm_area_struct *src, struct vm_area_struct **dup) 418 + { 419 + /* 420 + * Easily overlooked: when mprotect shifts the boundary, make sure the 421 + * expanding vma has anon_vma set if the shrinking vma had, to cover any 422 + * anon pages imported. 
423 + */ 424 + if (src->anon_vma && !dst->anon_vma) { 425 + int ret; 426 + 427 + vma_assert_write_locked(dst); 428 + dst->anon_vma = src->anon_vma; 429 + ret = anon_vma_clone(dst, src); 430 + if (ret) 431 + return ret; 432 + 433 + *dup = dst; 434 + } 435 + 436 + return 0; 437 + } 438 + 439 + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 440 + void validate_mm(struct mm_struct *mm) 441 + { 442 + int bug = 0; 443 + int i = 0; 444 + struct vm_area_struct *vma; 445 + VMA_ITERATOR(vmi, mm, 0); 446 + 447 + mt_validate(&mm->mm_mt); 448 + for_each_vma(vmi, vma) { 449 + #ifdef CONFIG_DEBUG_VM_RB 450 + struct anon_vma *anon_vma = vma->anon_vma; 451 + struct anon_vma_chain *avc; 452 + #endif 453 + unsigned long vmi_start, vmi_end; 454 + bool warn = 0; 455 + 456 + vmi_start = vma_iter_addr(&vmi); 457 + vmi_end = vma_iter_end(&vmi); 458 + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) 459 + warn = 1; 460 + 461 + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) 462 + warn = 1; 463 + 464 + if (warn) { 465 + pr_emerg("issue in %s\n", current->comm); 466 + dump_stack(); 467 + dump_vma(vma); 468 + pr_emerg("tree range: %px start %lx end %lx\n", vma, 469 + vmi_start, vmi_end - 1); 470 + vma_iter_dump_tree(&vmi); 471 + } 472 + 473 + #ifdef CONFIG_DEBUG_VM_RB 474 + if (anon_vma) { 475 + anon_vma_lock_read(anon_vma); 476 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 477 + anon_vma_interval_tree_verify(avc); 478 + anon_vma_unlock_read(anon_vma); 479 + } 480 + #endif 481 + i++; 482 + } 483 + if (i != mm->map_count) { 484 + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); 485 + bug = 1; 486 + } 487 + VM_BUG_ON_MM(bug, mm); 488 + } 489 + #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ 490 + 491 + /* 492 + * vma_expand - Expand an existing VMA 493 + * 494 + * @vmi: The vma iterator 495 + * @vma: The vma to expand 496 + * @start: The start of the vma 497 + * @end: The exclusive end of the vma 498 + * @pgoff: The page offset of vma 499 + * @next: The current of next vma. 
500 + * 501 + * Expand @vma to @start and @end. Can expand off the start and end. Will 502 + * expand over @next if it's different from @vma and @end == @next->vm_end. 503 + * Checking if the @vma can expand and merge with @next needs to be handled by 504 + * the caller. 505 + * 506 + * Returns: 0 on success 507 + */ 508 + int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, 509 + unsigned long start, unsigned long end, pgoff_t pgoff, 510 + struct vm_area_struct *next) 511 + { 512 + struct vm_area_struct *anon_dup = NULL; 513 + bool remove_next = false; 514 + struct vma_prepare vp; 515 + 516 + vma_start_write(vma); 517 + if (next && (vma != next) && (end == next->vm_end)) { 518 + int ret; 519 + 520 + remove_next = true; 521 + vma_start_write(next); 522 + ret = dup_anon_vma(vma, next, &anon_dup); 523 + if (ret) 524 + return ret; 525 + } 526 + 527 + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); 528 + /* Not merging but overwriting any part of next is not handled. 
*/ 529 + VM_WARN_ON(next && !vp.remove && 530 + next != vma && end > next->vm_start); 531 + /* Only handles expanding */ 532 + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); 533 + 534 + /* Note: vma iterator must be pointing to 'start' */ 535 + vma_iter_config(vmi, start, end); 536 + if (vma_iter_prealloc(vmi, vma)) 537 + goto nomem; 538 + 539 + vma_prepare(&vp); 540 + vma_adjust_trans_huge(vma, start, end, 0); 541 + vma_set_range(vma, start, end, pgoff); 542 + vma_iter_store(vmi, vma); 543 + 544 + vma_complete(&vp, vmi, vma->vm_mm); 545 + return 0; 546 + 547 + nomem: 548 + if (anon_dup) 549 + unlink_anon_vmas(anon_dup); 550 + return -ENOMEM; 551 + } 552 + 553 + /* 554 + * vma_shrink() - Reduce an existing VMAs memory area 555 + * @vmi: The vma iterator 556 + * @vma: The VMA to modify 557 + * @start: The new start 558 + * @end: The new end 559 + * 560 + * Returns: 0 on success, -ENOMEM otherwise 561 + */ 562 + int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, 563 + unsigned long start, unsigned long end, pgoff_t pgoff) 564 + { 565 + struct vma_prepare vp; 566 + 567 + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); 568 + 569 + if (vma->vm_start < start) 570 + vma_iter_config(vmi, vma->vm_start, start); 571 + else 572 + vma_iter_config(vmi, end, vma->vm_end); 573 + 574 + if (vma_iter_prealloc(vmi, NULL)) 575 + return -ENOMEM; 576 + 577 + vma_start_write(vma); 578 + 579 + init_vma_prep(&vp, vma); 580 + vma_prepare(&vp); 581 + vma_adjust_trans_huge(vma, start, end, 0); 582 + 583 + vma_iter_clear(vmi); 584 + vma_set_range(vma, start, end, pgoff); 585 + vma_complete(&vp, vmi, vma->vm_mm); 586 + return 0; 587 + } 588 + 589 + /* 590 + * vma_complete- Helper function for handling the unlocking after altering VMAs, 591 + * or for inserting a VMA. 
592 + * 593 + * @vp: The vma_prepare struct 594 + * @vmi: The vma iterator 595 + * @mm: The mm_struct 596 + */ 597 + void vma_complete(struct vma_prepare *vp, 598 + struct vma_iterator *vmi, struct mm_struct *mm) 599 + { 600 + if (vp->file) { 601 + if (vp->adj_next) 602 + vma_interval_tree_insert(vp->adj_next, 603 + &vp->mapping->i_mmap); 604 + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); 605 + flush_dcache_mmap_unlock(vp->mapping); 606 + } 607 + 608 + if (vp->remove && vp->file) { 609 + __remove_shared_vm_struct(vp->remove, vp->mapping); 610 + if (vp->remove2) 611 + __remove_shared_vm_struct(vp->remove2, vp->mapping); 612 + } else if (vp->insert) { 613 + /* 614 + * split_vma has split insert from vma, and needs 615 + * us to insert it before dropping the locks 616 + * (it may either follow vma or precede it). 617 + */ 618 + vma_iter_store(vmi, vp->insert); 619 + mm->map_count++; 620 + } 621 + 622 + if (vp->anon_vma) { 623 + anon_vma_interval_tree_post_update_vma(vp->vma); 624 + if (vp->adj_next) 625 + anon_vma_interval_tree_post_update_vma(vp->adj_next); 626 + anon_vma_unlock_write(vp->anon_vma); 627 + } 628 + 629 + if (vp->file) { 630 + i_mmap_unlock_write(vp->mapping); 631 + uprobe_mmap(vp->vma); 632 + 633 + if (vp->adj_next) 634 + uprobe_mmap(vp->adj_next); 635 + } 636 + 637 + if (vp->remove) { 638 + again: 639 + vma_mark_detached(vp->remove, true); 640 + if (vp->file) { 641 + uprobe_munmap(vp->remove, vp->remove->vm_start, 642 + vp->remove->vm_end); 643 + fput(vp->file); 644 + } 645 + if (vp->remove->anon_vma) 646 + anon_vma_merge(vp->vma, vp->remove); 647 + mm->map_count--; 648 + mpol_put(vma_policy(vp->remove)); 649 + if (!vp->remove2) 650 + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); 651 + vm_area_free(vp->remove); 652 + 653 + /* 654 + * In mprotect's case 6 (see comments on vma_merge), 655 + * we are removing both mid and next vmas 656 + */ 657 + if (vp->remove2) { 658 + vp->remove = vp->remove2; 659 + vp->remove2 = NULL; 660 + goto 
again; 661 + } 662 + } 663 + if (vp->insert && vp->file) 664 + uprobe_mmap(vp->insert); 665 + validate_mm(mm); 666 + } 667 + 668 + /* 669 + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 670 + * @vmi: The vma iterator 671 + * @vma: The starting vm_area_struct 672 + * @mm: The mm_struct 673 + * @start: The aligned start address to munmap. 674 + * @end: The aligned end address to munmap. 675 + * @uf: The userfaultfd list_head 676 + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on 677 + * success. 678 + * 679 + * Return: 0 on success and drops the lock if so directed, error and leaves the 680 + * lock held otherwise. 681 + */ 682 + int 683 + do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, 684 + struct mm_struct *mm, unsigned long start, 685 + unsigned long end, struct list_head *uf, bool unlock) 686 + { 687 + struct vm_area_struct *prev, *next = NULL; 688 + struct maple_tree mt_detach; 689 + int count = 0; 690 + int error = -ENOMEM; 691 + unsigned long locked_vm = 0; 692 + MA_STATE(mas_detach, &mt_detach, 0, 0); 693 + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 694 + mt_on_stack(mt_detach); 695 + 696 + /* 697 + * If we need to split any vma, do it now to save pain later. 698 + * 699 + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 700 + * unmapped vm_area_struct will remain in use: so lower split_vma 701 + * places tmp vma above, and higher split_vma places tmp vma below. 702 + */ 703 + 704 + /* Does it split the first one? */ 705 + if (start > vma->vm_start) { 706 + 707 + /* 708 + * Make sure that map_count on return from munmap() will 709 + * not exceed its limit; but let map_count go just above 710 + * its limit temporarily, to help free resources as expected. 
711 + */ 712 + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 713 + goto map_count_exceeded; 714 + 715 + error = __split_vma(vmi, vma, start, 1); 716 + if (error) 717 + goto start_split_failed; 718 + } 719 + 720 + /* 721 + * Detach a range of VMAs from the mm. Using next as a temp variable as 722 + * it is always overwritten. 723 + */ 724 + next = vma; 725 + do { 726 + /* Does it split the end? */ 727 + if (next->vm_end > end) { 728 + error = __split_vma(vmi, next, end, 0); 729 + if (error) 730 + goto end_split_failed; 731 + } 732 + vma_start_write(next); 733 + mas_set(&mas_detach, count); 734 + error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); 735 + if (error) 736 + goto munmap_gather_failed; 737 + vma_mark_detached(next, true); 738 + if (next->vm_flags & VM_LOCKED) 739 + locked_vm += vma_pages(next); 740 + 741 + count++; 742 + if (unlikely(uf)) { 743 + /* 744 + * If userfaultfd_unmap_prep returns an error the vmas 745 + * will remain split, but userland will get a 746 + * highly unexpected error anyway. This is no 747 + * different than the case where the first of the two 748 + * __split_vma fails, but we don't undo the first 749 + * split, despite we could. This is unlikely enough 750 + * failure that it's not worth optimizing it for. 751 + */ 752 + error = userfaultfd_unmap_prep(next, start, end, uf); 753 + 754 + if (error) 755 + goto userfaultfd_error; 756 + } 757 + #ifdef CONFIG_DEBUG_VM_MAPLE_TREE 758 + BUG_ON(next->vm_start < start); 759 + BUG_ON(next->vm_start > end); 760 + #endif 761 + } for_each_vma_range(*vmi, next, end); 762 + 763 + #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) 764 + /* Make sure no VMAs are about to be lost. 
*/ 765 + { 766 + MA_STATE(test, &mt_detach, 0, 0); 767 + struct vm_area_struct *vma_mas, *vma_test; 768 + int test_count = 0; 769 + 770 + vma_iter_set(vmi, start); 771 + rcu_read_lock(); 772 + vma_test = mas_find(&test, count - 1); 773 + for_each_vma_range(*vmi, vma_mas, end) { 774 + BUG_ON(vma_mas != vma_test); 775 + test_count++; 776 + vma_test = mas_next(&test, count - 1); 777 + } 778 + rcu_read_unlock(); 779 + BUG_ON(count != test_count); 780 + } 781 + #endif 782 + 783 + while (vma_iter_addr(vmi) > start) 784 + vma_iter_prev_range(vmi); 785 + 786 + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); 787 + if (error) 788 + goto clear_tree_failed; 789 + 790 + /* Point of no return */ 791 + mm->locked_vm -= locked_vm; 792 + mm->map_count -= count; 793 + if (unlock) 794 + mmap_write_downgrade(mm); 795 + 796 + prev = vma_iter_prev_range(vmi); 797 + next = vma_next(vmi); 798 + if (next) 799 + vma_iter_prev_range(vmi); 800 + 801 + /* 802 + * We can free page tables without write-locking mmap_lock because VMAs 803 + * were isolated before we downgraded mmap_lock. 804 + */ 805 + mas_set(&mas_detach, 1); 806 + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, 807 + !unlock); 808 + /* Statistics and freeing VMAs */ 809 + mas_set(&mas_detach, 0); 810 + remove_mt(mm, &mas_detach); 811 + validate_mm(mm); 812 + if (unlock) 813 + mmap_read_unlock(mm); 814 + 815 + __mt_destroy(&mt_detach); 816 + return 0; 817 + 818 + clear_tree_failed: 819 + userfaultfd_error: 820 + munmap_gather_failed: 821 + end_split_failed: 822 + mas_set(&mas_detach, 0); 823 + mas_for_each(&mas_detach, next, end) 824 + vma_mark_detached(next, false); 825 + 826 + __mt_destroy(&mt_detach); 827 + start_split_failed: 828 + map_count_exceeded: 829 + validate_mm(mm); 830 + return error; 831 + } 832 + 833 + /* 834 + * do_vmi_munmap() - munmap a given range. 
835 + * @vmi: The vma iterator 836 + * @mm: The mm_struct 837 + * @start: The start address to munmap 838 + * @len: The length of the range to munmap 839 + * @uf: The userfaultfd list_head 840 + * @unlock: set to true if the user wants to drop the mmap_lock on success 841 + * 842 + * This function takes a @vmi that is either pointing to the previous VMA or set 843 + * to MA_START and sets it up to remove the mapping(s). The @len will be 844 + * aligned and any arch_unmap work will be performed. 845 + * 846 + * Return: 0 on success and drops the lock if so directed, error and leaves the 847 + * lock held otherwise. 848 + */ 849 + int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, 850 + unsigned long start, size_t len, struct list_head *uf, 851 + bool unlock) 852 + { 853 + unsigned long end; 854 + struct vm_area_struct *vma; 855 + 856 + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) 857 + return -EINVAL; 858 + 859 + end = start + PAGE_ALIGN(len); 860 + if (end == start) 861 + return -EINVAL; 862 + 863 + /* 864 + * Check if memory is sealed before arch_unmap. 865 + * Prevent unmapping a sealed VMA. 866 + * can_modify_mm assumes we have acquired the lock on MM. 867 + */ 868 + if (unlikely(!can_modify_mm(mm, start, end))) 869 + return -EPERM; 870 + 871 + /* arch_unmap() might do unmaps itself. */ 872 + arch_unmap(mm, start, end); 873 + 874 + /* Find the first overlapping VMA */ 875 + vma = vma_find(vmi, end); 876 + if (!vma) { 877 + if (unlock) 878 + mmap_write_unlock(mm); 879 + return 0; 880 + } 881 + 882 + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); 883 + } 884 + 885 + /* 886 + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), 887 + * figure out whether that can be merged with its predecessor or its 888 + * successor. Or both (it neatly fills a hole).
889 + * 890 + * In most cases - when called for mmap, brk or mremap - [addr,end) is 891 + * certain not to be mapped by the time vma_merge is called; but when 892 + * called for mprotect, it is certain to be already mapped (either at 893 + * an offset within prev, or at the start of next), and the flags of 894 + * this area are about to be changed to vm_flags - and the no-change 895 + * case has already been eliminated. 896 + * 897 + * The following mprotect cases have to be considered, where **** is 898 + * the area passed down from mprotect_fixup, never extending beyond one 899 + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts 900 + * at the same address as **** and is of the same or larger span, and 901 + * NNNN the next vma after ****: 902 + * 903 + * **** **** **** 904 + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC 905 + * cannot merge might become might become 906 + * PPNNNNNNNNNN PPPPPPPPPPCC 907 + * mmap, brk or case 4 below case 5 below 908 + * mremap move: 909 + * **** **** 910 + * PPPP NNNN PPPPCCCCNNNN 911 + * might become might become 912 + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or 913 + * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or 914 + * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 915 + * 916 + * It is important for case 8 that the vma CCCC overlapping the 917 + * region **** is never going to be extended over NNNN. Instead NNNN must 918 + * be extended in region **** and CCCC must be removed. This way in 919 + * all cases where vma_merge succeeds, the moment vma_merge drops the 920 + * rmap_locks, the properties of the merged vma will be already 921 + * correct for the whole merged range. Some of those properties like 922 + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must 923 + * be correct for the whole merged range immediately after the 924 + * rmap_locks are released.
Otherwise if NNNN would be removed and 925 + * CCCC would be extended over the NNNN range, remove_migration_ptes 926 + * or other rmap walkers (if working on addresses beyond the "end" 927 + * parameter) may establish ptes with the wrong permissions of CCCC 928 + * instead of the right permissions of NNNN. 929 + * 930 + * In the code below: 931 + * PPPP is represented by *prev 932 + * CCCC is represented by *curr or not represented at all (NULL) 933 + * NNNN is represented by *next or not represented at all (NULL) 934 + * **** is not represented - it will be merged and the vma containing the 935 + * area is returned, or the function will return NULL 936 + */ 937 + static struct vm_area_struct 938 + *vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, 939 + struct vm_area_struct *src, unsigned long addr, unsigned long end, 940 + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, 941 + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, 942 + struct anon_vma_name *anon_name) 943 + { 944 + struct mm_struct *mm = src->vm_mm; 945 + struct anon_vma *anon_vma = src->anon_vma; 946 + struct file *file = src->vm_file; 947 + struct vm_area_struct *curr, *next, *res; 948 + struct vm_area_struct *vma, *adjust, *remove, *remove2; 949 + struct vm_area_struct *anon_dup = NULL; 950 + struct vma_prepare vp; 951 + pgoff_t vma_pgoff; 952 + int err = 0; 953 + bool merge_prev = false; 954 + bool merge_next = false; 955 + bool vma_expanded = false; 956 + unsigned long vma_start = addr; 957 + unsigned long vma_end = end; 958 + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 959 + long adj_start = 0; 960 + 961 + /* 962 + * We later require that vma->vm_flags == vm_flags, 963 + * so this tests vma->vm_flags & VM_SPECIAL, too. 964 + */ 965 + if (vm_flags & VM_SPECIAL) 966 + return NULL; 967 + 968 + /* Does the input range span an existing VMA? (cases 5 - 8) */ 969 + curr = find_vma_intersection(mm, prev ? 
prev->vm_end : 0, end); 970 + 971 + if (!curr || /* cases 1 - 4 */ 972 + end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ 973 + next = vma_lookup(mm, end); 974 + else 975 + next = NULL; /* case 5 */ 976 + 977 + if (prev) { 978 + vma_start = prev->vm_start; 979 + vma_pgoff = prev->vm_pgoff; 980 + 981 + /* Can we merge the predecessor? */ 982 + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) 983 + && can_vma_merge_after(prev, vm_flags, anon_vma, file, 984 + pgoff, vm_userfaultfd_ctx, anon_name)) { 985 + merge_prev = true; 986 + vma_prev(vmi); 987 + } 988 + } 989 + 990 + /* Can we merge the successor? */ 991 + if (next && mpol_equal(policy, vma_policy(next)) && 992 + can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, 993 + vm_userfaultfd_ctx, anon_name)) { 994 + merge_next = true; 995 + } 996 + 997 + /* Verify some invariant that must be enforced by the caller. */ 998 + VM_WARN_ON(prev && addr <= prev->vm_start); 999 + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); 1000 + VM_WARN_ON(addr >= end); 1001 + 1002 + if (!merge_prev && !merge_next) 1003 + return NULL; /* Not mergeable. */ 1004 + 1005 + if (merge_prev) 1006 + vma_start_write(prev); 1007 + 1008 + res = vma = prev; 1009 + remove = remove2 = adjust = NULL; 1010 + 1011 + /* Can we merge both the predecessor and the successor? */ 1012 + if (merge_prev && merge_next && 1013 + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { 1014 + vma_start_write(next); 1015 + remove = next; /* case 1 */ 1016 + vma_end = next->vm_end; 1017 + err = dup_anon_vma(prev, next, &anon_dup); 1018 + if (curr) { /* case 6 */ 1019 + vma_start_write(curr); 1020 + remove = curr; 1021 + remove2 = next; 1022 + /* 1023 + * Note that the dup_anon_vma below cannot overwrite err 1024 + * since the first caller would do nothing unless next 1025 + * has an anon_vma. 
1026 + */ 1027 + if (!next->anon_vma) 1028 + err = dup_anon_vma(prev, curr, &anon_dup); 1029 + } 1030 + } else if (merge_prev) { /* case 2 */ 1031 + if (curr) { 1032 + vma_start_write(curr); 1033 + if (end == curr->vm_end) { /* case 7 */ 1034 + /* 1035 + * can_vma_merge_after() assumed we would not be 1036 + * removing prev vma, so it skipped the check 1037 + * for vm_ops->close, but we are removing curr 1038 + */ 1039 + if (curr->vm_ops && curr->vm_ops->close) 1040 + err = -EINVAL; 1041 + remove = curr; 1042 + } else { /* case 5 */ 1043 + adjust = curr; 1044 + adj_start = (end - curr->vm_start); 1045 + } 1046 + if (!err) 1047 + err = dup_anon_vma(prev, curr, &anon_dup); 1048 + } 1049 + } else { /* merge_next */ 1050 + vma_start_write(next); 1051 + res = next; 1052 + if (prev && addr < prev->vm_end) { /* case 4 */ 1053 + vma_start_write(prev); 1054 + vma_end = addr; 1055 + adjust = next; 1056 + adj_start = -(prev->vm_end - addr); 1057 + err = dup_anon_vma(next, prev, &anon_dup); 1058 + } else { 1059 + /* 1060 + * Note that cases 3 and 8 are the ONLY ones where prev 1061 + * is permitted to be (but is not necessarily) NULL. 1062 + */ 1063 + vma = next; /* case 3 */ 1064 + vma_start = addr; 1065 + vma_end = next->vm_end; 1066 + vma_pgoff = next->vm_pgoff - pglen; 1067 + if (curr) { /* case 8 */ 1068 + vma_pgoff = curr->vm_pgoff; 1069 + vma_start_write(curr); 1070 + remove = curr; 1071 + err = dup_anon_vma(next, curr, &anon_dup); 1072 + } 1073 + } 1074 + } 1075 + 1076 + /* Error in anon_vma clone. 
*/ 1077 + if (err) 1078 + goto anon_vma_fail; 1079 + 1080 + if (vma_start < vma->vm_start || vma_end > vma->vm_end) 1081 + vma_expanded = true; 1082 + 1083 + if (vma_expanded) { 1084 + vma_iter_config(vmi, vma_start, vma_end); 1085 + } else { 1086 + vma_iter_config(vmi, adjust->vm_start + adj_start, 1087 + adjust->vm_end); 1088 + } 1089 + 1090 + if (vma_iter_prealloc(vmi, vma)) 1091 + goto prealloc_fail; 1092 + 1093 + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); 1094 + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && 1095 + vp.anon_vma != adjust->anon_vma); 1096 + 1097 + vma_prepare(&vp); 1098 + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); 1099 + vma_set_range(vma, vma_start, vma_end, vma_pgoff); 1100 + 1101 + if (vma_expanded) 1102 + vma_iter_store(vmi, vma); 1103 + 1104 + if (adj_start) { 1105 + adjust->vm_start += adj_start; 1106 + adjust->vm_pgoff += adj_start >> PAGE_SHIFT; 1107 + if (adj_start < 0) { 1108 + WARN_ON(vma_expanded); 1109 + vma_iter_store(vmi, next); 1110 + } 1111 + } 1112 + 1113 + vma_complete(&vp, vmi, mm); 1114 + khugepaged_enter_vma(res, vm_flags); 1115 + return res; 1116 + 1117 + prealloc_fail: 1118 + if (anon_dup) 1119 + unlink_anon_vmas(anon_dup); 1120 + 1121 + anon_vma_fail: 1122 + vma_iter_set(vmi, addr); 1123 + vma_iter_load(vmi); 1124 + return NULL; 1125 + } 1126 + 1127 + /* 1128 + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd 1129 + * context and anonymous VMA name within the range [start, end). 1130 + * 1131 + * As a result, we might be able to merge the newly modified VMA range with an 1132 + * adjacent VMA with identical properties. 1133 + * 1134 + * If no merge is possible and the range does not span the entirety of the VMA, 1135 + * we then need to split the VMA to accommodate the change. 1136 + * 1137 + * The function returns either the merged VMA, the original VMA if a split was 1138 + * required instead, or an error if the split failed. 
1139 + */ 1140 + struct vm_area_struct *vma_modify(struct vma_iterator *vmi, 1141 + struct vm_area_struct *prev, 1142 + struct vm_area_struct *vma, 1143 + unsigned long start, unsigned long end, 1144 + unsigned long vm_flags, 1145 + struct mempolicy *policy, 1146 + struct vm_userfaultfd_ctx uffd_ctx, 1147 + struct anon_vma_name *anon_name) 1148 + { 1149 + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 1150 + struct vm_area_struct *merged; 1151 + 1152 + merged = vma_merge(vmi, prev, vma, start, end, vm_flags, 1153 + pgoff, policy, uffd_ctx, anon_name); 1154 + if (merged) 1155 + return merged; 1156 + 1157 + if (vma->vm_start < start) { 1158 + int err = split_vma(vmi, vma, start, 1); 1159 + 1160 + if (err) 1161 + return ERR_PTR(err); 1162 + } 1163 + 1164 + if (vma->vm_end > end) { 1165 + int err = split_vma(vmi, vma, end, 0); 1166 + 1167 + if (err) 1168 + return ERR_PTR(err); 1169 + } 1170 + 1171 + return vma; 1172 + } 1173 + 1174 + /* 1175 + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller 1176 + * must ensure that [start, end) does not overlap any existing VMA. 1177 + */ 1178 + struct vm_area_struct 1179 + *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, 1180 + struct vm_area_struct *vma, unsigned long start, 1181 + unsigned long end, pgoff_t pgoff) 1182 + { 1183 + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, 1184 + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1185 + } 1186 + 1187 + /* 1188 + * Expand vma by delta bytes, potentially merging with an immediately adjacent 1189 + * VMA with identical properties. 1190 + */ 1191 + struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, 1192 + struct vm_area_struct *vma, 1193 + unsigned long delta) 1194 + { 1195 + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); 1196 + 1197 + /* vma is specified as prev, so case 1 or 2 will apply. 
*/ 1198 + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, 1199 + vma->vm_flags, pgoff, vma_policy(vma), 1200 + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); 1201 + } 1202 + 1203 + void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) 1204 + { 1205 + vb->count = 0; 1206 + } 1207 + 1208 + static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) 1209 + { 1210 + struct address_space *mapping; 1211 + int i; 1212 + 1213 + mapping = vb->vmas[0]->vm_file->f_mapping; 1214 + i_mmap_lock_write(mapping); 1215 + for (i = 0; i < vb->count; i++) { 1216 + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); 1217 + __remove_shared_vm_struct(vb->vmas[i], mapping); 1218 + } 1219 + i_mmap_unlock_write(mapping); 1220 + 1221 + unlink_file_vma_batch_init(vb); 1222 + } 1223 + 1224 + void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, 1225 + struct vm_area_struct *vma) 1226 + { 1227 + if (vma->vm_file == NULL) 1228 + return; 1229 + 1230 + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || 1231 + vb->count == ARRAY_SIZE(vb->vmas)) 1232 + unlink_file_vma_batch_process(vb); 1233 + 1234 + vb->vmas[vb->count] = vma; 1235 + vb->count++; 1236 + } 1237 + 1238 + void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) 1239 + { 1240 + if (vb->count > 0) 1241 + unlink_file_vma_batch_process(vb); 1242 + } 1243 + 1244 + /* 1245 + * Unlink a file-based vm structure from its interval tree, to hide 1246 + * vma from rmap and vmtruncate before freeing its page tables. 
1247 + */ 1248 + void unlink_file_vma(struct vm_area_struct *vma) 1249 + { 1250 + struct file *file = vma->vm_file; 1251 + 1252 + if (file) { 1253 + struct address_space *mapping = file->f_mapping; 1254 + 1255 + i_mmap_lock_write(mapping); 1256 + __remove_shared_vm_struct(vma, mapping); 1257 + i_mmap_unlock_write(mapping); 1258 + } 1259 + } 1260 + 1261 + void vma_link_file(struct vm_area_struct *vma) 1262 + { 1263 + struct file *file = vma->vm_file; 1264 + struct address_space *mapping; 1265 + 1266 + if (file) { 1267 + mapping = file->f_mapping; 1268 + i_mmap_lock_write(mapping); 1269 + __vma_link_file(vma, mapping); 1270 + i_mmap_unlock_write(mapping); 1271 + } 1272 + } 1273 + 1274 + int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) 1275 + { 1276 + VMA_ITERATOR(vmi, mm, 0); 1277 + 1278 + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); 1279 + if (vma_iter_prealloc(&vmi, vma)) 1280 + return -ENOMEM; 1281 + 1282 + vma_start_write(vma); 1283 + vma_iter_store(&vmi, vma); 1284 + vma_link_file(vma); 1285 + mm->map_count++; 1286 + validate_mm(mm); 1287 + return 0; 1288 + } 1289 + 1290 + /* 1291 + * Copy the vma structure to a new location in the same mm, 1292 + * prior to moving page table entries, to effect an mremap move. 1293 + */ 1294 + struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 1295 + unsigned long addr, unsigned long len, pgoff_t pgoff, 1296 + bool *need_rmap_locks) 1297 + { 1298 + struct vm_area_struct *vma = *vmap; 1299 + unsigned long vma_start = vma->vm_start; 1300 + struct mm_struct *mm = vma->vm_mm; 1301 + struct vm_area_struct *new_vma, *prev; 1302 + bool faulted_in_anon_vma = true; 1303 + VMA_ITERATOR(vmi, mm, addr); 1304 + 1305 + /* 1306 + * If anonymous vma has not yet been faulted, update new pgoff 1307 + * to match new location, to increase its chance of merging. 
1308 + */ 1309 + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { 1310 + pgoff = addr >> PAGE_SHIFT; 1311 + faulted_in_anon_vma = false; 1312 + } 1313 + 1314 + new_vma = find_vma_prev(mm, addr, &prev); 1315 + if (new_vma && new_vma->vm_start < addr + len) 1316 + return NULL; /* should never get here */ 1317 + 1318 + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); 1319 + if (new_vma) { 1320 + /* 1321 + * Source vma may have been merged into new_vma 1322 + */ 1323 + if (unlikely(vma_start >= new_vma->vm_start && 1324 + vma_start < new_vma->vm_end)) { 1325 + /* 1326 + * The only way we can get a vma_merge with 1327 + * self during an mremap is if the vma hasn't 1328 + * been faulted in yet and we were allowed to 1329 + * reset the dst vma->vm_pgoff to the 1330 + * destination address of the mremap to allow 1331 + * the merge to happen. mremap must change the 1332 + * vm_pgoff linearity between src and dst vmas 1333 + * (in turn preventing a vma_merge) to be 1334 + * safe. It is only safe to keep the vm_pgoff 1335 + * linear if there are no pages mapped yet. 
1336 + */ 1337 + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); 1338 + *vmap = vma = new_vma; 1339 + } 1340 + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 1341 + } else { 1342 + new_vma = vm_area_dup(vma); 1343 + if (!new_vma) 1344 + goto out; 1345 + vma_set_range(new_vma, addr, addr + len, pgoff); 1346 + if (vma_dup_policy(vma, new_vma)) 1347 + goto out_free_vma; 1348 + if (anon_vma_clone(new_vma, vma)) 1349 + goto out_free_mempol; 1350 + if (new_vma->vm_file) 1351 + get_file(new_vma->vm_file); 1352 + if (new_vma->vm_ops && new_vma->vm_ops->open) 1353 + new_vma->vm_ops->open(new_vma); 1354 + if (vma_link(mm, new_vma)) 1355 + goto out_vma_link; 1356 + *need_rmap_locks = false; 1357 + } 1358 + return new_vma; 1359 + 1360 + out_vma_link: 1361 + if (new_vma->vm_ops && new_vma->vm_ops->close) 1362 + new_vma->vm_ops->close(new_vma); 1363 + 1364 + if (new_vma->vm_file) 1365 + fput(new_vma->vm_file); 1366 + 1367 + unlink_anon_vmas(new_vma); 1368 + out_free_mempol: 1369 + mpol_put(vma_policy(new_vma)); 1370 + out_free_vma: 1371 + vm_area_free(new_vma); 1372 + out: 1373 + return NULL; 1374 + } 1375 + 1376 + /* 1377 + * Rough compatibility check to quickly see if it's even worth looking 1378 + * at sharing an anon_vma. 1379 + * 1380 + * They need to have the same vm_file, and the flags can only differ 1381 + * in things that mprotect may change. 1382 + * 1383 + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1384 + * we can merge the two vma's. For example, we refuse to merge a vma if 1385 + * there is a vm_ops->close() function, because that indicates that the 1386 + * driver is doing some kind of reference counting. But that doesn't 1387 + * really matter for the anon_vma sharing case. 
1388 + */ 1389 + static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1390 + { 1391 + return a->vm_end == b->vm_start && 1392 + mpol_equal(vma_policy(a), vma_policy(b)) && 1393 + a->vm_file == b->vm_file && 1394 + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && 1395 + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1396 + } 1397 + 1398 + /* 1399 + * Do some basic sanity checking to see if we can re-use the anon_vma 1400 + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1401 + * the same as 'old', the other will be the new one that is trying 1402 + * to share the anon_vma. 1403 + * 1404 + * NOTE! This runs with mmap_lock held for reading, so it is possible that 1405 + * the anon_vma of 'old' is concurrently in the process of being set up 1406 + * by another page fault trying to merge _that_. But that's ok: if it 1407 + * is being set up, that automatically means that it will be a singleton 1408 + * acceptable for merging, so we can do all of this optimistically. But 1409 + * we do that READ_ONCE() to make sure that we never re-load the pointer. 1410 + * 1411 + * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1412 + * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1413 + * is to return an anon_vma that is "complex" due to having gone through 1414 + * a fork). 1415 + * 1416 + * We also make sure that the two vma's are compatible (adjacent, 1417 + * and with the same memory policies). That's all stable, even with just 1418 + * a read lock on the mmap_lock. 
1419 + */ 1420 + static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, 1421 + struct vm_area_struct *a, 1422 + struct vm_area_struct *b) 1423 + { 1424 + if (anon_vma_compatible(a, b)) { 1425 + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); 1426 + 1427 + if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1428 + return anon_vma; 1429 + } 1430 + return NULL; 1431 + } 1432 + 1433 + /* 1434 + * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1435 + * neighbouring vmas for a suitable anon_vma, before it goes off 1436 + * to allocate a new anon_vma. It checks because a repetitive 1437 + * sequence of mprotects and faults may otherwise lead to distinct 1438 + * anon_vmas being allocated, preventing vma merge in subsequent 1439 + * mprotect. 1440 + */ 1441 + struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1442 + { 1443 + struct anon_vma *anon_vma = NULL; 1444 + struct vm_area_struct *prev, *next; 1445 + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); 1446 + 1447 + /* Try next first. */ 1448 + next = vma_iter_load(&vmi); 1449 + if (next) { 1450 + anon_vma = reusable_anon_vma(next, vma, next); 1451 + if (anon_vma) 1452 + return anon_vma; 1453 + } 1454 + 1455 + prev = vma_prev(&vmi); 1456 + VM_BUG_ON_VMA(prev != vma, vma); 1457 + prev = vma_prev(&vmi); 1458 + /* Try prev next. */ 1459 + if (prev) 1460 + anon_vma = reusable_anon_vma(prev, prev, vma); 1461 + 1462 + /* 1463 + * We might reach here with anon_vma == NULL if we can't find 1464 + * any reusable anon_vma. 1465 + * There's no absolute need to look only at touching neighbours: 1466 + * we could search further afield for "compatible" anon_vmas. 1467 + * But it would probably just be a waste of time searching, 1468 + * or lead to too many vmas hanging off the same anon_vma. 1469 + * We're trying to allow mprotect remerging later on, 1470 + * not trying to minimize memory used for anon_vmas. 
1471 + */ 1472 + return anon_vma; 1473 + } 1474 + 1475 + static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) 1476 + { 1477 + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); 1478 + } 1479 + 1480 + static bool vma_is_shared_writable(struct vm_area_struct *vma) 1481 + { 1482 + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == 1483 + (VM_WRITE | VM_SHARED); 1484 + } 1485 + 1486 + static bool vma_fs_can_writeback(struct vm_area_struct *vma) 1487 + { 1488 + /* No managed pages to writeback. */ 1489 + if (vma->vm_flags & VM_PFNMAP) 1490 + return false; 1491 + 1492 + return vma->vm_file && vma->vm_file->f_mapping && 1493 + mapping_can_writeback(vma->vm_file->f_mapping); 1494 + } 1495 + 1496 + /* 1497 + * Does this VMA require the underlying folios to have their dirty state 1498 + * tracked? 1499 + */ 1500 + bool vma_needs_dirty_tracking(struct vm_area_struct *vma) 1501 + { 1502 + /* Only shared, writable VMAs require dirty tracking. */ 1503 + if (!vma_is_shared_writable(vma)) 1504 + return false; 1505 + 1506 + /* Does the filesystem need to be notified? */ 1507 + if (vm_ops_needs_writenotify(vma->vm_ops)) 1508 + return true; 1509 + 1510 + /* 1511 + * Even if the filesystem doesn't indicate a need for writenotify, if it 1512 + * can writeback, dirty tracking is still required. 1513 + */ 1514 + return vma_fs_can_writeback(vma); 1515 + } 1516 + 1517 + /* 1518 + * Some shared mappings will want the pages marked read-only 1519 + * to track write events. If so, we'll downgrade vm_page_prot 1520 + * to the private version (using protection_map[] without the 1521 + * VM_SHARED bit). 1522 + */ 1523 + bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) 1524 + { 1525 + /* If it was private or non-writable, the write bit is already clear */ 1526 + if (!vma_is_shared_writable(vma)) 1527 + return false; 1528 + 1529 + /* The backer wishes to know when pages are first written to? 
*/ 1530 + if (vm_ops_needs_writenotify(vma->vm_ops)) 1531 + return true; 1532 + 1533 + /* The open routine did something to the protections that pgprot_modify 1534 + * won't preserve? */ 1535 + if (pgprot_val(vm_page_prot) != 1536 + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) 1537 + return false; 1538 + 1539 + /* 1540 + * Do we need to track softdirty? hugetlb does not support softdirty 1541 + * tracking yet. 1542 + */ 1543 + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) 1544 + return true; 1545 + 1546 + /* Do we need write faults for uffd-wp tracking? */ 1547 + if (userfaultfd_wp(vma)) 1548 + return true; 1549 + 1550 + /* Can the mapping track the dirty pages? */ 1551 + return vma_fs_can_writeback(vma); 1552 + } 1553 + 1554 + unsigned long count_vma_pages_range(struct mm_struct *mm, 1555 + unsigned long addr, unsigned long end) 1556 + { 1557 + VMA_ITERATOR(vmi, mm, addr); 1558 + struct vm_area_struct *vma; 1559 + unsigned long nr_pages = 0; 1560 + 1561 + for_each_vma_range(vmi, vma, end) { 1562 + unsigned long vm_start = max(addr, vma->vm_start); 1563 + unsigned long vm_end = min(end, vma->vm_end); 1564 + 1565 + nr_pages += PHYS_PFN(vm_end - vm_start); 1566 + } 1567 + 1568 + return nr_pages; 1569 + } 1570 + 1571 + static DEFINE_MUTEX(mm_all_locks_mutex); 1572 + 1573 + static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 1574 + { 1575 + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 1576 + /* 1577 + * The LSB of head.next can't change from under us 1578 + * because we hold the mm_all_locks_mutex. 1579 + */ 1580 + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); 1581 + /* 1582 + * We can safely modify head.next after taking the 1583 + * anon_vma->root->rwsem. If some other vma in this mm shares 1584 + * the same anon_vma we won't take it again. 
1585 + * 1586 + * No need of atomic instructions here, head.next 1587 + * can't change from under us thanks to the 1588 + * anon_vma->root->rwsem. 1589 + */ 1590 + if (__test_and_set_bit(0, (unsigned long *) 1591 + &anon_vma->root->rb_root.rb_root.rb_node)) 1592 + BUG(); 1593 + } 1594 + } 1595 + 1596 + static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 1597 + { 1598 + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 1599 + /* 1600 + * AS_MM_ALL_LOCKS can't change from under us because 1601 + * we hold the mm_all_locks_mutex. 1602 + * 1603 + * Operations on ->flags have to be atomic because 1604 + * even if AS_MM_ALL_LOCKS is stable thanks to the 1605 + * mm_all_locks_mutex, there may be other cpus 1606 + * changing other bitflags in parallel to us. 1607 + */ 1608 + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 1609 + BUG(); 1610 + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); 1611 + } 1612 + } 1613 + 1614 + /* 1615 + * This operation locks against the VM for all pte/vma/mm related 1616 + * operations that could ever happen on a certain mm. This includes 1617 + * vmtruncate, try_to_unmap, and all page faults. 1618 + * 1619 + * The caller must take the mmap_lock in write mode before calling 1620 + * mm_take_all_locks(). The caller isn't allowed to release the 1621 + * mmap_lock until mm_drop_all_locks() returns. 1622 + * 1623 + * mmap_lock in write mode is required in order to block all operations 1624 + * that could modify pagetables and free pages without need of 1625 + * altering the vma layout. It's also needed in write mode to avoid new 1626 + * anon_vmas to be associated with existing vmas. 1627 + * 1628 + * A single task can't take more than one mm_take_all_locks() in a row 1629 + * or it would deadlock. 
1630 + * 1631 + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 1632 + * mapping->flags avoid taking the same lock twice, if more than one 1633 + * vma in this mm is backed by the same anon_vma or address_space. 1634 + * 1635 + * We take locks in following order, accordingly to comment at beginning 1636 + * of mm/rmap.c: 1637 + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for 1638 + * hugetlb mapping); 1639 + * - all vmas marked locked 1640 + * - all i_mmap_rwsem locks; 1641 + * - all anon_vma->rwsem 1642 + * 1643 + * We can take all locks within these types randomly because the VM code 1644 + * doesn't nest them and we are protected from parallel mm_take_all_locks() by 1645 + * mm_all_locks_mutex. 1646 + * 1647 + * mm_take_all_locks() and mm_drop_all_locks are expensive operations 1648 + * that may have to take thousands of locks. 1649 + * 1650 + * mm_take_all_locks() can fail if it's interrupted by signals. 1651 + */ 1652 + int mm_take_all_locks(struct mm_struct *mm) 1653 + { 1654 + struct vm_area_struct *vma; 1655 + struct anon_vma_chain *avc; 1656 + VMA_ITERATOR(vmi, mm, 0); 1657 + 1658 + mmap_assert_write_locked(mm); 1659 + 1660 + mutex_lock(&mm_all_locks_mutex); 1661 + 1662 + /* 1663 + * vma_start_write() does not have a complement in mm_drop_all_locks() 1664 + * because vma_start_write() is always asymmetrical; it marks a VMA as 1665 + * being written to until mmap_write_unlock() or mmap_write_downgrade() 1666 + * is reached.
1667 + */ 1668 + for_each_vma(vmi, vma) { 1669 + if (signal_pending(current)) 1670 + goto out_unlock; 1671 + vma_start_write(vma); 1672 + } 1673 + 1674 + vma_iter_init(&vmi, mm, 0); 1675 + for_each_vma(vmi, vma) { 1676 + if (signal_pending(current)) 1677 + goto out_unlock; 1678 + if (vma->vm_file && vma->vm_file->f_mapping && 1679 + is_vm_hugetlb_page(vma)) 1680 + vm_lock_mapping(mm, vma->vm_file->f_mapping); 1681 + } 1682 + 1683 + vma_iter_init(&vmi, mm, 0); 1684 + for_each_vma(vmi, vma) { 1685 + if (signal_pending(current)) 1686 + goto out_unlock; 1687 + if (vma->vm_file && vma->vm_file->f_mapping && 1688 + !is_vm_hugetlb_page(vma)) 1689 + vm_lock_mapping(mm, vma->vm_file->f_mapping); 1690 + } 1691 + 1692 + vma_iter_init(&vmi, mm, 0); 1693 + for_each_vma(vmi, vma) { 1694 + if (signal_pending(current)) 1695 + goto out_unlock; 1696 + if (vma->anon_vma) 1697 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 1698 + vm_lock_anon_vma(mm, avc->anon_vma); 1699 + } 1700 + 1701 + return 0; 1702 + 1703 + out_unlock: 1704 + mm_drop_all_locks(mm); 1705 + return -EINTR; 1706 + } 1707 + 1708 + static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 1709 + { 1710 + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { 1711 + /* 1712 + * The LSB of head.next can't change to 0 from under 1713 + * us because we hold the mm_all_locks_mutex. 1714 + * 1715 + * We must however clear the bitflag before unlocking 1716 + * the vma so the users using the anon_vma->rb_root will 1717 + * never see our bitflag. 1718 + * 1719 + * No need of atomic instructions here, head.next 1720 + * can't change from under us until we release the 1721 + * anon_vma->root->rwsem. 
1722 + */ 1723 + if (!__test_and_clear_bit(0, (unsigned long *) 1724 + &anon_vma->root->rb_root.rb_root.rb_node)) 1725 + BUG(); 1726 + anon_vma_unlock_write(anon_vma); 1727 + } 1728 + } 1729 + 1730 + static void vm_unlock_mapping(struct address_space *mapping) 1731 + { 1732 + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 1733 + /* 1734 + * AS_MM_ALL_LOCKS can't change to 0 from under us 1735 + * because we hold the mm_all_locks_mutex. 1736 + */ 1737 + i_mmap_unlock_write(mapping); 1738 + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 1739 + &mapping->flags)) 1740 + BUG(); 1741 + } 1742 + } 1743 + 1744 + /* 1745 + * The mmap_lock cannot be released by the caller until 1746 + * mm_drop_all_locks() returns. 1747 + */ 1748 + void mm_drop_all_locks(struct mm_struct *mm) 1749 + { 1750 + struct vm_area_struct *vma; 1751 + struct anon_vma_chain *avc; 1752 + VMA_ITERATOR(vmi, mm, 0); 1753 + 1754 + mmap_assert_write_locked(mm); 1755 + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 1756 + 1757 + for_each_vma(vmi, vma) { 1758 + if (vma->anon_vma) 1759 + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 1760 + vm_unlock_anon_vma(avc->anon_vma); 1761 + if (vma->vm_file && vma->vm_file->f_mapping) 1762 + vm_unlock_mapping(vma->vm_file->f_mapping); 1763 + } 1764 + 1765 + mutex_unlock(&mm_all_locks_mutex); 1766 + }
+364
mm/vma.h
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
 */
#ifndef __MM_VMA_H
#define __MM_VMA_H

/*
 * VMA lock generalization
 *
 * NOTE(review): the role of each field is defined by init_vma_prep(),
 * vma_prepare() and vma_complete() in vma.c, which are only declared here.
 */
struct vma_prepare {
	struct vm_area_struct *vma;
	struct vm_area_struct *adj_next;
	struct file *file;
	struct address_space *mapping;
	struct anon_vma *anon_vma;
	struct vm_area_struct *insert;
	struct vm_area_struct *remove;
	struct vm_area_struct *remove2;
};

/*
 * Fixed-size batch of up to 8 VMA pointers used by the
 * unlink_file_vma_batch_*() functions declared below.
 * Presumably count is the number of vmas[] slots in use - confirm in vma.c.
 */
struct unlink_vma_file_batch {
	int count;
	struct vm_area_struct *vmas[8];
};

/* validate_mm() is an empty statement unless CONFIG_DEBUG_VM_MAPLE_TREE is set. */
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm);
#else
#define validate_mm(mm) do { } while (0)
#endif

/* Required for expand_downwards(). */
void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);

/* Required for expand_downwards(). */
void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);

/* Required for do_brk_flags(). */
void vma_prepare(struct vma_prepare *vp);

/* Required for do_brk_flags(). */
void init_vma_prep(struct vma_prepare *vp,
		   struct vm_area_struct *vma);

/* Required for do_brk_flags(). */
void vma_complete(struct vma_prepare *vp,
		  struct vma_iterator *vmi, struct mm_struct *mm);

int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff,
	       struct vm_area_struct *next);

int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
	       unsigned long start, unsigned long end, pgoff_t pgoff);

int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
		    struct mm_struct *mm, unsigned long start,
		    unsigned long end, struct list_head *uf, bool unlock);

int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
		  unsigned long start, size_t len, struct list_head *uf,
		  bool unlock);

void remove_vma(struct vm_area_struct *vma, bool unreachable);

void unmap_region(struct mm_struct *mm, struct ma_state *mas,
		struct vm_area_struct *vma, struct vm_area_struct *prev,
		struct vm_area_struct *next, unsigned long start,
		unsigned long end, unsigned long tree_end, bool mm_wr_locked);

/* Required by mmap_region(). */
bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
		struct anon_vma *anon_vma, struct file *file,
		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
		struct anon_vma_name *anon_name);

/* Required by mmap_region() and do_brk_flags(). */
bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
		struct anon_vma *anon_vma, struct file *file,
		pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
		struct anon_vma_name *anon_name);

/*
 * Common entry point for the vma_modify_*() wrappers below, each of which
 * passes new values for some attributes and the VMA's current values for
 * the rest.
 */
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
				  struct vm_area_struct *prev,
				  struct vm_area_struct *vma,
				  unsigned long start, unsigned long end,
				  unsigned long vm_flags,
				  struct mempolicy *policy,
				  struct vm_userfaultfd_ctx uffd_ctx,
				  struct anon_vma_name *anon_name);

/*
 * We are about to modify the VMA's flags.
 *
 * Policy, userfaultfd context and anon name are taken from @vma unchanged.
 */
static inline struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
		  struct vm_area_struct *prev,
		  struct vm_area_struct *vma,
		  unsigned long start, unsigned long end,
		  unsigned long new_flags)
{
	return vma_modify(vmi, prev, vma, start, end, new_flags,
			  vma_policy(vma), vma->vm_userfaultfd_ctx,
			  anon_vma_name(vma));
}

/*
 * We are about to modify the VMA's flags and/or anon_name.
 *
 * Policy and userfaultfd context are taken from @vma unchanged.
 */
static inline struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
		       struct vm_area_struct *prev,
		       struct vm_area_struct *vma,
		       unsigned long start,
		       unsigned long end,
		       unsigned long new_flags,
		       struct anon_vma_name *new_name)
{
	return vma_modify(vmi, prev, vma, start, end, new_flags,
			  vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
}

/*
 * We are about to modify the VMA's memory policy.
 *
 * Flags, userfaultfd context and anon name are taken from @vma unchanged.
 */
static inline struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
		   struct vm_area_struct *prev,
		   struct vm_area_struct *vma,
		   unsigned long start, unsigned long end,
		   struct mempolicy *new_pol)
{
	return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
			  new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}

/*
 * We are about to modify the VMA's flags and/or uffd context.
 *
 * Policy and anon name are taken from @vma unchanged.
 */
static inline struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
		       struct vm_area_struct *prev,
		       struct vm_area_struct *vma,
		       unsigned long start, unsigned long end,
		       unsigned long new_flags,
		       struct vm_userfaultfd_ctx new_ctx)
{
	return vma_modify(vmi, prev, vma, start, end, new_flags,
			  vma_policy(vma), new_ctx, anon_vma_name(vma));
}

struct vm_area_struct
*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
		   struct vm_area_struct *vma, unsigned long start,
		   unsigned long end, pgoff_t pgoff);

struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
					struct vm_area_struct *vma,
					unsigned long delta);

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
			       struct vm_area_struct *vma);

void unlink_file_vma(struct vm_area_struct *vma);

void vma_link_file(struct vm_area_struct *vma);

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);

struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff,
	bool *need_rmap_locks);

struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
unsigned long count_vma_pages_range(struct mm_struct *mm,
				    unsigned long addr, unsigned long end);

/*
 * For VM_SHARED mappings this defers to vma_wants_writenotify(); for
 * everything else it reports whether VM_WRITE is set.
 */
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
	/*
	 * We want to check manually if we can change individual PTEs writable
	 * if we can't do that automatically for all PTEs in a mapping. For
	 * private mappings, that's always the case when we have write
	 * permissions as we properly have to handle COW.
	 */
	if (vma->vm_flags & VM_SHARED)
		return vma_wants_writenotify(vma, vma->vm_page_prot);
	return !!(vma->vm_flags & VM_WRITE);
}

#ifdef CONFIG_MMU
/* Apply the page protection derived from @vm_flags on top of @oldprot. */
static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
#endif

/* Step the iterator backwards via mas_prev(), with @min as the lower limit. */
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
						    unsigned long min)
{
	return mas_prev(&vmi->mas, min);
}

/*
 * Store @vma over the range [vm_start, vm_end - 1], allocating with @gfp.
 *
 * Returns 0 on success or -ENOMEM if the maple state reports an error.
 */
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
			struct vm_area_struct *vma, gfp_t gfp)
{
	/*
	 * An iterator that is already positioned on a range which does not
	 * contain vma->vm_start is invalidated before the range is set.
	 */
	if (vmi->mas.status != ma_start &&
	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
		vma_iter_invalidate(vmi);

	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
	mas_store_gfp(&vmi->mas, vma, gfp);
	if (unlikely(mas_is_err(&vmi->mas)))
		return -ENOMEM;

	return 0;
}


/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
}

/*
 * Stack area (including shadow stacks)
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 *
 * True when every VM_STACK bit is set, or when VM_SHADOW_STACK is set.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
	return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}


/*
 * Set the iterator's range to [index, last - 1]; @last is therefore an
 * exclusive end.
 */
static inline void vma_iter_config(struct vma_iterator *vmi,
		unsigned long index, unsigned long last)
{
	__mas_set_range(&vmi->mas, index, last - 1);
}

/* Reset the underlying maple state via mas_reset(). */
static inline void vma_iter_reset(struct vma_iterator *vmi)
{
	mas_reset(&vmi->mas);
}

/* Step to the previous range via mas_prev_range(), with @min as the lower limit. */
static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
	return mas_prev_range(&vmi->mas, min);
}

/* Step to the next range via mas_next_range(), with @max as the upper limit. */
static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
	return mas_next_range(&vmi->mas, max);
}

/*
 * Search for an empty area of @size with mas_empty_area(); @max is
 * exclusive (max - 1 is passed down). Returns that call's result.
 */
static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
				       unsigned long max, unsigned long size)
{
	return mas_empty_area(&vmi->mas, min, max - 1, size);
}

/*
 * As vma_iter_area_lowest(), but using mas_empty_area_rev(); @max is
 * exclusive (max - 1 is passed down). Returns that call's result.
 */
static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
					unsigned long max, unsigned long size)
{
	return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */

/* Preallocate with GFP_KERNEL for a later store of @vma; returns mas_preallocate()'s result. */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
		struct vm_area_struct *vma)
{
	return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}

/* Store NULL at the iterator's current range using mas_store_prealloc(). */
static inline void vma_iter_clear(struct vma_iterator *vmi)
{
	mas_store_prealloc(&vmi->mas, NULL);
}

/* Return the result of mas_walk() on the iterator's maple state. */
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
	return mas_walk(&vmi->mas);
}

/* Store a VMA with preallocated memory */
static inline void vma_iter_store(struct vma_iterator *vmi,
				  struct vm_area_struct *vma)
{

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
	/* Debug builds warn when the iterator's range does not contain vm_start. */
	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
			vmi->mas.index > vma->vm_start)) {
		pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
			vmi->mas.index, vma->vm_start, vma->vm_start,
			vma->vm_end, vmi->mas.index, vmi->mas.last);
	}
	if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
			vmi->mas.last < vma->vm_start)) {
		pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
		       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
		       vmi->mas.index, vmi->mas.last);
	}
#endif

	/* Same repositioning rule as vma_iter_store_gfp(). */
	if (vmi->mas.status != ma_start &&
	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
		vma_iter_invalidate(vmi);

	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
	mas_store_prealloc(&vmi->mas, vma);
}

/* Start of the iterator's current range (mas.index). */
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
	return vmi->mas.index;
}

/* Exclusive end of the iterator's current range (mas.last + 1). */
static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
	return vmi->mas.last + 1;
}

/* Pass @count to mas_expected_entries() and return its result. */
static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
				      unsigned long count)
{
	return mas_expected_entries(&vmi->mas, count);
}

/* Step to the previous range via mas_prev_range(), with 0 as the lower limit. */
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
	return mas_prev_range(&vmi->mas, 0);
}

#endif	/* __MM_VMA_H */
+50
mm/vma_internal.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * vma_internal.h 4 + * 5 + * Headers required by vma.c, which can be substituted accordingly when testing 6 + * VMA functionality. 7 + */ 8 + 9 + #ifndef __MM_VMA_INTERNAL_H 10 + #define __MM_VMA_INTERNAL_H 11 + 12 + #include <linux/backing-dev.h> 13 + #include <linux/bitops.h> 14 + #include <linux/bug.h> 15 + #include <linux/bug.h> 16 + #include <linux/cacheflush.h> 17 + #include <linux/err.h> 18 + #include <linux/file.h> 19 + #include <linux/fs.h> 20 + #include <linux/huge_mm.h> 21 + #include <linux/hugetlb_inline.h> 22 + #include <linux/kernel.h> 23 + #include <linux/khugepaged.h> 24 + #include <linux/list.h> 25 + #include <linux/maple_tree.h> 26 + #include <linux/mempolicy.h> 27 + #include <linux/mm.h> 28 + #include <linux/mm_inline.h> 29 + #include <linux/mm_types.h> 30 + #include <linux/mman.h> 31 + #include <linux/mmap_lock.h> 32 + #include <linux/mmdebug.h> 33 + #include <linux/mmu_context.h> 34 + #include <linux/mutex.h> 35 + #include <linux/pagemap.h> 36 + #include <linux/pfn.h> 37 + #include <linux/rcupdate.h> 38 + #include <linux/rmap.h> 39 + #include <linux/rwsem.h> 40 + #include <linux/sched/signal.h> 41 + #include <linux/swap.h> 42 + #include <linux/uprobes.h> 43 + #include <linux/userfaultfd_k.h> 44 + 45 + #include <asm/current.h> 46 + #include <asm/tlb.h> 47 + 48 + #include "internal.h" 49 + 50 + #endif /* __MM_VMA_INTERNAL_H */