Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: abstract initial stack setup to mm subsystem

There are peculiarities within the kernel where what is very clearly mm
code is performed elsewhere arbitrarily.

This violates separation of concerns and makes it harder to refactor code
in order to change how fundamental initialisation and operation of mm
logic are performed.

One such case is the creation of the VMA containing the initial stack upon
execve()'ing a new process. This is currently performed in
__bprm_mm_init() in fs/exec.c.

Abstract this operation to create_init_stack_vma(). This allows us to
limit use of vma allocation and free code to fork and mm only.

We previously did the same for the step at which we relocate the initial
stack VMA downwards via relocate_vma_down(); now we move the initial VMA
establishment too.

Take the opportunity to also move insert_vm_struct() to mm/vma.c as it's
no longer needed anywhere outside of mm.

Link: https://lkml.kernel.org/r/118c950ef7a8dd19ab20a23a68c3603751acd30e.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
dd7a6246 6c36ac1e

+153 -103
+5 -61
fs/exec.c
··· 245 245 flush_cache_page(bprm->vma, pos, page_to_pfn(page)); 246 246 } 247 247 248 - static int __bprm_mm_init(struct linux_binprm *bprm) 249 - { 250 - int err; 251 - struct vm_area_struct *vma = NULL; 252 - struct mm_struct *mm = bprm->mm; 253 - 254 - bprm->vma = vma = vm_area_alloc(mm); 255 - if (!vma) 256 - return -ENOMEM; 257 - vma_set_anonymous(vma); 258 - 259 - if (mmap_write_lock_killable(mm)) { 260 - err = -EINTR; 261 - goto err_free; 262 - } 263 - 264 - /* 265 - * Need to be called with mmap write lock 266 - * held, to avoid race with ksmd. 267 - */ 268 - err = ksm_execve(mm); 269 - if (err) 270 - goto err_ksm; 271 - 272 - /* 273 - * Place the stack at the largest stack address the architecture 274 - * supports. Later, we'll move this to an appropriate place. We don't 275 - * use STACK_TOP because that can depend on attributes which aren't 276 - * configured yet. 277 - */ 278 - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 279 - vma->vm_end = STACK_TOP_MAX; 280 - vma->vm_start = vma->vm_end - PAGE_SIZE; 281 - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); 282 - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 283 - 284 - err = insert_vm_struct(mm, vma); 285 - if (err) 286 - goto err; 287 - 288 - mm->stack_vm = mm->total_vm = 1; 289 - mmap_write_unlock(mm); 290 - bprm->p = vma->vm_end - sizeof(void *); 291 - return 0; 292 - err: 293 - ksm_exit(mm); 294 - err_ksm: 295 - mmap_write_unlock(mm); 296 - err_free: 297 - bprm->vma = NULL; 298 - vm_area_free(vma); 299 - return err; 300 - } 301 - 302 248 static bool valid_arg_len(struct linux_binprm *bprm, long len) 303 249 { 304 250 return len <= MAX_ARG_STRLEN; ··· 297 351 { 298 352 } 299 353 300 - static int __bprm_mm_init(struct linux_binprm *bprm) 301 - { 302 - bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); 303 - return 0; 304 - } 305 - 306 354 static bool valid_arg_len(struct linux_binprm *bprm, long len) 307 355 { 308 356 return len <= bprm->p; ··· 325 
385 bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; 326 386 task_unlock(current->group_leader); 327 387 328 - err = __bprm_mm_init(bprm); 388 + #ifndef CONFIG_MMU 389 + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); 390 + #else 391 + err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); 329 392 if (err) 330 393 goto err; 394 + #endif 331 395 332 396 return 0; 333 397
-42
mm/mmap.c
··· 1321 1321 vm_unacct_memory(nr_accounted); 1322 1322 } 1323 1323 1324 - /* Insert vm structure into process list sorted by address 1325 - * and into the inode's i_mmap tree. If vm_file is non-NULL 1326 - * then i_mmap_rwsem is taken here. 1327 - */ 1328 - int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 1329 - { 1330 - unsigned long charged = vma_pages(vma); 1331 - 1332 - 1333 - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) 1334 - return -ENOMEM; 1335 - 1336 - if ((vma->vm_flags & VM_ACCOUNT) && 1337 - security_vm_enough_memory_mm(mm, charged)) 1338 - return -ENOMEM; 1339 - 1340 - /* 1341 - * The vm_pgoff of a purely anonymous vma should be irrelevant 1342 - * until its first write fault, when page's anon_vma and index 1343 - * are set. But now set the vm_pgoff it will almost certainly 1344 - * end up with (unless mremap moves it elsewhere before that 1345 - * first wfault), so /proc/pid/maps tells a consistent story. 1346 - * 1347 - * By setting it to reflect the virtual start address of the 1348 - * vma, merges and splits can happen in a seamless way, just 1349 - * using the existing file pgoff checks and manipulations. 1350 - * Similarly in do_mmap and in do_brk_flags. 1351 - */ 1352 - if (vma_is_anonymous(vma)) { 1353 - BUG_ON(vma->anon_vma); 1354 - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 1355 - } 1356 - 1357 - if (vma_link(mm, vma)) { 1358 - if (vma->vm_flags & VM_ACCOUNT) 1359 - vm_unacct_memory(charged); 1360 - return -ENOMEM; 1361 - } 1362 - 1363 - return 0; 1364 - } 1365 - 1366 1324 /* 1367 1325 * Return true if the calling process may expand its vm space by the passed 1368 1326 * number of pages
+43
mm/vma.c
··· 3052 3052 userfaultfd_unmap_complete(mm, &uf); 3053 3053 return ret; 3054 3054 } 3055 + 3056 + 3057 + /* Insert vm structure into process list sorted by address 3058 + * and into the inode's i_mmap tree. If vm_file is non-NULL 3059 + * then i_mmap_rwsem is taken here. 3060 + */ 3061 + int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 3062 + { 3063 + unsigned long charged = vma_pages(vma); 3064 + 3065 + 3066 + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) 3067 + return -ENOMEM; 3068 + 3069 + if ((vma->vm_flags & VM_ACCOUNT) && 3070 + security_vm_enough_memory_mm(mm, charged)) 3071 + return -ENOMEM; 3072 + 3073 + /* 3074 + * The vm_pgoff of a purely anonymous vma should be irrelevant 3075 + * until its first write fault, when page's anon_vma and index 3076 + * are set. But now set the vm_pgoff it will almost certainly 3077 + * end up with (unless mremap moves it elsewhere before that 3078 + * first wfault), so /proc/pid/maps tells a consistent story. 3079 + * 3080 + * By setting it to reflect the virtual start address of the 3081 + * vma, merges and splits can happen in a seamless way, just 3082 + * using the existing file pgoff checks and manipulations. 3083 + * Similarly in do_mmap and in do_brk_flags. 3084 + */ 3085 + if (vma_is_anonymous(vma)) { 3086 + BUG_ON(vma->anon_vma); 3087 + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 3088 + } 3089 + 3090 + if (vma_link(mm, vma)) { 3091 + if (vma->vm_flags & VM_ACCOUNT) 3092 + vm_unacct_memory(charged); 3093 + return -ENOMEM; 3094 + } 3095 + 3096 + return 0; 3097 + }
+4
mm/vma.h
··· 548 548 549 549 int __vm_munmap(unsigned long start, size_t len, bool unlock); 550 550 551 + int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); 552 + 551 553 /* vma_exec.c */ 552 554 #ifdef CONFIG_MMU 555 + int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, 556 + unsigned long *top_mem_p); 553 557 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); 554 558 #endif 555 559
+69
mm/vma_exec.c
··· 90 90 /* Shrink the vma to just the new range */ 91 91 return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); 92 92 } 93 + 94 + /* 95 + * Establish the stack VMA in an execve'd process, located temporarily at the 96 + * maximum stack address provided by the architecture. 97 + * 98 + * We later relocate this downwards in relocate_vma_down(). 99 + * 100 + * This function is almost certainly NOT what you want for anything other than 101 + * early executable initialisation. 102 + * 103 + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the 104 + * maximum addressable location in the stack (that is capable of storing a 105 + * system word of data). 106 + */ 107 + int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, 108 + unsigned long *top_mem_p) 109 + { 110 + int err; 111 + struct vm_area_struct *vma = vm_area_alloc(mm); 112 + 113 + if (!vma) 114 + return -ENOMEM; 115 + 116 + vma_set_anonymous(vma); 117 + 118 + if (mmap_write_lock_killable(mm)) { 119 + err = -EINTR; 120 + goto err_free; 121 + } 122 + 123 + /* 124 + * Need to be called with mmap write lock 125 + * held, to avoid race with ksmd. 126 + */ 127 + err = ksm_execve(mm); 128 + if (err) 129 + goto err_ksm; 130 + 131 + /* 132 + * Place the stack at the largest stack address the architecture 133 + * supports. Later, we'll move this to an appropriate place. We don't 134 + * use STACK_TOP because that can depend on attributes which aren't 135 + * configured yet. 
136 + */ 137 + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 138 + vma->vm_end = STACK_TOP_MAX; 139 + vma->vm_start = vma->vm_end - PAGE_SIZE; 140 + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); 141 + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 142 + 143 + err = insert_vm_struct(mm, vma); 144 + if (err) 145 + goto err; 146 + 147 + mm->stack_vm = mm->total_vm = 1; 148 + mmap_write_unlock(mm); 149 + *vmap = vma; 150 + *top_mem_p = vma->vm_end - sizeof(void *); 151 + return 0; 152 + 153 + err: 154 + ksm_exit(mm); 155 + err_ksm: 156 + mmap_write_unlock(mm); 157 + err_free: 158 + *vmap = NULL; 159 + vm_area_free(vma); 160 + return err; 161 + }
+32
tools/testing/vma/vma_internal.h
··· 56 56 #define VM_PFNMAP 0x00000400 57 57 #define VM_LOCKED 0x00002000 58 58 #define VM_IO 0x00004000 59 + #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ 60 + #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ 59 61 #define VM_DONTEXPAND 0x00040000 60 62 #define VM_LOCKONFAULT 0x00080000 61 63 #define VM_ACCOUNT 0x00100000 ··· 72 70 #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) 73 71 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) 74 72 73 + #ifdef CONFIG_STACK_GROWSUP 74 + #define VM_STACK VM_GROWSUP 75 + #define VM_STACK_EARLY VM_GROWSDOWN 76 + #else 77 + #define VM_STACK VM_GROWSDOWN 78 + #define VM_STACK_EARLY 0 79 + #endif 80 + 81 + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 82 + #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW 83 + #define TASK_SIZE_MAX DEFAULT_MAP_WINDOW 84 + #define STACK_TOP TASK_SIZE_LOW 85 + #define STACK_TOP_MAX TASK_SIZE_MAX 86 + 75 87 /* This mask represents all the VMA flag bits used by mlock */ 76 88 #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) 77 89 ··· 97 81 #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC 98 82 99 83 #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) 84 + 85 + #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS 86 + #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 87 + #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) 100 88 101 89 #define RLIMIT_STACK 3 /* max stack size */ 102 90 #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ ··· 1298 1278 (void)end; 1299 1279 (void)floor; 1300 1280 (void)ceiling; 1281 + } 1282 + 1283 + static inline int ksm_execve(struct mm_struct *mm) 1284 + { 1285 + (void)mm; 1286 + 1287 + return 0; 1288 + } 1289 + 1290 + static inline void ksm_exit(struct mm_struct *mm) 1291 + { 1292 + (void)mm; 1301 1293 } 1302 1294 1303 1295 #endif /* __MM_VMA_INTERNAL_H */