mm: abstract initial stack setup to mm subsystem

+5 -61

fs/exec.c

··· 245 245 flush_cache_page(bprm->vma, pos, page_to_pfn(page)); 246 246 } 247 247 248 - static int __bprm_mm_init(struct linux_binprm *bprm) 249 - { 250 - int err; 251 - struct vm_area_struct *vma = NULL; 252 - struct mm_struct *mm = bprm->mm; 253 - 254 - bprm->vma = vma = vm_area_alloc(mm); 255 - if (!vma) 256 - return -ENOMEM; 257 - vma_set_anonymous(vma); 258 - 259 - if (mmap_write_lock_killable(mm)) { 260 - err = -EINTR; 261 - goto err_free; 262 - } 263 - 264 - /* 265 - * Need to be called with mmap write lock 266 - * held, to avoid race with ksmd. 267 - */ 268 - err = ksm_execve(mm); 269 - if (err) 270 - goto err_ksm; 271 - 272 - /* 273 - * Place the stack at the largest stack address the architecture 274 - * supports. Later, we'll move this to an appropriate place. We don't 275 - * use STACK_TOP because that can depend on attributes which aren't 276 - * configured yet. 277 - */ 278 - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 279 - vma->vm_end = STACK_TOP_MAX; 280 - vma->vm_start = vma->vm_end - PAGE_SIZE; 281 - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); 282 - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 283 - 284 - err = insert_vm_struct(mm, vma); 285 - if (err) 286 - goto err; 287 - 288 - mm->stack_vm = mm->total_vm = 1; 289 - mmap_write_unlock(mm); 290 - bprm->p = vma->vm_end - sizeof(void *); 291 - return 0; 292 - err: 293 - ksm_exit(mm); 294 - err_ksm: 295 - mmap_write_unlock(mm); 296 - err_free: 297 - bprm->vma = NULL; 298 - vm_area_free(vma); 299 - return err; 300 - } 301 - 302 248 static bool valid_arg_len(struct linux_binprm *bprm, long len) 303 249 { 304 250 return len <= MAX_ARG_STRLEN; ··· 297 351 { 298 352 } 299 353 300 - static int __bprm_mm_init(struct linux_binprm *bprm) 301 - { 302 - bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); 303 - return 0; 304 - } 305 - 306 354 static bool valid_arg_len(struct linux_binprm *bprm, long len) 307 355 { 308 356 return len <= bprm->p; ··· 325 
385 bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; 326 386 task_unlock(current->group_leader); 327 387 328 - err = __bprm_mm_init(bprm); 388 + #ifndef CONFIG_MMU 389 + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); 390 + #else 391 + err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); 329 392 if (err) 330 393 goto err; 394 + #endif 331 395 332 396 return 0; 333 397

-42

mm/mmap.c

··· 1321 1321 vm_unacct_memory(nr_accounted); 1322 1322 } 1323 1323 1324 - /* Insert vm structure into process list sorted by address 1325 - * and into the inode's i_mmap tree. If vm_file is non-NULL 1326 - * then i_mmap_rwsem is taken here. 1327 - */ 1328 - int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 1329 - { 1330 - unsigned long charged = vma_pages(vma); 1331 - 1332 - 1333 - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) 1334 - return -ENOMEM; 1335 - 1336 - if ((vma->vm_flags & VM_ACCOUNT) && 1337 - security_vm_enough_memory_mm(mm, charged)) 1338 - return -ENOMEM; 1339 - 1340 - /* 1341 - * The vm_pgoff of a purely anonymous vma should be irrelevant 1342 - * until its first write fault, when page's anon_vma and index 1343 - * are set. But now set the vm_pgoff it will almost certainly 1344 - * end up with (unless mremap moves it elsewhere before that 1345 - * first wfault), so /proc/pid/maps tells a consistent story. 1346 - * 1347 - * By setting it to reflect the virtual start address of the 1348 - * vma, merges and splits can happen in a seamless way, just 1349 - * using the existing file pgoff checks and manipulations. 1350 - * Similarly in do_mmap and in do_brk_flags. 1351 - */ 1352 - if (vma_is_anonymous(vma)) { 1353 - BUG_ON(vma->anon_vma); 1354 - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 1355 - } 1356 - 1357 - if (vma_link(mm, vma)) { 1358 - if (vma->vm_flags & VM_ACCOUNT) 1359 - vm_unacct_memory(charged); 1360 - return -ENOMEM; 1361 - } 1362 - 1363 - return 0; 1364 - } 1365 - 1366 1324 /* 1367 1325 * Return true if the calling process may expand its vm space by the passed 1368 1326 * number of pages

+43

mm/vma.c

··· 3052 3052 userfaultfd_unmap_complete(mm, &uf); 3053 3053 return ret; 3054 3054 } 3055 + 3056 + 3057 + /* Insert vm structure into process list sorted by address 3058 + * and into the inode's i_mmap tree. If vm_file is non-NULL 3059 + * then i_mmap_rwsem is taken here. 3060 + */ 3061 + int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 3062 + { 3063 + unsigned long charged = vma_pages(vma); 3064 + 3065 + 3066 + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) 3067 + return -ENOMEM; 3068 + 3069 + if ((vma->vm_flags & VM_ACCOUNT) && 3070 + security_vm_enough_memory_mm(mm, charged)) 3071 + return -ENOMEM; 3072 + 3073 + /* 3074 + * The vm_pgoff of a purely anonymous vma should be irrelevant 3075 + * until its first write fault, when page's anon_vma and index 3076 + * are set. But now set the vm_pgoff it will almost certainly 3077 + * end up with (unless mremap moves it elsewhere before that 3078 + * first wfault), so /proc/pid/maps tells a consistent story. 3079 + * 3080 + * By setting it to reflect the virtual start address of the 3081 + * vma, merges and splits can happen in a seamless way, just 3082 + * using the existing file pgoff checks and manipulations. 3083 + * Similarly in do_mmap and in do_brk_flags. 3084 + */ 3085 + if (vma_is_anonymous(vma)) { 3086 + BUG_ON(vma->anon_vma); 3087 + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 3088 + } 3089 + 3090 + if (vma_link(mm, vma)) { 3091 + if (vma->vm_flags & VM_ACCOUNT) 3092 + vm_unacct_memory(charged); 3093 + return -ENOMEM; 3094 + } 3095 + 3096 + return 0; 3097 + }

+4

mm/vma.h

int __vm_munmap(unsigned long start, size_t len, bool unlock);

/* Link @vma into @mm; returns 0 on success or -ENOMEM on failure. */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);

/* vma_exec.c */
#ifdef CONFIG_MMU
/*
 * Create the temporary initial stack VMA of an execve'd process; on success
 * *vmap is the new VMA and *top_mem_p the highest word-sized slot in it.
 */
int create_init_stack_vma(struct mm_struct *mm,
			  struct vm_area_struct **vmap,
			  unsigned long *top_mem_p);
/* Used later to move the initial stack VMA downwards by @shift. */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
#endif

+69

mm/vma_exec.c

··· 90 90 /* Shrink the vma to just the new range */ 91 91 return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); 92 92 } 93 + 94 + /* 95 + * Establish the stack VMA in an execve'd process, located temporarily at the 96 + * maximum stack address provided by the architecture. 97 + * 98 + * We later relocate this downwards in relocate_vma_down(). 99 + * 100 + * This function is almost certainly NOT what you want for anything other than 101 + * early executable initialisation. 102 + * 103 + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the 104 + * maximum addressable location in the stack (that is capable of storing a 105 + * system word of data). 106 + */ 107 + int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, 108 + unsigned long *top_mem_p) 109 + { 110 + int err; 111 + struct vm_area_struct *vma = vm_area_alloc(mm); 112 + 113 + if (!vma) 114 + return -ENOMEM; 115 + 116 + vma_set_anonymous(vma); 117 + 118 + if (mmap_write_lock_killable(mm)) { 119 + err = -EINTR; 120 + goto err_free; 121 + } 122 + 123 + /* 124 + * Need to be called with mmap write lock 125 + * held, to avoid race with ksmd. 126 + */ 127 + err = ksm_execve(mm); 128 + if (err) 129 + goto err_ksm; 130 + 131 + /* 132 + * Place the stack at the largest stack address the architecture 133 + * supports. Later, we'll move this to an appropriate place. We don't 134 + * use STACK_TOP because that can depend on attributes which aren't 135 + * configured yet. 
136 + */ 137 + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); 138 + vma->vm_end = STACK_TOP_MAX; 139 + vma->vm_start = vma->vm_end - PAGE_SIZE; 140 + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); 141 + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 142 + 143 + err = insert_vm_struct(mm, vma); 144 + if (err) 145 + goto err; 146 + 147 + mm->stack_vm = mm->total_vm = 1; 148 + mmap_write_unlock(mm); 149 + *vmap = vma; 150 + *top_mem_p = vma->vm_end - sizeof(void *); 151 + return 0; 152 + 153 + err: 154 + ksm_exit(mm); 155 + err_ksm: 156 + mmap_write_unlock(mm); 157 + err_free: 158 + *vmap = NULL; 159 + vm_area_free(vma); 160 + return err; 161 + }

+32

tools/testing/vma/vma_internal.h

#define VM_PFNMAP	0x00000400
#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */
#define VM_DONTEXPAND	0x00040000
#define VM_LOCKONFAULT	0x00080000
#define VM_ACCOUNT	0x00100000

/* ... */

#define VM_ACCESS_FLAGS	(VM_READ | VM_WRITE | VM_EXEC)
#define VM_SPECIAL	(VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/*
 * The stack growth direction decides which grow flag marks a stack VMA.
 * VM_STACK_EARLY is only non-zero when the stack grows upwards.
 */
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK	VM_GROWSUP
#define VM_STACK_EARLY	VM_GROWSDOWN
#else
#define VM_STACK	VM_GROWSDOWN
#define VM_STACK_EARLY	0
#endif

/*
 * Address-space limits for the test harness: every task size and stack top
 * value aliases DEFAULT_MAP_WINDOW.
 */
#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
#define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
#define TASK_SIZE_MAX		DEFAULT_MAP_WINDOW
#define STACK_TOP		TASK_SIZE_LOW
#define STACK_TOP_MAX		TASK_SIZE_MAX

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)

/* ... */

#define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_TSK_EXEC

#define VM_STARTGAP_FLAGS	(VM_GROWSDOWN | VM_SHADOW_STACK)

/* Flags used when the initial stack VMA is set up. */
#define VM_STACK_DEFAULT_FLAGS	VM_DATA_DEFAULT_FLAGS
#define VM_STACK_FLAGS		(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

#define RLIMIT_STACK	3	/* max stack size */
#define RLIMIT_MEMLOCK	8	/* max locked-in-memory address space */

/* ... */

/* Stub: does nothing with @mm and always reports success. */
static inline int ksm_execve(struct mm_struct *mm)
{
	(void)mm;
	return 0;
}

/* Stub: does nothing with @mm. */
static inline void ksm_exit(struct mm_struct *mm)
{
	(void)mm;
}

Configure Feed

Configure Feed