Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: refactor __mmap_region()

We have seen bugs and resource leaks arise from the complexity of the
__mmap_region() function. This, together with the generally deeply fragile
error handling logic and the complexity which makes the function difficult
to understand, makes it highly desirable to refactor it into something readable.

Achieve this by separating the function into smaller logical parts which
are easier to understand and follow, and which, importantly, very
significantly simplify the error handling.

Note that we now call vms_abort_munmap_vmas() in more error paths than we
used to; however, in cases where no abort need occur, vms->nr_pages will be
equal to zero and we simply exit this function without doing more than we
would have done previously.

Importantly, the invocation of the driver mmap hook via mmap_file() now
has very simple and obvious handling (this was previously the most
problematic part of the mmap() operation).

Use a generalised stack-based 'mmap state' to thread through values and
also retrieve state as needed.

Also avoid ever relying on vma merge (vmg) state after a merge is
attempted; instead, maintain meaningful state in the mmap state and
establish vmg state as and when required.

This avoids any subtle bugs arising from merge logic mutating this state
and mmap_region() logic later relying upon it.

Link: https://lkml.kernel.org/r/25bd2edc3275450f448cbfe0756ce2a7cd06810f.1729858176.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
0d11630c 52956b0d

+269 -139
+269 -139
mm/vma.c
··· 7 7 #include "vma_internal.h" 8 8 #include "vma.h" 9 9 10 + struct mmap_state { 11 + struct mm_struct *mm; 12 + struct vma_iterator *vmi; 13 + 14 + unsigned long addr; 15 + unsigned long end; 16 + pgoff_t pgoff; 17 + unsigned long pglen; 18 + unsigned long flags; 19 + struct file *file; 20 + 21 + unsigned long charged; 22 + 23 + struct vm_area_struct *prev; 24 + struct vm_area_struct *next; 25 + 26 + /* Unmapping state. */ 27 + struct vma_munmap_struct vms; 28 + struct ma_state mas_detach; 29 + struct maple_tree mt_detach; 30 + }; 31 + 32 + #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ 33 + struct mmap_state name = { \ 34 + .mm = mm_, \ 35 + .vmi = vmi_, \ 36 + .addr = addr_, \ 37 + .end = (addr_) + len, \ 38 + .pgoff = pgoff_, \ 39 + .pglen = PHYS_PFN(len_), \ 40 + .flags = flags_, \ 41 + .file = file_, \ 42 + } 43 + 44 + #define VMG_MMAP_STATE(name, map_, vma_) \ 45 + struct vma_merge_struct name = { \ 46 + .mm = (map_)->mm, \ 47 + .vmi = (map_)->vmi, \ 48 + .start = (map_)->addr, \ 49 + .end = (map_)->end, \ 50 + .flags = (map_)->flags, \ 51 + .pgoff = (map_)->pgoff, \ 52 + .file = (map_)->file, \ 53 + .prev = (map_)->prev, \ 54 + .vma = vma_, \ 55 + .next = (vma_) ? NULL : (map_)->next, \ 56 + .state = VMA_MERGE_START, \ 57 + .merge_flags = VMG_FLAG_DEFAULT, \ 58 + } 59 + 10 60 static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) 11 61 { 12 62 struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; ··· 2219 2169 vms_complete_munmap_vmas(vms, mas_detach); 2220 2170 } 2221 2171 2222 - unsigned long __mmap_region(struct file *file, unsigned long addr, 2223 - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2224 - struct list_head *uf) 2172 + /* 2173 + * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be 2174 + * unmapped once the map operation is completed, check limits, account mapping 2175 + * and clean up any pre-existing VMAs. 
2176 + * 2177 + * @map: Mapping state. 2178 + * @uf: Userfaultfd context list. 2179 + * 2180 + * Returns: 0 on success, error code otherwise. 2181 + */ 2182 + static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) 2225 2183 { 2226 - struct mm_struct *mm = current->mm; 2227 - struct vm_area_struct *vma = NULL; 2228 - pgoff_t pglen = PHYS_PFN(len); 2229 - unsigned long charged = 0; 2230 - struct vma_munmap_struct vms; 2231 - struct ma_state mas_detach; 2232 - struct maple_tree mt_detach; 2233 - unsigned long end = addr + len; 2234 2184 int error; 2235 - VMA_ITERATOR(vmi, mm, addr); 2236 - VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); 2185 + struct vma_iterator *vmi = map->vmi; 2186 + struct vma_munmap_struct *vms = &map->vms; 2237 2187 2238 - vmg.file = file; 2239 - /* Find the first overlapping VMA */ 2240 - vma = vma_find(&vmi, end); 2241 - init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); 2242 - if (vma) { 2243 - mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 2244 - mt_on_stack(mt_detach); 2245 - mas_init(&mas_detach, &mt_detach, /* addr = */ 0); 2188 + /* Find the first overlapping VMA and initialise unmap state. */ 2189 + vms->vma = vma_find(vmi, map->end); 2190 + init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, 2191 + /* unlock = */ false); 2192 + 2193 + /* OK, we have overlapping VMAs - prepare to unmap them. */ 2194 + if (vms->vma) { 2195 + mt_init_flags(&map->mt_detach, 2196 + vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); 2197 + mt_on_stack(map->mt_detach); 2198 + mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); 2246 2199 /* Prepare to unmap any existing mapping in the area */ 2247 - error = vms_gather_munmap_vmas(&vms, &mas_detach); 2248 - if (error) 2249 - goto gather_failed; 2200 + error = vms_gather_munmap_vmas(vms, &map->mas_detach); 2201 + if (error) { 2202 + /* On error VMAs will already have been reattached. 
*/ 2203 + vms->nr_pages = 0; 2204 + return error; 2205 + } 2250 2206 2251 - vmg.next = vms.next; 2252 - vmg.prev = vms.prev; 2253 - vma = NULL; 2207 + map->next = vms->next; 2208 + map->prev = vms->prev; 2254 2209 } else { 2255 - vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); 2210 + map->next = vma_iter_next_rewind(vmi, &map->prev); 2256 2211 } 2257 2212 2258 2213 /* Check against address space limit. */ 2259 - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { 2260 - error = -ENOMEM; 2261 - goto abort_munmap; 2262 - } 2214 + if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) 2215 + return -ENOMEM; 2263 2216 2264 - /* 2265 - * Private writable mapping: check memory availability 2266 - */ 2267 - if (accountable_mapping(file, vm_flags)) { 2268 - charged = pglen; 2269 - charged -= vms.nr_accounted; 2270 - if (charged) { 2271 - error = security_vm_enough_memory_mm(mm, charged); 2217 + /* Private writable mapping: check memory availability. */ 2218 + if (accountable_mapping(map->file, map->flags)) { 2219 + map->charged = map->pglen; 2220 + map->charged -= vms->nr_accounted; 2221 + if (map->charged) { 2222 + error = security_vm_enough_memory_mm(map->mm, map->charged); 2272 2223 if (error) 2273 - goto abort_munmap; 2224 + return error; 2274 2225 } 2275 2226 2276 - vms.nr_accounted = 0; 2277 - vm_flags |= VM_ACCOUNT; 2278 - vmg.flags = vm_flags; 2227 + vms->nr_accounted = 0; 2228 + map->flags |= VM_ACCOUNT; 2279 2229 } 2280 2230 2281 2231 /* 2282 - * clear PTEs while the vma is still in the tree so that rmap 2232 + * Clear PTEs while the vma is still in the tree so that rmap 2283 2233 * cannot race with the freeing later in the truncate scenario. 2284 2234 * This is also needed for mmap_file(), which is why vm_ops 2285 2235 * close function is called. 
2286 2236 */ 2287 - vms_clean_up_area(&vms, &mas_detach); 2288 - vma = vma_merge_new_range(&vmg); 2289 - if (vma) 2290 - goto expanded; 2237 + vms_clean_up_area(vms, &map->mas_detach); 2238 + 2239 + return 0; 2240 + } 2241 + 2242 + static int __mmap_new_file_vma(struct mmap_state *map, 2243 + struct vm_area_struct **vmap, bool *mergedp) 2244 + { 2245 + struct vma_iterator *vmi = map->vmi; 2246 + struct vm_area_struct *vma = *vmap; 2247 + int error; 2248 + 2249 + vma->vm_file = get_file(map->file); 2250 + error = mmap_file(vma->vm_file, vma); 2251 + if (error) { 2252 + fput(vma->vm_file); 2253 + vma->vm_file = NULL; 2254 + 2255 + vma_iter_set(vmi, vma->vm_end); 2256 + /* Undo any partial mapping done by a device driver. */ 2257 + unmap_region(&vmi->mas, vma, map->prev, map->next); 2258 + 2259 + return error; 2260 + } 2261 + 2262 + /* Drivers cannot alter the address of the VMA. */ 2263 + WARN_ON_ONCE(map->addr != vma->vm_start); 2264 + /* 2265 + * Drivers should not permit writability when previously it was 2266 + * disallowed. 2267 + */ 2268 + VM_WARN_ON_ONCE(map->flags != vma->vm_flags && 2269 + !(map->flags & VM_MAYWRITE) && 2270 + (vma->vm_flags & VM_MAYWRITE)); 2271 + 2272 + /* mmap_file() might have changed VMA flags. */ 2273 + map->flags = vma->vm_flags; 2274 + 2275 + vma_iter_config(vmi, map->addr, map->end); 2276 + /* 2277 + * If flags changed after mmap_file(), we should try merge 2278 + * vma again as we may succeed this time. 2279 + */ 2280 + if (unlikely(map->flags != vma->vm_flags && map->prev)) { 2281 + struct vm_area_struct *merge; 2282 + VMG_MMAP_STATE(vmg, map, /* vma = */ NULL); 2283 + 2284 + merge = vma_merge_new_range(&vmg); 2285 + if (merge) { 2286 + /* 2287 + * ->mmap() can change vma->vm_file and fput 2288 + * the original file. So fput the vma->vm_file 2289 + * here or we would add an extra fput for file 2290 + * and cause general protection fault 2291 + * ultimately. 
2292 + */ 2293 + fput(vma->vm_file); 2294 + vm_area_free(vma); 2295 + vma = merge; 2296 + *mergedp = true; 2297 + } else { 2298 + vma_iter_config(vmi, map->addr, map->end); 2299 + } 2300 + } 2301 + 2302 + *vmap = vma; 2303 + return 0; 2304 + } 2305 + 2306 + /* 2307 + * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not 2308 + * possible. 2309 + * 2310 + * An exception to this is if the mapping is file-backed, and the underlying 2311 + * driver changes the VMA flags, permitting a subsequent merge of the VMA, in 2312 + * which case the returned VMA is one that was merged on a second attempt. 2313 + * 2314 + * @map: Mapping state. 2315 + * @vmap: Output pointer for the new VMA. 2316 + * 2317 + * Returns: Zero on success, or an error. 2318 + */ 2319 + static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) 2320 + { 2321 + struct vma_iterator *vmi = map->vmi; 2322 + int error = 0; 2323 + bool merged = false; 2324 + struct vm_area_struct *vma; 2325 + 2291 2326 /* 2292 2327 * Determine the object being mapped and call the appropriate 2293 2328 * specific mapper. the address has already been validated, but 2294 2329 * not unmapped, but the maps are removed from the list. 
2295 2330 */ 2296 - vma = vm_area_alloc(mm); 2297 - if (!vma) { 2298 - error = -ENOMEM; 2299 - goto unacct_error; 2300 - } 2331 + vma = vm_area_alloc(map->mm); 2332 + if (!vma) 2333 + return -ENOMEM; 2301 2334 2302 - vma_iter_config(&vmi, addr, end); 2303 - vma_set_range(vma, addr, end, pgoff); 2304 - vm_flags_init(vma, vm_flags); 2305 - vma->vm_page_prot = vm_get_page_prot(vm_flags); 2335 + vma_iter_config(vmi, map->addr, map->end); 2336 + vma_set_range(vma, map->addr, map->end, map->pgoff); 2337 + vm_flags_init(vma, map->flags); 2338 + vma->vm_page_prot = vm_get_page_prot(map->flags); 2306 2339 2307 - if (vma_iter_prealloc(&vmi, vma)) { 2340 + if (vma_iter_prealloc(vmi, vma)) { 2308 2341 error = -ENOMEM; 2309 2342 goto free_vma; 2310 2343 } 2311 2344 2312 - if (file) { 2313 - vma->vm_file = get_file(file); 2314 - error = mmap_file(file, vma); 2315 - if (error) 2316 - goto unmap_and_free_file_vma; 2317 - 2318 - /* Drivers cannot alter the address of the VMA. */ 2319 - WARN_ON_ONCE(addr != vma->vm_start); 2320 - /* 2321 - * Drivers should not permit writability when previously it was 2322 - * disallowed. 2323 - */ 2324 - VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && 2325 - !(vm_flags & VM_MAYWRITE) && 2326 - (vma->vm_flags & VM_MAYWRITE)); 2327 - 2328 - vma_iter_config(&vmi, addr, end); 2329 - /* 2330 - * If vm_flags changed after mmap_file(), we should try merge 2331 - * vma again as we may succeed this time. 2332 - */ 2333 - if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { 2334 - struct vm_area_struct *merge; 2335 - 2336 - vmg.flags = vma->vm_flags; 2337 - /* If this fails, state is reset ready for a reattempt. */ 2338 - merge = vma_merge_new_range(&vmg); 2339 - 2340 - if (merge) { 2341 - /* 2342 - * ->mmap() can change vma->vm_file and fput 2343 - * the original file. So fput the vma->vm_file 2344 - * here or we would add an extra fput for file 2345 - * and cause general protection fault 2346 - * ultimately. 
2347 - */ 2348 - fput(vma->vm_file); 2349 - vm_area_free(vma); 2350 - vma = merge; 2351 - /* Update vm_flags to pick up the change. */ 2352 - vm_flags = vma->vm_flags; 2353 - goto file_expanded; 2354 - } 2355 - vma_iter_config(&vmi, addr, end); 2356 - } 2357 - 2358 - vm_flags = vma->vm_flags; 2359 - } else if (vm_flags & VM_SHARED) { 2345 + if (map->file) 2346 + error = __mmap_new_file_vma(map, &vma, &merged); 2347 + else if (map->flags & VM_SHARED) 2360 2348 error = shmem_zero_setup(vma); 2361 - if (error) 2362 - goto free_iter_vma; 2363 - } else { 2349 + else 2364 2350 vma_set_anonymous(vma); 2365 - } 2351 + 2352 + if (error) 2353 + goto free_iter_vma; 2354 + 2355 + if (merged) 2356 + goto file_expanded; 2366 2357 2367 2358 #ifdef CONFIG_SPARC64 2368 2359 /* TODO: Fix SPARC ADI! */ 2369 - WARN_ON_ONCE(!arch_validate_flags(vm_flags)); 2360 + WARN_ON_ONCE(!arch_validate_flags(map->flags)); 2370 2361 #endif 2371 2362 2372 2363 /* Lock the VMA since it is modified after insertion into VMA tree */ 2373 2364 vma_start_write(vma); 2374 - vma_iter_store(&vmi, vma); 2375 - mm->map_count++; 2365 + vma_iter_store(vmi, vma); 2366 + map->mm->map_count++; 2376 2367 vma_link_file(vma); 2377 2368 2378 2369 /* 2379 2370 * vma_merge_new_range() calls khugepaged_enter_vma() too, the below 2380 2371 * call covers the non-merge case. 2381 2372 */ 2382 - khugepaged_enter_vma(vma, vma->vm_flags); 2373 + khugepaged_enter_vma(vma, map->flags); 2383 2374 2384 2375 file_expanded: 2385 - file = vma->vm_file; 2386 2376 ksm_add_vma(vma); 2387 - expanded: 2377 + *vmap = vma; 2378 + return 0; 2379 + 2380 + free_iter_vma: 2381 + vma_iter_free(vmi); 2382 + free_vma: 2383 + vm_area_free(vma); 2384 + return error; 2385 + } 2386 + 2387 + /* 2388 + * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping 2389 + * statistics, handle locking and finalise the VMA. 2390 + * 2391 + * @map: Mapping state. 2392 + * @vma: Merged or newly allocated VMA for the mmap()'d region. 
2393 + */ 2394 + static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) 2395 + { 2396 + struct mm_struct *mm = map->mm; 2397 + unsigned long vm_flags = vma->vm_flags; 2398 + 2388 2399 perf_event_mmap(vma); 2389 2400 2390 - /* Unmap any existing mapping in the area */ 2391 - vms_complete_munmap_vmas(&vms, &mas_detach); 2401 + /* Unmap any existing mapping in the area. */ 2402 + vms_complete_munmap_vmas(&map->vms, &map->mas_detach); 2392 2403 2393 - vm_stat_account(mm, vm_flags, pglen); 2404 + vm_stat_account(mm, vma->vm_flags, map->pglen); 2394 2405 if (vm_flags & VM_LOCKED) { 2395 2406 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || 2396 2407 is_vm_hugetlb_page(vma) || 2397 - vma == get_gate_vma(current->mm)) 2408 + vma == get_gate_vma(mm)) 2398 2409 vm_flags_clear(vma, VM_LOCKED_MASK); 2399 2410 else 2400 - mm->locked_vm += pglen; 2411 + mm->locked_vm += map->pglen; 2401 2412 } 2402 2413 2403 - if (file) 2414 + if (vma->vm_file) 2404 2415 uprobe_mmap(vma); 2405 2416 2406 2417 /* ··· 2474 2363 vm_flags_set(vma, VM_SOFTDIRTY); 2475 2364 2476 2365 vma_set_page_prot(vma); 2366 + } 2367 + 2368 + unsigned long __mmap_region(struct file *file, unsigned long addr, 2369 + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2370 + struct list_head *uf) 2371 + { 2372 + struct mm_struct *mm = current->mm; 2373 + struct vm_area_struct *vma = NULL; 2374 + int error; 2375 + VMA_ITERATOR(vmi, mm, addr); 2376 + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); 2377 + 2378 + error = __mmap_prepare(&map, uf); 2379 + if (error) 2380 + goto abort_munmap; 2381 + 2382 + /* Attempt to merge with adjacent VMAs... */ 2383 + if (map.prev || map.next) { 2384 + VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); 2385 + 2386 + vma = vma_merge_new_range(&vmg); 2387 + } 2388 + 2389 + /* ...but if we can't, allocate a new VMA. 
*/ 2390 + if (!vma) { 2391 + error = __mmap_new_vma(&map, &vma); 2392 + if (error) 2393 + goto unacct_error; 2394 + } 2395 + 2396 + __mmap_complete(&map, vma); 2477 2397 2478 2398 return addr; 2479 2399 2480 - unmap_and_free_file_vma: 2481 - fput(vma->vm_file); 2482 - vma->vm_file = NULL; 2483 - 2484 - vma_iter_set(&vmi, vma->vm_end); 2485 - /* Undo any partial mapping done by a device driver. */ 2486 - unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); 2487 - free_iter_vma: 2488 - vma_iter_free(&vmi); 2489 - free_vma: 2490 - vm_area_free(vma); 2400 + /* Accounting was done by __mmap_prepare(). */ 2491 2401 unacct_error: 2492 - if (charged) 2493 - vm_unacct_memory(charged); 2494 - 2402 + if (map.charged) 2403 + vm_unacct_memory(map.charged); 2495 2404 abort_munmap: 2496 - vms_abort_munmap_vmas(&vms, &mas_detach); 2497 - gather_failed: 2405 + vms_abort_munmap_vmas(&map.vms, &map.mas_detach); 2498 2406 return error; 2499 2407 }