Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: pagewalk: Fix race between unmap and page walker

The mmap lock protects the page walker from changes to the page tables
during the walk. However, a read lock is insufficient to protect those
areas which don't have a VMA, as munmap() detaches the VMAs before
downgrading to a read lock and actually tearing down PTEs/page tables.

For users of walk_page_range() the solution is to simply call pte_hole()
immediately without checking the actual page tables when a VMA is not
present. We now never call __walk_page_range() without a valid vma.

For walk_page_range_novma() the locking requirements are tightened to
require the mmap write lock to be taken, and the pgd is then walked
directly with 'no_vma' set.

This in turn means that all page walkers either have a valid vma, or
are in the special 'novma' case used for page table debugging. As a
result, all the odd '(!walk->vma && !walk->no_vma)' tests can be removed.

Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Steven Price and committed by Linus Torvalds
8782fb61 d895ec79

+16 -13
+2 -2
arch/riscv/mm/pageattr.c
··· 118 118 if (!numpages) 119 119 return 0; 120 120 121 - mmap_read_lock(&init_mm); 121 + mmap_write_lock(&init_mm); 122 122 ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, 123 123 &masks); 124 - mmap_read_unlock(&init_mm); 124 + mmap_write_unlock(&init_mm); 125 125 126 126 flush_tlb_kernel_range(start, end); 127 127
+12 -9
mm/pagewalk.c
··· 110 110 do { 111 111 again: 112 112 next = pmd_addr_end(addr, end); 113 - if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) { 113 + if (pmd_none(*pmd)) { 114 114 if (ops->pte_hole) 115 115 err = ops->pte_hole(addr, next, depth, walk); 116 116 if (err) ··· 171 171 do { 172 172 again: 173 173 next = pud_addr_end(addr, end); 174 - if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) { 174 + if (pud_none(*pud)) { 175 175 if (ops->pte_hole) 176 176 err = ops->pte_hole(addr, next, depth, walk); 177 177 if (err) ··· 366 366 struct vm_area_struct *vma = walk->vma; 367 367 const struct mm_walk_ops *ops = walk->ops; 368 368 369 - if (vma && ops->pre_vma) { 369 + if (ops->pre_vma) { 370 370 err = ops->pre_vma(start, end, walk); 371 371 if (err) 372 372 return err; 373 373 } 374 374 375 - if (vma && is_vm_hugetlb_page(vma)) { 375 + if (is_vm_hugetlb_page(vma)) { 376 376 if (ops->hugetlb_entry) 377 377 err = walk_hugetlb_range(start, end, walk); 378 378 } else 379 379 err = walk_pgd_range(start, end, walk); 380 380 381 - if (vma && ops->post_vma) 381 + if (ops->post_vma) 382 382 ops->post_vma(walk); 383 383 384 384 return err; ··· 450 450 if (!vma) { /* after the last vma */ 451 451 walk.vma = NULL; 452 452 next = end; 453 + if (ops->pte_hole) 454 + err = ops->pte_hole(start, next, -1, &walk); 453 455 } else if (start < vma->vm_start) { /* outside vma */ 454 456 walk.vma = NULL; 455 457 next = min(end, vma->vm_start); 458 + if (ops->pte_hole) 459 + err = ops->pte_hole(start, next, -1, &walk); 456 460 } else { /* inside vma */ 457 461 walk.vma = vma; 458 462 next = min(end, vma->vm_end); ··· 474 470 } 475 471 if (err < 0) 476 472 break; 477 - } 478 - if (walk.vma || walk.ops->pte_hole) 479 473 err = __walk_page_range(start, next, &walk); 474 + } 480 475 if (err) 481 476 break; 482 477 } while (start = next, start < end); ··· 504 501 if (start >= end || !walk.mm) 505 502 return -EINVAL; 506 503 507 - mmap_assert_locked(walk.mm); 504 + mmap_assert_write_locked(walk.mm); 508 
505 509 - return __walk_page_range(start, end, &walk); 506 + return walk_pgd_range(start, end, &walk); 510 507 } 511 508 512 509 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+2 -2
mm/ptdump.c
··· 152 152 { 153 153 const struct ptdump_range *range = st->range; 154 154 155 - mmap_read_lock(mm); 155 + mmap_write_lock(mm); 156 156 while (range->start != range->end) { 157 157 walk_page_range_novma(mm, range->start, range->end, 158 158 &ptdump_ops, pgd, st); 159 159 range++; 160 160 } 161 - mmap_read_unlock(mm); 161 + mmap_write_unlock(mm); 162 162 163 163 /* Flush out the last page */ 164 164 st->note_page(st, 0, -1, 0);