Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: move dup_mmap() to mm

This is a key step in our being able to abstract and isolate VMA
allocation and destruction logic.

This function is the last one where vm_area_free() and vm_area_dup() are
directly referenced outside of mmap, so having this in mm allows us to
isolate these.

We do the same for the nommu version, which is substantially simpler.

We place the declaration for dup_mmap() in mm/internal.h and have
kernel/fork.c include this header directly, in order to prevent improper
use of this functionality elsewhere in the kernel.

While we're here, we remove the useless #ifdef CONFIG_MMU check around
mmap_read_lock_maybe_expand() in mmap.c, since mmap.c is compiled only if
CONFIG_MMU is set.

Link: https://lkml.kernel.org/r/e49aad3d00212f5539d9fa5769bfda4ce451db3e.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Lorenzo Stoakes and committed by Andrew Morton.
26a8f577 dd7a6246

+189 -191
+4 -185
kernel/fork.c
··· 112 112 #include <asm/cacheflush.h> 113 113 #include <asm/tlbflush.h> 114 114 115 + /* For dup_mmap(). */ 116 + #include "../mm/internal.h" 117 + 115 118 #include <trace/events/sched.h> 116 119 117 120 #define CREATE_TRACE_POINTS ··· 592 589 } 593 590 EXPORT_SYMBOL(free_task); 594 591 595 - static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) 592 + void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) 596 593 { 597 594 struct file *exe_file; 598 595 ··· 607 604 } 608 605 609 606 #ifdef CONFIG_MMU 610 - static __latent_entropy int dup_mmap(struct mm_struct *mm, 611 - struct mm_struct *oldmm) 612 - { 613 - struct vm_area_struct *mpnt, *tmp; 614 - int retval; 615 - unsigned long charge = 0; 616 - LIST_HEAD(uf); 617 - VMA_ITERATOR(vmi, mm, 0); 618 - 619 - if (mmap_write_lock_killable(oldmm)) 620 - return -EINTR; 621 - flush_cache_dup_mm(oldmm); 622 - uprobe_dup_mmap(oldmm, mm); 623 - /* 624 - * Not linked in yet - no deadlock potential: 625 - */ 626 - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); 627 - 628 - /* No ordering required: file already has been exposed. */ 629 - dup_mm_exe_file(mm, oldmm); 630 - 631 - mm->total_vm = oldmm->total_vm; 632 - mm->data_vm = oldmm->data_vm; 633 - mm->exec_vm = oldmm->exec_vm; 634 - mm->stack_vm = oldmm->stack_vm; 635 - 636 - /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ 637 - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); 638 - if (unlikely(retval)) 639 - goto out; 640 - 641 - mt_clear_in_rcu(vmi.mas.tree); 642 - for_each_vma(vmi, mpnt) { 643 - struct file *file; 644 - 645 - vma_start_write(mpnt); 646 - if (mpnt->vm_flags & VM_DONTCOPY) { 647 - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, 648 - mpnt->vm_end, GFP_KERNEL); 649 - if (retval) 650 - goto loop_out; 651 - 652 - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); 653 - continue; 654 - } 655 - charge = 0; 656 - /* 657 - * Don't duplicate many vmas if we've been oom-killed (for 658 - * example) 659 - */ 660 - if (fatal_signal_pending(current)) { 661 - retval = -EINTR; 662 - goto loop_out; 663 - } 664 - if (mpnt->vm_flags & VM_ACCOUNT) { 665 - unsigned long len = vma_pages(mpnt); 666 - 667 - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 668 - goto fail_nomem; 669 - charge = len; 670 - } 671 - tmp = vm_area_dup(mpnt); 672 - if (!tmp) 673 - goto fail_nomem; 674 - 675 - /* track_pfn_copy() will later take care of copying internal state. */ 676 - if (unlikely(tmp->vm_flags & VM_PFNMAP)) 677 - untrack_pfn_clear(tmp); 678 - 679 - retval = vma_dup_policy(mpnt, tmp); 680 - if (retval) 681 - goto fail_nomem_policy; 682 - tmp->vm_mm = mm; 683 - retval = dup_userfaultfd(tmp, &uf); 684 - if (retval) 685 - goto fail_nomem_anon_vma_fork; 686 - if (tmp->vm_flags & VM_WIPEONFORK) { 687 - /* 688 - * VM_WIPEONFORK gets a clean slate in the child. 689 - * Don't prepare anon_vma until fault since we don't 690 - * copy page for current vma. 691 - */ 692 - tmp->anon_vma = NULL; 693 - } else if (anon_vma_fork(tmp, mpnt)) 694 - goto fail_nomem_anon_vma_fork; 695 - vm_flags_clear(tmp, VM_LOCKED_MASK); 696 - /* 697 - * Copy/update hugetlb private vma information. 698 - */ 699 - if (is_vm_hugetlb_page(tmp)) 700 - hugetlb_dup_vma_private(tmp); 701 - 702 - /* 703 - * Link the vma into the MT. 
After using __mt_dup(), memory 704 - * allocation is not necessary here, so it cannot fail. 705 - */ 706 - vma_iter_bulk_store(&vmi, tmp); 707 - 708 - mm->map_count++; 709 - 710 - if (tmp->vm_ops && tmp->vm_ops->open) 711 - tmp->vm_ops->open(tmp); 712 - 713 - file = tmp->vm_file; 714 - if (file) { 715 - struct address_space *mapping = file->f_mapping; 716 - 717 - get_file(file); 718 - i_mmap_lock_write(mapping); 719 - if (vma_is_shared_maywrite(tmp)) 720 - mapping_allow_writable(mapping); 721 - flush_dcache_mmap_lock(mapping); 722 - /* insert tmp into the share list, just after mpnt */ 723 - vma_interval_tree_insert_after(tmp, mpnt, 724 - &mapping->i_mmap); 725 - flush_dcache_mmap_unlock(mapping); 726 - i_mmap_unlock_write(mapping); 727 - } 728 - 729 - if (!(tmp->vm_flags & VM_WIPEONFORK)) 730 - retval = copy_page_range(tmp, mpnt); 731 - 732 - if (retval) { 733 - mpnt = vma_next(&vmi); 734 - goto loop_out; 735 - } 736 - } 737 - /* a new mm has just been created */ 738 - retval = arch_dup_mmap(oldmm, mm); 739 - loop_out: 740 - vma_iter_free(&vmi); 741 - if (!retval) { 742 - mt_set_in_rcu(vmi.mas.tree); 743 - ksm_fork(mm, oldmm); 744 - khugepaged_fork(mm, oldmm); 745 - } else { 746 - 747 - /* 748 - * The entire maple tree has already been duplicated. If the 749 - * mmap duplication fails, mark the failure point with 750 - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, 751 - * stop releasing VMAs that have not been duplicated after this 752 - * point. 753 - */ 754 - if (mpnt) { 755 - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); 756 - mas_store(&vmi.mas, XA_ZERO_ENTRY); 757 - /* Avoid OOM iterating a broken tree */ 758 - set_bit(MMF_OOM_SKIP, &mm->flags); 759 - } 760 - /* 761 - * The mm_struct is going to exit, but the locks will be dropped 762 - * first. Set the mm_struct as unstable is advisable as it is 763 - * not fully initialised. 
764 - */ 765 - set_bit(MMF_UNSTABLE, &mm->flags); 766 - } 767 - out: 768 - mmap_write_unlock(mm); 769 - flush_tlb_mm(oldmm); 770 - mmap_write_unlock(oldmm); 771 - if (!retval) 772 - dup_userfaultfd_complete(&uf); 773 - else 774 - dup_userfaultfd_fail(&uf); 775 - return retval; 776 - 777 - fail_nomem_anon_vma_fork: 778 - mpol_put(vma_policy(tmp)); 779 - fail_nomem_policy: 780 - vm_area_free(tmp); 781 - fail_nomem: 782 - retval = -ENOMEM; 783 - vm_unacct_memory(charge); 784 - goto loop_out; 785 - } 786 - 787 607 static inline int mm_alloc_pgd(struct mm_struct *mm) 788 608 { 789 609 mm->pgd = pgd_alloc(mm); ··· 620 794 pgd_free(mm, mm->pgd); 621 795 } 622 796 #else 623 - static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 624 - { 625 - mmap_write_lock(oldmm); 626 - dup_mm_exe_file(mm, oldmm); 627 - mmap_write_unlock(oldmm); 628 - return 0; 629 - } 630 797 #define mm_alloc_pgd(mm) (0) 631 798 #define mm_free_pgd(mm) 632 799 #endif /* CONFIG_MMU */
+2
mm/internal.h
··· 1624 1624 } 1625 1625 #endif /* CONFIG_PT_RECLAIM */ 1626 1626 1627 + void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); 1628 + int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); 1627 1629 1628 1630 #endif /* __MM_INTERNAL_H */
+175 -6
mm/mmap.c
··· 1675 1675 } 1676 1676 subsys_initcall(init_reserve_notifier); 1677 1677 1678 - #ifdef CONFIG_MMU 1679 1678 /* 1680 1679 * Obtain a read lock on mm->mmap_lock, if the specified address is below the 1681 1680 * start of the VMA, the intent is to perform a write, and it is a ··· 1718 1719 mmap_write_downgrade(mm); 1719 1720 return true; 1720 1721 } 1721 - #else 1722 - bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, 1723 - unsigned long addr, bool write) 1722 + 1723 + __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 1724 1724 { 1725 - return false; 1725 + struct vm_area_struct *mpnt, *tmp; 1726 + int retval; 1727 + unsigned long charge = 0; 1728 + LIST_HEAD(uf); 1729 + VMA_ITERATOR(vmi, mm, 0); 1730 + 1731 + if (mmap_write_lock_killable(oldmm)) 1732 + return -EINTR; 1733 + flush_cache_dup_mm(oldmm); 1734 + uprobe_dup_mmap(oldmm, mm); 1735 + /* 1736 + * Not linked in yet - no deadlock potential: 1737 + */ 1738 + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); 1739 + 1740 + /* No ordering required: file already has been exposed. */ 1741 + dup_mm_exe_file(mm, oldmm); 1742 + 1743 + mm->total_vm = oldmm->total_vm; 1744 + mm->data_vm = oldmm->data_vm; 1745 + mm->exec_vm = oldmm->exec_vm; 1746 + mm->stack_vm = oldmm->stack_vm; 1747 + 1748 + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ 1749 + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); 1750 + if (unlikely(retval)) 1751 + goto out; 1752 + 1753 + mt_clear_in_rcu(vmi.mas.tree); 1754 + for_each_vma(vmi, mpnt) { 1755 + struct file *file; 1756 + 1757 + vma_start_write(mpnt); 1758 + if (mpnt->vm_flags & VM_DONTCOPY) { 1759 + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, 1760 + mpnt->vm_end, GFP_KERNEL); 1761 + if (retval) 1762 + goto loop_out; 1763 + 1764 + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); 1765 + continue; 1766 + } 1767 + charge = 0; 1768 + /* 1769 + * Don't duplicate many vmas if we've been oom-killed (for 1770 + * example) 1771 + */ 1772 + if (fatal_signal_pending(current)) { 1773 + retval = -EINTR; 1774 + goto loop_out; 1775 + } 1776 + if (mpnt->vm_flags & VM_ACCOUNT) { 1777 + unsigned long len = vma_pages(mpnt); 1778 + 1779 + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 1780 + goto fail_nomem; 1781 + charge = len; 1782 + } 1783 + 1784 + tmp = vm_area_dup(mpnt); 1785 + if (!tmp) 1786 + goto fail_nomem; 1787 + 1788 + /* track_pfn_copy() will later take care of copying internal state. */ 1789 + if (unlikely(tmp->vm_flags & VM_PFNMAP)) 1790 + untrack_pfn_clear(tmp); 1791 + 1792 + retval = vma_dup_policy(mpnt, tmp); 1793 + if (retval) 1794 + goto fail_nomem_policy; 1795 + tmp->vm_mm = mm; 1796 + retval = dup_userfaultfd(tmp, &uf); 1797 + if (retval) 1798 + goto fail_nomem_anon_vma_fork; 1799 + if (tmp->vm_flags & VM_WIPEONFORK) { 1800 + /* 1801 + * VM_WIPEONFORK gets a clean slate in the child. 1802 + * Don't prepare anon_vma until fault since we don't 1803 + * copy page for current vma. 1804 + */ 1805 + tmp->anon_vma = NULL; 1806 + } else if (anon_vma_fork(tmp, mpnt)) 1807 + goto fail_nomem_anon_vma_fork; 1808 + vm_flags_clear(tmp, VM_LOCKED_MASK); 1809 + /* 1810 + * Copy/update hugetlb private vma information. 1811 + */ 1812 + if (is_vm_hugetlb_page(tmp)) 1813 + hugetlb_dup_vma_private(tmp); 1814 + 1815 + /* 1816 + * Link the vma into the MT. 
After using __mt_dup(), memory 1817 + * allocation is not necessary here, so it cannot fail. 1818 + */ 1819 + vma_iter_bulk_store(&vmi, tmp); 1820 + 1821 + mm->map_count++; 1822 + 1823 + if (tmp->vm_ops && tmp->vm_ops->open) 1824 + tmp->vm_ops->open(tmp); 1825 + 1826 + file = tmp->vm_file; 1827 + if (file) { 1828 + struct address_space *mapping = file->f_mapping; 1829 + 1830 + get_file(file); 1831 + i_mmap_lock_write(mapping); 1832 + if (vma_is_shared_maywrite(tmp)) 1833 + mapping_allow_writable(mapping); 1834 + flush_dcache_mmap_lock(mapping); 1835 + /* insert tmp into the share list, just after mpnt */ 1836 + vma_interval_tree_insert_after(tmp, mpnt, 1837 + &mapping->i_mmap); 1838 + flush_dcache_mmap_unlock(mapping); 1839 + i_mmap_unlock_write(mapping); 1840 + } 1841 + 1842 + if (!(tmp->vm_flags & VM_WIPEONFORK)) 1843 + retval = copy_page_range(tmp, mpnt); 1844 + 1845 + if (retval) { 1846 + mpnt = vma_next(&vmi); 1847 + goto loop_out; 1848 + } 1849 + } 1850 + /* a new mm has just been created */ 1851 + retval = arch_dup_mmap(oldmm, mm); 1852 + loop_out: 1853 + vma_iter_free(&vmi); 1854 + if (!retval) { 1855 + mt_set_in_rcu(vmi.mas.tree); 1856 + ksm_fork(mm, oldmm); 1857 + khugepaged_fork(mm, oldmm); 1858 + } else { 1859 + 1860 + /* 1861 + * The entire maple tree has already been duplicated. If the 1862 + * mmap duplication fails, mark the failure point with 1863 + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, 1864 + * stop releasing VMAs that have not been duplicated after this 1865 + * point. 1866 + */ 1867 + if (mpnt) { 1868 + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); 1869 + mas_store(&vmi.mas, XA_ZERO_ENTRY); 1870 + /* Avoid OOM iterating a broken tree */ 1871 + set_bit(MMF_OOM_SKIP, &mm->flags); 1872 + } 1873 + /* 1874 + * The mm_struct is going to exit, but the locks will be dropped 1875 + * first. Set the mm_struct as unstable is advisable as it is 1876 + * not fully initialised. 
1877 + */ 1878 + set_bit(MMF_UNSTABLE, &mm->flags); 1879 + } 1880 + out: 1881 + mmap_write_unlock(mm); 1882 + flush_tlb_mm(oldmm); 1883 + mmap_write_unlock(oldmm); 1884 + if (!retval) 1885 + dup_userfaultfd_complete(&uf); 1886 + else 1887 + dup_userfaultfd_fail(&uf); 1888 + return retval; 1889 + 1890 + fail_nomem_anon_vma_fork: 1891 + mpol_put(vma_policy(tmp)); 1892 + fail_nomem_policy: 1893 + vm_area_free(tmp); 1894 + fail_nomem: 1895 + retval = -ENOMEM; 1896 + vm_unacct_memory(charge); 1897 + goto loop_out; 1726 1898 } 1727 - #endif
+8
mm/nommu.c
··· 1874 1874 return 0; 1875 1875 } 1876 1876 subsys_initcall(init_admin_reserve); 1877 + 1878 + int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 1879 + { 1880 + mmap_write_lock(oldmm); 1881 + dup_mm_exe_file(mm, oldmm); 1882 + mmap_write_unlock(oldmm); 1883 + return 0; 1884 + }