Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: support batched unmap for lazyfree large folios during reclamation

Currently, the PTEs and rmap of a large folio are removed one at a time.
This is not only slow but also causes the large folio to be unnecessarily
added to deferred_split, which can lead to races between the
deferred_split shrinker callback and memory reclamation. This patch
releases all PTEs and rmap entries in a batch. Currently, it only handles
lazyfree large folios.

The below microbench tries to reclaim 128MB lazyfree large folios
whose sizes are 64KiB:

#include <stdio.h>
#include <sys/mman.h>
#include <string.h>
#include <time.h>

/* Size of the test mapping; parenthesized so it is safe inside any expression. */
#define SIZE (128 * 1024 * 1024) // 128 MB

/*
 * Read the split_deferred counter for 64kB transparent hugepages from sysfs.
 *
 * Returns the parsed counter value, or 0 if the file cannot be opened or
 * does not start with an unsigned decimal number. A diagnostic is printed
 * to stderr in both failure cases.
 */
unsigned long read_split_deferred(void)
{
	unsigned long value = 0;
	FILE *file = fopen("/sys/kernel/mm/transparent_hugepage"
			   "/hugepages-64kB/stats/split_deferred", "r");

	if (!file) {
		perror("Error opening file");
		return 0;
	}

	if (fscanf(file, "%lu", &value) != 1) {
		/*
		 * errno is only meaningful when the stream itself failed; on a
		 * plain conversion failure perror() would print a stale value.
		 */
		if (ferror(file))
			perror("Error reading value");
		else
			fprintf(stderr, "Error reading value: no number found\n");
		value = 0;
	}

	fclose(file);
	return value;
}

/*
 * Microbenchmark entry point.
 *
 * Loops forever. Each iteration maps SIZE bytes of private anonymous memory,
 * writes to all of it, marks it MADV_FREE, and then times an MADV_PAGEOUT
 * over the same range. It prints the elapsed time and the change in the
 * split_deferred counter across the MADV_PAGEOUT call.
 *
 * Returns 1 if mmap() or either madvise() call fails; otherwise it does not
 * return.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	while (1) {
		void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/* Without this check a failed mapping would fault in memset(). */
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/* Write to the whole mapping before advising on it. */
		memset(p, 1, SIZE);

		if (madvise(p, SIZE, MADV_FREE) != 0) {
			perror("madvise(MADV_FREE)");
			munmap(p, SIZE);
			return 1;
		}

		/* clock() reports processor time used, not wall-clock time. */
		clock_t start_time = clock();
		unsigned long start_split = read_split_deferred();

		/* A failed pageout would make the reported timing meaningless. */
		if (madvise(p, SIZE, MADV_PAGEOUT) != 0) {
			perror("madvise(MADV_PAGEOUT)");
			munmap(p, SIZE);
			return 1;
		}

		clock_t end_time = clock();
		unsigned long end_split = read_split_deferred();

		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;

		/* The difference is unsigned long, so print it with %lu. */
		printf("Time taken by reclamation: %f seconds, split_deferred: %lu\n",
		       elapsed_time, end_split - start_split);

		munmap(p, SIZE);
	}
	return 0;
}

w/o patch:
~ # ./a.out
Time taken by reclamation: 0.177418 seconds, split_deferred: 2048
Time taken by reclamation: 0.178348 seconds, split_deferred: 2048
Time taken by reclamation: 0.174525 seconds, split_deferred: 2048
Time taken by reclamation: 0.171620 seconds, split_deferred: 2048
Time taken by reclamation: 0.172241 seconds, split_deferred: 2048
Time taken by reclamation: 0.174003 seconds, split_deferred: 2048
Time taken by reclamation: 0.171058 seconds, split_deferred: 2048
Time taken by reclamation: 0.171993 seconds, split_deferred: 2048
Time taken by reclamation: 0.169829 seconds, split_deferred: 2048
Time taken by reclamation: 0.172895 seconds, split_deferred: 2048
Time taken by reclamation: 0.176063 seconds, split_deferred: 2048
Time taken by reclamation: 0.172568 seconds, split_deferred: 2048
Time taken by reclamation: 0.171185 seconds, split_deferred: 2048
Time taken by reclamation: 0.170632 seconds, split_deferred: 2048
Time taken by reclamation: 0.170208 seconds, split_deferred: 2048
Time taken by reclamation: 0.174192 seconds, split_deferred: 2048
...

w/ patch:
~ # ./a.out
Time taken by reclamation: 0.074231 seconds, split_deferred: 0
Time taken by reclamation: 0.071026 seconds, split_deferred: 0
Time taken by reclamation: 0.072029 seconds, split_deferred: 0
Time taken by reclamation: 0.071873 seconds, split_deferred: 0
Time taken by reclamation: 0.073573 seconds, split_deferred: 0
Time taken by reclamation: 0.071906 seconds, split_deferred: 0
Time taken by reclamation: 0.073604 seconds, split_deferred: 0
Time taken by reclamation: 0.075903 seconds, split_deferred: 0
Time taken by reclamation: 0.073191 seconds, split_deferred: 0
Time taken by reclamation: 0.071228 seconds, split_deferred: 0
Time taken by reclamation: 0.071391 seconds, split_deferred: 0
Time taken by reclamation: 0.071468 seconds, split_deferred: 0
Time taken by reclamation: 0.071896 seconds, split_deferred: 0
Time taken by reclamation: 0.072508 seconds, split_deferred: 0
Time taken by reclamation: 0.071884 seconds, split_deferred: 0
Time taken by reclamation: 0.072433 seconds, split_deferred: 0
Time taken by reclamation: 0.071939 seconds, split_deferred: 0
...

Link: https://lkml.kernel.org/r/20250214093015.51024-4-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mauricio Faria de Oliveira <mfo@canonical.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shaoqin Huang <shahuang@redhat.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yicong Yang <yangyicong@hisilicon.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Barry Song and committed by
Andrew Morton
354dffd2 2f4ab3ac

+50 -22
+50 -22
mm/rmap.c
··· 1722 1722 #endif 1723 1723 } 1724 1724 1725 + /* We support batch unmapping of PTEs for lazyfree large folios */ 1726 + static inline bool can_batch_unmap_folio_ptes(unsigned long addr, 1727 + struct folio *folio, pte_t *ptep) 1728 + { 1729 + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; 1730 + int max_nr = folio_nr_pages(folio); 1731 + pte_t pte = ptep_get(ptep); 1732 + 1733 + if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) 1734 + return false; 1735 + if (pte_unused(pte)) 1736 + return false; 1737 + if (pte_pfn(pte) != folio_pfn(folio)) 1738 + return false; 1739 + 1740 + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, 1741 + NULL, NULL) == max_nr; 1742 + } 1743 + 1725 1744 /* 1726 1745 * @arg: enum ttu_flags will be passed to this argument 1727 1746 */ ··· 1754 1735 struct page *subpage; 1755 1736 struct mmu_notifier_range range; 1756 1737 enum ttu_flags flags = (enum ttu_flags)(long)arg; 1738 + unsigned long nr_pages = 1, end_addr; 1757 1739 unsigned long pfn; 1758 1740 unsigned long hsz = 0; 1759 1741 ··· 1894 1874 if (pte_dirty(pteval)) 1895 1875 folio_mark_dirty(folio); 1896 1876 } else if (likely(pte_present(pteval))) { 1897 - flush_cache_page(vma, address, pfn); 1898 - /* Nuke the page table entry. */ 1899 - if (should_defer_flush(mm, flags)) { 1900 - /* 1901 - * We clear the PTE but do not flush so potentially 1902 - * a remote CPU could still be writing to the folio. 1903 - * If the entry was previously clean then the 1904 - * architecture must guarantee that a clear->dirty 1905 - * transition on a cached TLB entry is written through 1906 - * and traps if the PTE is unmapped. 
1907 - */ 1908 - pteval = ptep_get_and_clear(mm, address, pvmw.pte); 1877 + if (folio_test_large(folio) && !(flags & TTU_HWPOISON) && 1878 + can_batch_unmap_folio_ptes(address, folio, pvmw.pte)) 1879 + nr_pages = folio_nr_pages(folio); 1880 + end_addr = address + nr_pages * PAGE_SIZE; 1881 + flush_cache_range(vma, address, end_addr); 1909 1882 1910 - set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE); 1911 - } else { 1912 - pteval = ptep_clear_flush(vma, address, pvmw.pte); 1913 - } 1883 + /* Nuke the page table entry. */ 1884 + pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); 1885 + /* 1886 + * We clear the PTE but do not flush so potentially 1887 + * a remote CPU could still be writing to the folio. 1888 + * If the entry was previously clean then the 1889 + * architecture must guarantee that a clear->dirty 1890 + * transition on a cached TLB entry is written through 1891 + * and traps if the PTE is unmapped. 1892 + */ 1893 + if (should_defer_flush(mm, flags)) 1894 + set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); 1895 + else 1896 + flush_tlb_range(vma, address, end_addr); 1914 1897 if (pte_dirty(pteval)) 1915 1898 folio_mark_dirty(folio); 1916 1899 } else { ··· 1991 1968 * redirtied either using the page table or a previously 1992 1969 * obtained GUP reference. 1993 1970 */ 1994 - set_pte_at(mm, address, pvmw.pte, pteval); 1971 + set_ptes(mm, address, pvmw.pte, pteval, nr_pages); 1995 1972 folio_set_swapbacked(folio); 1996 1973 goto walk_abort; 1997 1974 } else if (ref_count != 1 + map_count) { ··· 2004 1981 * We'll come back here later and detect if the folio was 2005 1982 * dirtied when the additional reference is gone. 
2006 1983 */ 2007 - set_pte_at(mm, address, pvmw.pte, pteval); 1984 + set_ptes(mm, address, pvmw.pte, pteval, nr_pages); 2008 1985 goto walk_abort; 2009 1986 } 2010 - dec_mm_counter(mm, MM_ANONPAGES); 1987 + add_mm_counter(mm, MM_ANONPAGES, -nr_pages); 2011 1988 goto discard; 2012 1989 } 2013 1990 ··· 2072 2049 dec_mm_counter(mm, mm_counter_file(folio)); 2073 2050 } 2074 2051 discard: 2075 - if (unlikely(folio_test_hugetlb(folio))) 2052 + if (unlikely(folio_test_hugetlb(folio))) { 2076 2053 hugetlb_remove_rmap(folio); 2077 - else 2078 - folio_remove_rmap_pte(folio, subpage, vma); 2054 + } else { 2055 + folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); 2056 + folio_ref_sub(folio, nr_pages - 1); 2057 + } 2079 2058 if (vma->vm_flags & VM_LOCKED) 2080 2059 mlock_drain_local(); 2081 2060 folio_put(folio); 2061 + /* We have already batched the entire folio */ 2062 + if (nr_pages > 1) 2063 + goto walk_done; 2082 2064 continue; 2083 2065 walk_abort: 2084 2066 ret = false;