mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD

Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
folio that is fully and contiguously mapped in the pageout/cold vm range.
This change means that large folios will be maintained all the way to swap
storage. This both improves performance during swap-out, by eliding the
cost of splitting the folio, and sets us up nicely for maintaining the
large folio when it is swapped back in (to be covered in a separate
series).

Folios that are not fully mapped in the target range are still split, but
note that behavior is changed so that if the split fails for any reason
(folio locked, shared, etc.) we now leave it as is and move to the next pte
in the range and continue work on the subsequent folios. Previously any
failure of this sort would cause the entire operation to give up and no
folios mapped at higher addresses were paged out or made cold. Given
large folios are becoming more common, this old behavior would likely have
led to wasted opportunities.

While we are at it, change the code that clears young from the ptes to use
ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
function. This is more efficient than get_and_clear/modify/set, especially
for contpte mappings on arm64, where the old approach would require
unfolding/refolding and the new approach can be done in place.

Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Ryan Roberts and committed by

Andrew Morton 2 years ago 3931b871 5ed890ce

+91 -40

4 changed files

expand all

include

linux

pgtable.h

internal.h

madvise.c

memory.c

+30

include/linux/pgtable.h

··· 361 361 } 362 362 #endif 363 363 364 + #ifndef mkold_ptes 365 + /** 366 + * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old. 367 + * @vma: VMA the pages are mapped into. 368 + * @addr: Address the first page is mapped at. 369 + * @ptep: Page table pointer for the first entry. 370 + * @nr: Number of entries to mark old. 371 + * 372 + * May be overridden by the architecture; otherwise, implemented as a simple 373 + * loop over ptep_test_and_clear_young(). 374 + * 375 + * Note that PTE bits in the PTE range besides the PFN can differ. For example, 376 + * some PTEs might be write-protected. 377 + * 378 + * Context: The caller holds the page table lock. The PTEs map consecutive 379 + * pages that belong to the same folio. The PTEs are all in the same PMD. 380 + */ 381 + static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr, 382 + pte_t *ptep, unsigned int nr) 383 + { 384 + for (;;) { 385 + ptep_test_and_clear_young(vma, addr, ptep); 386 + if (--nr == 0) 387 + break; 388 + ptep++; 389 + addr += PAGE_SIZE; 390 + } 391 + } 392 + #endif 393 + 364 394 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG 365 395 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 366 396 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,

+10 -2

mm/internal.h

··· 130 130 * @flags: Flags to modify the PTE batch semantics. 131 131 * @any_writable: Optional pointer to indicate whether any entry except the 132 132 * first one is writable. 133 + * @any_young: Optional pointer to indicate whether any entry except the 134 + * first one is young. 133 135 * 134 136 * Detect a PTE batch: consecutive (present) PTEs that map consecutive 135 137 * pages of the same large folio. ··· 147 145 */ 148 146 static inline int folio_pte_batch(struct folio *folio, unsigned long addr, 149 147 pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, 150 - bool *any_writable) 148 + bool *any_writable, bool *any_young) 151 149 { 152 150 unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); 153 151 const pte_t *end_ptep = start_ptep + max_nr; 154 152 pte_t expected_pte, *ptep; 155 - bool writable; 153 + bool writable, young; 156 154 int nr; 157 155 158 156 if (any_writable) 159 157 *any_writable = false; 158 + if (any_young) 159 + *any_young = false; 160 160 161 161 VM_WARN_ON_FOLIO(!pte_present(pte), folio); 162 162 VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); ··· 172 168 pte = ptep_get(ptep); 173 169 if (any_writable) 174 170 writable = !!pte_write(pte); 171 + if (any_young) 172 + young = !!pte_young(pte); 175 173 pte = __pte_batch_clear_ignored(pte, flags); 176 174 177 175 if (!pte_same(pte, expected_pte)) ··· 189 183 190 184 if (any_writable) 191 185 *any_writable |= writable; 186 + if (any_young) 187 + *any_young |= young; 192 188 193 189 nr = pte_batch_hint(ptep, pte); 194 190 expected_pte = pte_advance_pfn(expected_pte, nr);

+49 -36

mm/madvise.c

··· 336 336 LIST_HEAD(folio_list); 337 337 bool pageout_anon_only_filter; 338 338 unsigned int batch_count = 0; 339 + int nr; 339 340 340 341 if (fatal_signal_pending(current)) 341 342 return -EINTR; ··· 424 423 return 0; 425 424 flush_tlb_batched_pending(mm); 426 425 arch_enter_lazy_mmu_mode(); 427 - for (; addr < end; pte++, addr += PAGE_SIZE) { 426 + for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { 427 + nr = 1; 428 428 ptent = ptep_get(pte); 429 429 430 430 if (++batch_count == SWAP_CLUSTER_MAX) { ··· 449 447 continue; 450 448 451 449 /* 452 - * Creating a THP page is expensive so split it only if we 453 - * are sure it's worth. Split it if we are only owner. 450 + * If we encounter a large folio, only split it if it is not 451 + * fully mapped within the range we are operating on. Otherwise 452 + * leave it as is so that it can be swapped out whole. If we 453 + * fail to split a folio, leave it in place and advance to the 454 + * next pte in the range. 454 455 */ 455 456 if (folio_test_large(folio)) { 456 - int err; 457 + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | 458 + FPB_IGNORE_SOFT_DIRTY; 459 + int max_nr = (end - addr) / PAGE_SIZE; 460 + bool any_young; 457 461 458 - if (folio_likely_mapped_shared(folio)) 459 - break; 460 - if (pageout_anon_only_filter && !folio_test_anon(folio)) 461 - break; 462 - if (!folio_trylock(folio)) 463 - break; 464 - folio_get(folio); 465 - arch_leave_lazy_mmu_mode(); 466 - pte_unmap_unlock(start_pte, ptl); 467 - start_pte = NULL; 468 - err = split_folio(folio); 469 - folio_unlock(folio); 470 - folio_put(folio); 471 - if (err) 472 - break; 473 - start_pte = pte = 474 - pte_offset_map_lock(mm, pmd, addr, &ptl); 475 - if (!start_pte) 476 - break; 477 - arch_enter_lazy_mmu_mode(); 478 - pte--; 479 - addr -= PAGE_SIZE; 480 - continue; 462 + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, 463 + fpb_flags, NULL, &any_young); 464 + if (any_young) 465 + ptent = pte_mkyoung(ptent); 466 + 467 + if (nr < folio_nr_pages(folio)) 
{ 468 + int err; 469 + 470 + if (folio_likely_mapped_shared(folio)) 471 + continue; 472 + if (pageout_anon_only_filter && !folio_test_anon(folio)) 473 + continue; 474 + if (!folio_trylock(folio)) 475 + continue; 476 + folio_get(folio); 477 + arch_leave_lazy_mmu_mode(); 478 + pte_unmap_unlock(start_pte, ptl); 479 + start_pte = NULL; 480 + err = split_folio(folio); 481 + folio_unlock(folio); 482 + folio_put(folio); 483 + start_pte = pte = 484 + pte_offset_map_lock(mm, pmd, addr, &ptl); 485 + if (!start_pte) 486 + break; 487 + arch_enter_lazy_mmu_mode(); 488 + if (!err) 489 + nr = 0; 490 + continue; 491 + } 481 492 } 482 493 483 494 /* 484 495 * Do not interfere with other mappings of this folio and 485 - * non-LRU folio. 496 + * non-LRU folio. If we have a large folio at this point, we 497 + * know it is fully mapped so if its mapcount is the same as its 498 + * number of pages, it must be exclusive. 486 499 */ 487 - if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) 500 + if (!folio_test_lru(folio) || 501 + folio_mapcount(folio) != folio_nr_pages(folio)) 488 502 continue; 489 503 490 504 if (pageout_anon_only_filter && !folio_test_anon(folio)) 491 505 continue; 492 506 493 - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 494 - 495 507 if (!pageout && pte_young(ptent)) { 496 - ptent = ptep_get_and_clear_full(mm, addr, pte, 497 - tlb->fullmm); 498 - ptent = pte_mkold(ptent); 499 - set_pte_at(mm, addr, pte, ptent); 500 - tlb_remove_tlb_entry(tlb, pte, addr); 508 + mkold_ptes(vma, addr, pte, nr); 509 + tlb_remove_tlb_entries(tlb, pte, nr, addr); 501 510 } 502 511 503 512 /*

+2 -2

mm/memory.c

··· 989 989 flags |= FPB_IGNORE_SOFT_DIRTY; 990 990 991 991 nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, 992 - &any_writable); 992 + &any_writable, NULL); 993 993 folio_ref_add(folio, nr); 994 994 if (folio_test_anon(folio)) { 995 995 if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, ··· 1559 1559 */ 1560 1560 if (unlikely(folio_test_large(folio) && max_nr != 1)) { 1561 1561 nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, 1562 - NULL); 1562 + NULL, NULL); 1563 1563 1564 1564 zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, 1565 1565 addr, details, rss, force_flush,

Configure Feed

Configure Feed