Merge tag 'mm-hotfixes-stable-2023-08-25-11-07' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'mm-hotfixes-stable-2023-08-25-11-07' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
"18 hotfixes. 13 are cc:stable and the remainder pertain to post-6.4
issues or aren't considered suitable for a -stable backport"

* tag 'mm-hotfixes-stable-2023-08-25-11-07' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
shmem: fix smaps BUG sleeping while atomic
selftests: cachestat: catch failing fsync test on tmpfs
selftests: cachestat: test for cachestat availability
maple_tree: disable mas_wr_append() when other readers are possible
madvise:madvise_free_pte_range(): don't use mapcount() against large folio for sharing check
madvise:madvise_free_huge_pmd(): don't use mapcount() against large folio for sharing check
madvise:madvise_cold_or_pageout_pte_range(): don't use mapcount() against large folio for sharing check
mm: multi-gen LRU: don't spin during memcg release
mm: memory-failure: fix unexpected return value in soft_offline_page()
radix tree: remove unused variable
mm: add a call to flush_cache_vmap() in vmap_pfn()
selftests/mm: FOLL_LONGTERM need to be updated to 0x100
nilfs2: fix general protection fault in nilfs_lookup_dirty_data_buffers()
mm/gup: handle cont-PTE hugetlb pages correctly in gup_must_unshare() via GUP-fast
selftests: cgroup: fix test_kmem_basic less than error
mm: enable page walking API to lock vmas during the walk
smaps: use vm_normal_page_pmd() instead of follow_trans_huge_pmd()
mm/gup: reintroduce FOLL_NUMA as FOLL_HONOR_NUMA_FAULT

Linus Torvalds 2 years ago 6f0edbb8 4942fed8

+280 -74

32 changed files

expand all collapse all

arch

powerpc

book3s64

subpage_prot.c

riscv

pageattr.c

s390

gmap.c

nilfs2

segment.c

proc

task_mmu.c

include

linux

huge_mm.h

mm.h

mm_types.h

pagewalk.h

lib

maple_tree.c

radix-tree.c

damon

vaddr.c

gup.c

hmm.c

huge_memory.c

internal.h

ksm.c

madvise.c

memcontrol.c

memory-failure.c

mempolicy.c

migrate_device.c

mincore.c

mlock.c

mprotect.c

pagewalk.c

shmem.c

vmalloc.c

vmscan.c

tools

testing

selftests

cachestat

test_cachestat.c

cgroup

test_kmem.c

hmm-tests.c

arch/powerpc/mm/book3s64/subpage_prot.c

reviewed

···
145 145
146 146   static const struct mm_walk_ops subpage_walk_ops = {
147 147       .pmd_entry = subpage_walk_pmd_entry,
148 148 +     .walk_lock = PGWALK_WRLOCK_VERIFY,
148 149   };
149 150
150 151   static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,

arch/riscv/mm/pageattr.c

reviewed

···
102 102       .pmd_entry = pageattr_pmd_entry,
103 103       .pte_entry = pageattr_pte_entry,
104 104       .pte_hole = pageattr_pte_hole,
105 105 +     .walk_lock = PGWALK_RDLOCK,
105 106   };
106 107
107 108   static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,

arch/s390/mm/gmap.c

reviewed

··· 2514 2514 2515 2515 static const struct mm_walk_ops thp_split_walk_ops = { 2516 2516 .pmd_entry = thp_split_walk_pmd_entry, 2517 2517 + .walk_lock = PGWALK_WRLOCK_VERIFY, 2517 2518 }; 2518 2519 2519 2520 static inline void thp_split_mm(struct mm_struct *mm) ··· 2566 2565 2567 2566 static const struct mm_walk_ops zap_zero_walk_ops = { 2568 2567 .pmd_entry = __zap_zero_pages, 2568 2568 + .walk_lock = PGWALK_WRLOCK, 2569 2569 }; 2570 2570 2571 2571 /* ··· 2657 2655 .hugetlb_entry = __s390_enable_skey_hugetlb, 2658 2656 .pte_entry = __s390_enable_skey_pte, 2659 2657 .pmd_entry = __s390_enable_skey_pmd, 2658 2658 + .walk_lock = PGWALK_WRLOCK, 2660 2659 }; 2661 2660 2662 2661 int s390_enable_skey(void) ··· 2695 2692 2696 2693 static const struct mm_walk_ops reset_cmma_walk_ops = { 2697 2694 .pte_entry = __s390_reset_cmma, 2695 2695 + .walk_lock = PGWALK_WRLOCK, 2698 2696 }; 2699 2697 2700 2698 void s390_reset_cmma(struct mm_struct *mm) ··· 2732 2728 2733 2729 static const struct mm_walk_ops gather_pages_ops = { 2734 2730 .pte_entry = s390_gather_pages, 2731 2731 + .walk_lock = PGWALK_RDLOCK, 2735 2732 }; 2736 2733 2737 2734 /*

fs/nilfs2/segment.c

reviewed

···
725 725       struct folio *folio = fbatch.folios[i];
726 726
727 727       folio_lock(folio);
728 728 +     if (unlikely(folio->mapping != mapping)) {
729 729 +         /* Exclude folios removed from the address space */
730 730 +         folio_unlock(folio);
731 731 +         continue;
732 732 +     }
728 733       head = folio_buffers(folio);
729 734       if (!head) {
730 735           create_empty_buffers(&folio->page, i_blocksize(inode), 0);

+6 -2

fs/proc/task_mmu.c

reviewed

··· 587 587 bool migration = false; 588 588 589 589 if (pmd_present(*pmd)) { 590 590 - /* FOLL_DUMP will return -EFAULT on huge zero page */ 591 591 - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); 590 590 + page = vm_normal_page_pmd(vma, addr, *pmd); 592 591 } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { 593 592 swp_entry_t entry = pmd_to_swp_entry(*pmd); 594 593 ··· 757 758 static const struct mm_walk_ops smaps_walk_ops = { 758 759 .pmd_entry = smaps_pte_range, 759 760 .hugetlb_entry = smaps_hugetlb_range, 761 761 + .walk_lock = PGWALK_RDLOCK, 760 762 }; 761 763 762 764 static const struct mm_walk_ops smaps_shmem_walk_ops = { 763 765 .pmd_entry = smaps_pte_range, 764 766 .hugetlb_entry = smaps_hugetlb_range, 765 767 .pte_hole = smaps_pte_hole, 768 768 + .walk_lock = PGWALK_RDLOCK, 766 769 }; 767 770 768 771 /* ··· 1246 1245 static const struct mm_walk_ops clear_refs_walk_ops = { 1247 1246 .pmd_entry = clear_refs_pte_range, 1248 1247 .test_walk = clear_refs_test_walk, 1248 1248 + .walk_lock = PGWALK_WRLOCK, 1249 1249 }; 1250 1250 1251 1251 static ssize_t clear_refs_write(struct file *file, const char __user *buf, ··· 1624 1622 .pmd_entry = pagemap_pmd_range, 1625 1623 .pte_hole = pagemap_pte_hole, 1626 1624 .hugetlb_entry = pagemap_hugetlb_range, 1625 1625 + .walk_lock = PGWALK_RDLOCK, 1627 1626 }; 1628 1627 1629 1628 /* ··· 1938 1935 static const struct mm_walk_ops show_numa_ops = { 1939 1936 .hugetlb_entry = gather_hugetlb_stats, 1940 1937 .pmd_entry = gather_pte_stats, 1938 1938 + .walk_lock = PGWALK_RDLOCK, 1941 1939 }; 1942 1940 1943 1941 /*

-3

include/linux/huge_mm.h

reviewed

···
25 25   #endif
26 26
27 27   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
28 28 - struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
29 29 -                                    unsigned long addr, pmd_t *pmd,
30 30 -                                    unsigned int flags);
31 28   bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
32 29                              pmd_t *pmd, unsigned long addr, unsigned long next);
33 30   int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,

+15 -6

include/linux/mm.h

reviewed

··· 3421 3421 * Indicates whether GUP can follow a PROT_NONE mapped page, or whether 3422 3422 * a (NUMA hinting) fault is required. 3423 3423 */ 3424 3424 - static inline bool gup_can_follow_protnone(unsigned int flags) 3424 3424 + static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, 3425 3425 + unsigned int flags) 3425 3426 { 3426 3427 /* 3427 3427 - * FOLL_FORCE has to be able to make progress even if the VMA is 3428 3428 - * inaccessible. Further, FOLL_FORCE access usually does not represent 3429 3429 - * application behaviour and we should avoid triggering NUMA hinting 3430 3430 - * faults. 3428 3428 + * If callers don't want to honor NUMA hinting faults, no need to 3429 3429 + * determine if we would actually have to trigger a NUMA hinting fault. 3431 3430 */ 3432 3432 - return flags & FOLL_FORCE; 3431 3431 + if (!(flags & FOLL_HONOR_NUMA_FAULT)) 3432 3432 + return true; 3433 3433 + 3434 3434 + /* 3435 3435 + * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. 3436 3436 + * 3437 3437 + * Requiring a fault here even for inaccessible VMAs would mean that 3438 3438 + * FOLL_FORCE cannot make any progress, because handle_mm_fault() 3439 3439 + * refuses to process NUMA hinting faults in inaccessible VMAs. 3440 3440 + */ 3441 3441 + return !vma_is_accessible(vma); 3433 3442 } 3434 3443 3435 3444 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);

include/linux/mm_types.h

reviewed

···
1286 1286       FOLL_PCI_P2PDMA = 1 << 10,
1287 1287       /* allow interrupts from generic signals */
1288 1288       FOLL_INTERRUPTIBLE = 1 << 11,
1289 1289 +     /*
1290 1290 +      * Always honor (trigger) NUMA hinting faults.
1291 1291 +      *
1292 1292 +      * FOLL_WRITE implicitly honors NUMA hinting faults because a
1293 1293 +      * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
1294 1294 +      * apply). get_user_pages_fast_only() always implicitly honors NUMA
1295 1295 +      * hinting faults.
1296 1296 +      */
1297 1297 +     FOLL_HONOR_NUMA_FAULT = 1 << 12,
1289 1298
1290 1299       /* See also internal only FOLL flags in mm/internal.h */
1291 1300   };

+11

include/linux/pagewalk.h

reviewed

···
6 6
7 7   struct mm_walk;
8 8
9 9 + /* Locking requirement during a page walk. */
10 10 + enum page_walk_lock {
11 11 +     /* mmap_lock should be locked for read to stabilize the vma tree */
12 12 +     PGWALK_RDLOCK = 0,
13 13 +     /* vma will be write-locked during the walk */
14 14 +     PGWALK_WRLOCK = 1,
15 15 +     /* vma is expected to be already write-locked during the walk */
16 16 +     PGWALK_WRLOCK_VERIFY = 2,
17 17 + };
18 18 +
9 19   /**
10 20    * struct mm_walk_ops - callbacks for walk_page_range
11 21    * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
···
76 66       int (*pre_vma)(unsigned long start, unsigned long end,
77 67                      struct mm_walk *walk);
78 68       void (*post_vma)(struct mm_walk *walk);
69 69 +     enum page_walk_lock walk_lock;
79 70   };
80 71
81 72   /*

lib/maple_tree.c

reviewed

···
4265 4265    * mas_wr_append: Attempt to append
4266 4266    * @wr_mas: the maple write state
4267 4267    *
4268 4268 +  * This is currently unsafe in rcu mode since the end of the node may be cached
4269 4269 +  * by readers while the node contents may be updated which could result in
4270 4270 +  * inaccurate information.
4271 4271 +  *
4268 4272    * Return: True if appended, false otherwise
4269 4273    */
4270 4274   static inline bool mas_wr_append(struct ma_wr_state *wr_mas)
···
4277 4273       unsigned char new_end = end + 1;
4278 4274       struct ma_state *mas = wr_mas->mas;
4279 4275       unsigned char node_pivots = mt_pivots[wr_mas->type];
4276 4276 +
4277 4277 +     if (mt_in_rcu(mas->tree))
4278 4278 +         return false;
4280 4279
4281 4280       if (mas->offset != wr_mas->node_end)
4282 4281           return false;

-1

lib/radix-tree.c

reviewed

···
1136 1136   void __rcu **radix_tree_iter_resume(void __rcu **slot,
1137 1137                                       struct radix_tree_iter *iter)
1138 1138   {
1139 1139 -     slot++;
1140 1139       iter->index = __radix_tree_iter_add(iter, 1);
1141 1140       iter->next_index = iter->index;
1142 1141       iter->tags = 0;

mm/damon/vaddr.c

reviewed

···
386 386   static const struct mm_walk_ops damon_mkold_ops = {
387 387       .pmd_entry = damon_mkold_pmd_entry,
388 388       .hugetlb_entry = damon_mkold_hugetlb_entry,
389 389 +     .walk_lock = PGWALK_RDLOCK,
389 390   };
390 391
391 392   static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
···
526 525   static const struct mm_walk_ops damon_young_ops = {
527 526       .pmd_entry = damon_young_pmd_entry,
528 527       .hugetlb_entry = damon_young_hugetlb_entry,
528 528 +     .walk_lock = PGWALK_RDLOCK,
529 529   };
530 530
531 531   static bool damon_va_young(struct mm_struct *mm, unsigned long addr,

+24 -6

mm/gup.c

reviewed

··· 597 597 pte = ptep_get(ptep); 598 598 if (!pte_present(pte)) 599 599 goto no_page; 600 600 - if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) 600 600 + if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) 601 601 goto no_page; 602 602 603 603 page = vm_normal_page(vma, address, pte); ··· 714 714 if (likely(!pmd_trans_huge(pmdval))) 715 715 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 716 716 717 717 - if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) 717 717 + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) 718 718 return no_page_table(vma, flags); 719 719 720 720 ptl = pmd_lock(mm, pmd); ··· 851 851 if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) 852 852 return NULL; 853 853 854 854 + /* 855 855 + * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect 856 856 + * to fail on PROT_NONE-mapped pages. 857 857 + */ 854 858 page = follow_page_mask(vma, address, foll_flags, &ctx); 855 859 if (ctx.pgmap) 856 860 put_dev_pagemap(ctx.pgmap); ··· 2231 2227 gup_flags |= FOLL_UNLOCKABLE; 2232 2228 } 2233 2229 2230 2230 + /* 2231 2231 + * For now, always trigger NUMA hinting faults. Some GUP users like 2232 2232 + * KVM require the hint to be as the calling context of GUP is 2233 2233 + * functionally similar to a memory reference from task context. 2234 2234 + */ 2235 2235 + gup_flags |= FOLL_HONOR_NUMA_FAULT; 2236 2236 + 2234 2237 /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ 2235 2238 if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == 2236 2239 (FOLL_PIN | FOLL_GET))) ··· 2562 2551 struct page *page; 2563 2552 struct folio *folio; 2564 2553 2565 2565 - if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) 2554 2554 + /* 2555 2555 + * Always fallback to ordinary GUP on PROT_NONE-mapped pages: 2556 2556 + * pte_access_permitted() better should reject these pages 2557 2557 + * either way: otherwise, GUP-fast might succeed in 2558 2558 + * cases where ordinary GUP would fail due to VMA access 2559 2559 + * permissions. 2560 2560 + */ 2561 2561 + if (pte_protnone(pte)) 2566 2562 goto pte_unmap; 2567 2563 2568 2564 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) ··· 2988 2970 2989 2971 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || 2990 2972 pmd_devmap(pmd))) { 2991 2991 - if (pmd_protnone(pmd) && 2992 2992 - !gup_can_follow_protnone(flags)) 2973 2973 + /* See gup_pte_range() */ 2974 2974 + if (pmd_protnone(pmd)) 2993 2975 return 0; 2994 2976 2995 2977 if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, ··· 3169 3151 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | 3170 3152 FOLL_FORCE | FOLL_PIN | FOLL_GET | 3171 3153 FOLL_FAST_ONLY | FOLL_NOFAULT | 3172 3172 - FOLL_PCI_P2PDMA))) 3154 3154 + FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) 3173 3155 return -EINVAL; 3174 3156 3175 3157 if (gup_flags & FOLL_PIN)

mm/hmm.c

reviewed

···
562 562       .pte_hole = hmm_vma_walk_hole,
563 563       .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
564 564       .test_walk = hmm_vma_walk_test,
565 565 +     .walk_lock = PGWALK_RDLOCK,
565 566   };
566 567
567 568   /**

+2 -3

mm/huge_memory.c

reviewed

··· 1467 1467 if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) 1468 1468 return ERR_PTR(-EFAULT); 1469 1469 1470 1470 - /* Full NUMA hinting faults to serialise migration in fault paths */ 1471 1471 - if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) 1470 1470 + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) 1472 1471 return NULL; 1473 1472 1474 1473 if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) ··· 1612 1613 * If other processes are mapping this folio, we couldn't discard 1613 1614 * the folio unless they all do MADV_FREE so let's skip the folio. 1614 1615 */ 1615 1615 - if (folio_mapcount(folio) != 1) 1616 1616 + if (folio_estimated_sharers(folio) != 1) 1616 1617 goto out; 1617 1618 1618 1619 if (!folio_trylock(folio))

+17

mm/internal.h

reviewed

··· 924 924 struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); 925 925 int __must_check try_grab_page(struct page *page, unsigned int flags); 926 926 927 927 + /* 928 928 + * mm/huge_memory.c 929 929 + */ 930 930 + struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 931 931 + unsigned long addr, pmd_t *pmd, 932 932 + unsigned int flags); 933 933 + 927 934 enum { 928 935 /* mark page accessed */ 929 936 FOLL_TOUCH = 1 << 16, ··· 1003 996 /* Paired with a memory barrier in page_try_share_anon_rmap(). */ 1004 997 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) 1005 998 smp_rmb(); 999 999 + 1000 1000 + /* 1001 1001 + * During GUP-fast we might not get called on the head page for a 1002 1002 + * hugetlb page that is mapped using cont-PTE, because GUP-fast does 1003 1003 + * not work with the abstracted hugetlb PTEs that always point at the 1004 1004 + * head page. For hugetlb, PageAnonExclusive only applies on the head 1005 1005 + * page (as it cannot be partially COW-shared), so lookup the head page. 1006 1006 + */ 1007 1007 + if (unlikely(!PageHead(page) && PageHuge(page))) 1008 1008 + page = compound_head(page); 1006 1009 1007 1010 /* 1008 1011 * Note that PageKsm() pages cannot be exclusive, and consequently,

+16 -9

mm/ksm.c

reviewed

··· 455 455 456 456 static const struct mm_walk_ops break_ksm_ops = { 457 457 .pmd_entry = break_ksm_pmd_entry, 458 458 + .walk_lock = PGWALK_RDLOCK, 459 459 + }; 460 460 + 461 461 + static const struct mm_walk_ops break_ksm_lock_vma_ops = { 462 462 + .pmd_entry = break_ksm_pmd_entry, 463 463 + .walk_lock = PGWALK_WRLOCK, 458 464 }; 459 465 460 466 /* ··· 476 470 * of the process that owns 'vma'. We also do not want to enforce 477 471 * protection keys here anyway. 478 472 */ 479 479 - static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 473 473 + static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) 480 474 { 481 475 vm_fault_t ret = 0; 476 476 + const struct mm_walk_ops *ops = lock_vma ? 477 477 + &break_ksm_lock_vma_ops : &break_ksm_ops; 482 478 483 479 do { 484 480 int ksm_page; 485 481 486 482 cond_resched(); 487 487 - ksm_page = walk_page_range_vma(vma, addr, addr + 1, 488 488 - &break_ksm_ops, NULL); 483 483 + ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); 489 484 if (WARN_ON_ONCE(ksm_page < 0)) 490 485 return ksm_page; 491 486 if (!ksm_page) ··· 572 565 mmap_read_lock(mm); 573 566 vma = find_mergeable_vma(mm, addr); 574 567 if (vma) 575 575 - break_ksm(vma, addr); 568 568 + break_ksm(vma, addr, false); 576 569 mmap_read_unlock(mm); 577 570 } 578 571 ··· 878 871 * in cmp_and_merge_page on one of the rmap_items we would be removing. 
879 872 */ 880 873 static int unmerge_ksm_pages(struct vm_area_struct *vma, 881 881 - unsigned long start, unsigned long end) 874 874 + unsigned long start, unsigned long end, bool lock_vma) 882 875 { 883 876 unsigned long addr; 884 877 int err = 0; ··· 889 882 if (signal_pending(current)) 890 883 err = -ERESTARTSYS; 891 884 else 892 892 - err = break_ksm(vma, addr); 885 885 + err = break_ksm(vma, addr, lock_vma); 893 886 } 894 887 return err; 895 888 } ··· 1036 1029 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) 1037 1030 continue; 1038 1031 err = unmerge_ksm_pages(vma, 1039 1039 - vma->vm_start, vma->vm_end); 1032 1032 + vma->vm_start, vma->vm_end, false); 1040 1033 if (err) 1041 1034 goto error; 1042 1035 } ··· 2537 2530 return 0; 2538 2531 2539 2532 if (vma->anon_vma) { 2540 2540 - err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end); 2533 2533 + err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); 2541 2534 if (err) 2542 2535 return err; 2543 2536 } ··· 2675 2668 return 0; /* just ignore the advice */ 2676 2669 2677 2670 if (vma->anon_vma) { 2678 2678 - err = unmerge_ksm_pages(vma, start, end); 2671 2671 + err = unmerge_ksm_pages(vma, start, end, true); 2679 2672 if (err) 2680 2673 return err; 2681 2674 }

+6 -3

mm/madvise.c

reviewed

··· 233 233 234 234 static const struct mm_walk_ops swapin_walk_ops = { 235 235 .pmd_entry = swapin_walk_pmd_entry, 236 236 + .walk_lock = PGWALK_RDLOCK, 236 237 }; 237 238 238 239 static void shmem_swapin_range(struct vm_area_struct *vma, ··· 384 383 folio = pfn_folio(pmd_pfn(orig_pmd)); 385 384 386 385 /* Do not interfere with other mappings of this folio */ 387 387 - if (folio_mapcount(folio) != 1) 386 386 + if (folio_estimated_sharers(folio) != 1) 388 387 goto huge_unlock; 389 388 390 389 if (pageout_anon_only_filter && !folio_test_anon(folio)) ··· 458 457 if (folio_test_large(folio)) { 459 458 int err; 460 459 461 461 - if (folio_mapcount(folio) != 1) 460 460 + if (folio_estimated_sharers(folio) != 1) 462 461 break; 463 462 if (pageout_anon_only_filter && !folio_test_anon(folio)) 464 463 break; ··· 535 534 536 535 static const struct mm_walk_ops cold_walk_ops = { 537 536 .pmd_entry = madvise_cold_or_pageout_pte_range, 537 537 + .walk_lock = PGWALK_RDLOCK, 538 538 }; 539 539 540 540 static void madvise_cold_page_range(struct mmu_gather *tlb, ··· 680 678 if (folio_test_large(folio)) { 681 679 int err; 682 680 683 683 - if (folio_mapcount(folio) != 1) 681 681 + if (folio_estimated_sharers(folio) != 1) 684 682 break; 685 683 if (!folio_trylock(folio)) 686 684 break; ··· 759 757 760 758 static const struct mm_walk_ops madvise_free_walk_ops = { 761 759 .pmd_entry = madvise_free_pte_range, 760 760 + .walk_lock = PGWALK_RDLOCK, 762 761 }; 763 762 764 763 static int madvise_free_single_vma(struct vm_area_struct *vma,

mm/memcontrol.c

reviewed

···
6024 6024
6025 6025   static const struct mm_walk_ops precharge_walk_ops = {
6026 6026       .pmd_entry = mem_cgroup_count_precharge_pte_range,
6027 6027 +     .walk_lock = PGWALK_RDLOCK,
6027 6028   };
6028 6029
6029 6030   static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
···
6304 6303
6305 6304   static const struct mm_walk_ops charge_walk_ops = {
6306 6305       .pmd_entry = mem_cgroup_move_charge_pte_range,
6306 6306 +     .walk_lock = PGWALK_RDLOCK,
6307 6307   };
6308 6308
6309 6309   static void mem_cgroup_move_charge(void)

+8 -4

mm/memory-failure.c

reviewed

···
831 831   static const struct mm_walk_ops hwp_walk_ops = {
832 832       .pmd_entry = hwpoison_pte_range,
833 833       .hugetlb_entry = hwpoison_hugetlb_range,
834 834 +     .walk_lock = PGWALK_RDLOCK,
834 835   };
835 836
836 837   /*
···
2741 2740       if (ret > 0) {
2742 2741           ret = soft_offline_in_use_page(page);
2743 2742       } else if (ret == 0) {
2744 2744 -         if (!page_handle_poison(page, true, false) && try_again) {
2745 2745 -             try_again = false;
2746 2746 -             flags &= ~MF_COUNT_INCREASED;
2747 2747 -             goto retry;
2743 2743 +         if (!page_handle_poison(page, true, false)) {
2744 2744 +             if (try_again) {
2745 2745 +                 try_again = false;
2746 2746 +                 flags &= ~MF_COUNT_INCREASED;
2747 2747 +                 goto retry;
2748 2748 +             }
2749 2749 +             ret = -EBUSY;
2748 2750           }
2749 2751       }
2750 2752

+14 -8

mm/mempolicy.c

reviewed

··· 718 718 .hugetlb_entry = queue_folios_hugetlb, 719 719 .pmd_entry = queue_folios_pte_range, 720 720 .test_walk = queue_pages_test_walk, 721 721 + .walk_lock = PGWALK_RDLOCK, 722 722 + }; 723 723 + 724 724 + static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { 725 725 + .hugetlb_entry = queue_folios_hugetlb, 726 726 + .pmd_entry = queue_folios_pte_range, 727 727 + .test_walk = queue_pages_test_walk, 728 728 + .walk_lock = PGWALK_WRLOCK, 721 729 }; 722 730 723 731 /* ··· 746 738 static int 747 739 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 748 740 nodemask_t *nodes, unsigned long flags, 749 749 - struct list_head *pagelist) 741 741 + struct list_head *pagelist, bool lock_vma) 750 742 { 751 743 int err; 752 744 struct queue_pages qp = { ··· 757 749 .end = end, 758 750 .first = NULL, 759 751 }; 752 752 + const struct mm_walk_ops *ops = lock_vma ? 753 753 + &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; 760 754 761 761 - err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); 755 755 + err = walk_page_range(mm, start, end, ops, &qp); 762 756 763 757 if (!qp.first) 764 758 /* whole range in hole */ ··· 1088 1078 vma = find_vma(mm, 0); 1089 1079 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1090 1080 queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, 1091 1091 - flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1081 1081 + flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); 1092 1082 1093 1083 if (!list_empty(&pagelist)) { 1094 1084 err = migrate_pages(&pagelist, alloc_migration_target, NULL, ··· 1331 1321 * Lock the VMAs before scanning for pages to migrate, to ensure we don't 1332 1322 * miss a concurrently inserted page. 
1333 1323 */ 1334 1334 - vma_iter_init(&vmi, mm, start); 1335 1335 - for_each_vma_range(vmi, vma, end) 1336 1336 - vma_start_write(vma); 1337 1337 - 1338 1324 ret = queue_pages_range(mm, start, end, nmask, 1339 1339 - flags | MPOL_MF_INVERT, &pagelist); 1325 1325 + flags | MPOL_MF_INVERT, &pagelist, true); 1340 1326 1341 1327 if (ret < 0) { 1342 1328 err = ret;

mm/migrate_device.c

reviewed

···
279 279   static const struct mm_walk_ops migrate_vma_walk_ops = {
280 280       .pmd_entry = migrate_vma_collect_pmd,
281 281       .pte_hole = migrate_vma_collect_hole,
282 282 +     .walk_lock = PGWALK_RDLOCK,
282 283   };
283 284
284 285   /*

mm/mincore.c

reviewed

···
176 176       .pmd_entry = mincore_pte_range,
177 177       .pte_hole = mincore_unmapped_range,
178 178       .hugetlb_entry = mincore_hugetlb,
179 179 +     .walk_lock = PGWALK_RDLOCK,
179 180   };
180 181
181 182   /*

mm/mlock.c

reviewed

···
371 371   {
372 372       static const struct mm_walk_ops mlock_walk_ops = {
373 373           .pmd_entry = mlock_pte_range,
374 374 +         .walk_lock = PGWALK_WRLOCK_VERIFY,
374 375       };
375 376
376 377       /*

mm/mprotect.c

reviewed

···
568 568       .pte_entry = prot_none_pte_entry,
569 569       .hugetlb_entry = prot_none_hugetlb_entry,
570 570       .test_walk = prot_none_test,
571 571 +     .walk_lock = PGWALK_WRLOCK,
571 572   };
572 573
573 574   int

+33 -3

mm/pagewalk.c

reviewed

··· 400 400 return err; 401 401 } 402 402 403 403 + static inline void process_mm_walk_lock(struct mm_struct *mm, 404 404 + enum page_walk_lock walk_lock) 405 405 + { 406 406 + if (walk_lock == PGWALK_RDLOCK) 407 407 + mmap_assert_locked(mm); 408 408 + else 409 409 + mmap_assert_write_locked(mm); 410 410 + } 411 411 + 412 412 + static inline void process_vma_walk_lock(struct vm_area_struct *vma, 413 413 + enum page_walk_lock walk_lock) 414 414 + { 415 415 + #ifdef CONFIG_PER_VMA_LOCK 416 416 + switch (walk_lock) { 417 417 + case PGWALK_WRLOCK: 418 418 + vma_start_write(vma); 419 419 + break; 420 420 + case PGWALK_WRLOCK_VERIFY: 421 421 + vma_assert_write_locked(vma); 422 422 + break; 423 423 + case PGWALK_RDLOCK: 424 424 + /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ 425 425 + break; 426 426 + } 427 427 + #endif 428 428 + } 429 429 + 403 430 /** 404 431 * walk_page_range - walk page table with caller specific callbacks 405 432 * @mm: mm_struct representing the target process of page table walk ··· 486 459 if (!walk.mm) 487 460 return -EINVAL; 488 461 489 489 - mmap_assert_locked(walk.mm); 462 462 + process_mm_walk_lock(walk.mm, ops->walk_lock); 490 463 491 464 vma = find_vma(walk.mm, start); 492 465 do { ··· 501 474 if (ops->pte_hole) 502 475 err = ops->pte_hole(start, next, -1, &walk); 503 476 } else { /* inside vma */ 477 477 + process_vma_walk_lock(vma, ops->walk_lock); 504 478 walk.vma = vma; 505 479 next = min(end, vma->vm_end); 506 480 vma = find_vma(mm, vma->vm_end); ··· 577 549 if (start < vma->vm_start || end > vma->vm_end) 578 550 return -EINVAL; 579 551 580 580 - mmap_assert_locked(walk.mm); 552 552 + process_mm_walk_lock(walk.mm, ops->walk_lock); 553 553 + process_vma_walk_lock(vma, ops->walk_lock); 581 554 return __walk_page_range(start, end, &walk); 582 555 } 583 556 ··· 595 566 if (!walk.mm) 596 567 return -EINVAL; 597 568 598 598 - mmap_assert_locked(walk.mm); 569 569 + process_mm_walk_lock(walk.mm, ops->walk_lock); 570 570 + 
process_vma_walk_lock(vma, ops->walk_lock); 599 571 return __walk_page_range(vma->vm_start, vma->vm_end, &walk); 600 572 } 601 573

+4 -2

mm/shmem.c

reviewed

···
806 806       XA_STATE(xas, &mapping->i_pages, start);
807 807       struct page *page;
808 808       unsigned long swapped = 0;
809 809 +     unsigned long max = end - 1;
809 810
810 811       rcu_read_lock();
811 811 -     xas_for_each(&xas, page, end - 1) {
812 812 +     xas_for_each(&xas, page, max) {
812 813           if (xas_retry(&xas, page))
813 814               continue;
814 815           if (xa_is_value(page))
815 816               swapped++;
816 816 -
817 817 +         if (xas.xa_index == max)
818 818 +             break;
817 819           if (need_resched()) {
818 820               xas_pause(&xas);
819 821               cond_resched_rcu();

mm/vmalloc.c

reviewed

···
2979 2979           free_vm_area(area);
2980 2980           return NULL;
2981 2981       }
2982 2982 +
2983 2983 +     flush_cache_vmap((unsigned long)area->addr,
2984 2984 +                      (unsigned long)area->addr + count * PAGE_SIZE);
2985 2985 +
2982 2986       return area->addr;
2983 2987   }
2984 2988   EXPORT_SYMBOL_GPL(vmap_pfn);

+9 -5

mm/vmscan.c

reviewed

··· 4284 4284 static const struct mm_walk_ops mm_walk_ops = { 4285 4285 .test_walk = should_skip_vma, 4286 4286 .p4d_entry = walk_pud_range, 4287 4287 + .walk_lock = PGWALK_RDLOCK, 4287 4288 }; 4288 4289 4289 4290 int err; ··· 4854 4853 4855 4854 spin_lock_irq(&pgdat->memcg_lru.lock); 4856 4855 4857 4857 - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); 4856 4856 + if (hlist_nulls_unhashed(&lruvec->lrugen.list)) 4857 4857 + goto unlock; 4858 4858 4859 4859 gen = lruvec->lrugen.gen; 4860 4860 4861 4861 - hlist_nulls_del_rcu(&lruvec->lrugen.list); 4861 4861 + hlist_nulls_del_init_rcu(&lruvec->lrugen.list); 4862 4862 pgdat->memcg_lru.nr_memcgs[gen]--; 4863 4863 4864 4864 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) 4865 4865 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); 4866 4866 - 4866 4866 + unlock: 4867 4867 spin_unlock_irq(&pgdat->memcg_lru.lock); 4868 4868 } 4869 4869 } ··· 5436 5434 rcu_read_lock(); 5437 5435 5438 5436 hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { 5439 5439 - if (op) 5437 5437 + if (op) { 5440 5438 lru_gen_rotate_memcg(lruvec, op); 5439 5439 + op = 0; 5440 5440 + } 5441 5441 5442 5442 mem_cgroup_put(memcg); 5443 5443 ··· 5447 5443 memcg = lruvec_memcg(lruvec); 5448 5444 5449 5445 if (!mem_cgroup_tryget(memcg)) { 5450 5450 - op = 0; 5446 5446 + lru_gen_release_memcg(memcg); 5451 5447 memcg = NULL; 5452 5448 continue; 5453 5449 }

+65 -15

tools/testing/selftests/cachestat/test_cachestat.c

reviewed

··· 4 4 #include <stdio.h> 5 5 #include <stdbool.h> 6 6 #include <linux/kernel.h> 7 7 + #include <linux/magic.h> 7 8 #include <linux/mman.h> 8 9 #include <sys/mman.h> 9 10 #include <sys/shm.h> 10 11 #include <sys/syscall.h> 12 12 + #include <sys/vfs.h> 11 13 #include <unistd.h> 12 14 #include <string.h> 13 15 #include <fcntl.h> 14 16 #include <errno.h> 15 17 16 18 #include "../kselftest.h" 19 19 + 20 20 + #define NR_TESTS 9 17 21 18 22 static const char * const dev_files[] = { 19 23 "/dev/zero", "/dev/null", "/dev/urandom", ··· 95 91 } 96 92 97 93 /* 94 94 + * fsync() is implemented via noop_fsync() on tmpfs. This makes the fsync() 95 95 + * test fail below, so we need to check for test file living on a tmpfs. 96 96 + */ 97 97 + static bool is_on_tmpfs(int fd) 98 98 + { 99 99 + struct statfs statfs_buf; 100 100 + 101 101 + if (fstatfs(fd, &statfs_buf)) 102 102 + return false; 103 103 + 104 104 + return statfs_buf.f_type == TMPFS_MAGIC; 105 105 + } 106 106 + 107 107 + /* 98 108 * Open/create the file at filename, (optionally) write random data to it 99 109 * (exactly num_pages), then test the cachestat syscall on this file. 100 110 * 101 111 * If test_fsync == true, fsync the file, then check the number of dirty 102 112 * pages. 
103 113 */ 104 104 - bool test_cachestat(const char *filename, bool write_random, bool create, 105 105 - bool test_fsync, unsigned long num_pages, int open_flags, 106 106 - mode_t open_mode) 114 114 + static int test_cachestat(const char *filename, bool write_random, bool create, 115 115 + bool test_fsync, unsigned long num_pages, 116 116 + int open_flags, mode_t open_mode) 107 117 { 108 118 size_t PS = sysconf(_SC_PAGESIZE); 109 119 int filesize = num_pages * PS; 110 110 - bool ret = true; 120 120 + int ret = KSFT_PASS; 111 121 long syscall_ret; 112 122 struct cachestat cs; 113 123 struct cachestat_range cs_range = { 0, filesize }; ··· 130 112 131 113 if (fd == -1) { 132 114 ksft_print_msg("Unable to create/open file.\n"); 133 133 - ret = false; 115 115 + ret = KSFT_FAIL; 134 116 goto out; 135 117 } else { 136 118 ksft_print_msg("Create/open %s\n", filename); ··· 139 121 if (write_random) { 140 122 if (!write_exactly(fd, filesize)) { 141 123 ksft_print_msg("Unable to access urandom.\n"); 142 142 - ret = false; 124 124 + ret = KSFT_FAIL; 143 125 goto out1; 144 126 } 145 127 } ··· 150 132 151 133 if (syscall_ret) { 152 134 ksft_print_msg("Cachestat returned non-zero.\n"); 153 153 - ret = false; 135 135 + ret = KSFT_FAIL; 154 136 goto out1; 155 137 156 138 } else { ··· 160 142 if (cs.nr_cache + cs.nr_evicted != num_pages) { 161 143 ksft_print_msg( 162 144 "Total number of cached and evicted pages is off.\n"); 163 163 - ret = false; 145 145 + ret = KSFT_FAIL; 164 146 } 165 147 } 166 148 } 167 149 168 150 if (test_fsync) { 169 169 - if (fsync(fd)) { 151 151 + if (is_on_tmpfs(fd)) { 152 152 + ret = KSFT_SKIP; 153 153 + } else if (fsync(fd)) { 170 154 ksft_print_msg("fsync fails.\n"); 171 171 - ret = false; 155 155 + ret = KSFT_FAIL; 172 156 } else { 173 157 syscall_ret = syscall(cachestat_nr, fd, &cs_range, &cs, 0); 174 158 ··· 181 161 print_cachestat(&cs); 182 162 183 163 if (cs.nr_dirty) { 184 184 - ret = false; 164 164 + ret = KSFT_FAIL; 185 165 ksft_print_msg( 186 
166 "Number of dirty should be zero after fsync.\n"); 187 167 } 188 168 } else { 189 169 ksft_print_msg("Cachestat (after fsync) returned non-zero.\n"); 190 190 - ret = false; 170 170 + ret = KSFT_FAIL; 191 171 goto out1; 192 172 } 193 173 } ··· 256 236 257 237 int main(void) 258 238 { 259 259 - int ret = 0; 239 239 + int ret; 240 240 + 241 241 + ksft_print_header(); 242 242 + 243 243 + ret = syscall(__NR_cachestat, -1, NULL, NULL, 0); 244 244 + if (ret == -1 && errno == ENOSYS) 245 245 + ksft_exit_skip("cachestat syscall not available\n"); 246 246 + 247 247 + ksft_set_plan(NR_TESTS); 248 248 + 249 249 + if (ret == -1 && errno == EBADF) { 250 250 + ksft_test_result_pass("bad file descriptor recognized\n"); 251 251 + ret = 0; 252 252 + } else { 253 253 + ksft_test_result_fail("bad file descriptor ignored\n"); 254 254 + ret = 1; 255 255 + } 260 256 261 257 for (int i = 0; i < 5; i++) { 262 258 const char *dev_filename = dev_files[i]; 263 259 264 260 if (test_cachestat(dev_filename, false, false, false, 265 265 - 4, O_RDONLY, 0400)) 261 261 + 4, O_RDONLY, 0400) == KSFT_PASS) 266 262 ksft_test_result_pass("cachestat works with %s\n", dev_filename); 267 263 else { 268 264 ksft_test_result_fail("cachestat fails with %s\n", dev_filename); ··· 287 251 } 288 252 289 253 if (test_cachestat("tmpfilecachestat", true, true, 290 290 - true, 4, O_CREAT | O_RDWR, 0400 | 0600)) 254 254 + false, 4, O_CREAT | O_RDWR, 0600) == KSFT_PASS) 291 255 ksft_test_result_pass("cachestat works with a normal file\n"); 292 256 else { 293 257 ksft_test_result_fail("cachestat fails with normal file\n"); 294 258 ret = 1; 259 259 + } 260 260 + 261 261 + switch (test_cachestat("tmpfilecachestat", true, true, 262 262 + true, 4, O_CREAT | O_RDWR, 0600)) { 263 263 + case KSFT_FAIL: 264 264 + ksft_test_result_fail("cachestat fsync fails with normal file\n"); 265 265 + ret = KSFT_FAIL; 266 266 + break; 267 267 + case KSFT_PASS: 268 268 + ksft_test_result_pass("cachestat fsync works with a normal file\n"); 
269 269 + break; 270 270 + case KSFT_SKIP: 271 271 + ksft_test_result_skip("tmpfilecachestat is on tmpfs\n"); 272 272 + break; 295 273 } 296 274 297 275 if (test_cachestat_shmem())

+2 -2

tools/testing/selftests/cgroup/test_kmem.c

reviewed

··· 75 75 sleep(1); 76 76 77 77 slab1 = cg_read_key_long(cg, "memory.stat", "slab "); 78 78 - if (slab1 <= 0) 78 78 + if (slab1 < 0) 79 79 goto cleanup; 80 80 81 81 current = cg_read_long(cg, "memory.current"); 82 82 - if (current <= 0) 82 82 + if (current < 0) 83 83 goto cleanup; 84 84 85 85 if (slab1 < slab0 / 2 && current < slab0 / 2)

+7 -2

tools/testing/selftests/mm/hmm-tests.c

reviewed

··· 57 57 58 58 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) 59 59 /* Just the flags we need, copied from mm.h: */ 60 60 - #define FOLL_WRITE 0x01 /* check pte is writable */ 61 61 - #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ 62 60 61 61 + #ifndef FOLL_WRITE 62 62 + #define FOLL_WRITE 0x01 /* check pte is writable */ 63 63 + #endif 64 64 + 65 65 + #ifndef FOLL_LONGTERM 66 66 + #define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */ 67 67 + #endif 63 68 FIXTURE(hmm) 64 69 { 65 70 int fd;