Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache

+9 -9

Documentation/core-api/pin_user_pages.rst

··· 55 55 pages* array, and the function then pins pages by incrementing each by a special 56 56 value: GUP_PIN_COUNTING_BIAS. 57 57 58 - For huge pages (and in fact, any compound page of more than 2 pages), the 59 - GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting 60 - is achieved, by using the 3rd struct page in the compound page. A new struct 61 - page field, hpage_pinned_refcount, has been added in order to support this. 58 + For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, 59 + an exact form of pin counting is achieved, by using the 2nd struct page 60 + in the compound page. A new struct page field, compound_pincount, has 61 + been added in order to support this. 62 62 63 63 This approach for compound pages avoids the counting upper limit problems that 64 64 are discussed below. Those limitations would have been aggravated severely by 65 65 huge pages, because each tail page adds a refcount to the head page. And in 66 - fact, testing revealed that, without a separate hpage_pinned_refcount field, 66 + fact, testing revealed that, without a separate compound_pincount field, 67 67 page overflows were seen in some huge page stress tests. 68 68 69 - This also means that huge pages and compound pages (of order > 1) do not suffer 69 + This also means that huge pages and compound pages do not suffer 70 70 from the false positives problem that is mentioned below.:: 71 71 72 72 Function ··· 264 264 Other diagnostics 265 265 ================= 266 266 267 - dump_page() has been enhanced slightly, to handle these new counting fields, and 268 - to better report on compound pages in general. Specifically, for compound pages 269 - with order > 1, the exact (hpage_pinned_refcount) pincount is reported. 267 + dump_page() has been enhanced slightly, to handle these new counting 268 + fields, and to better report on compound pages in general. 
Specifically, 269 + for compound pages, the exact (compound_pincount) pincount is reported. 270 270 271 271 References 272 272 ==========

+1

arch/alpha/include/asm/pgtable.h

··· 233 233 return ((pmd_val(pmd) & _PFN_MASK) >> (32-PAGE_SHIFT)) + PAGE_OFFSET; 234 234 } 235 235 236 + #define pmd_pfn(pmd) (pmd_val(pmd) >> 32) 236 237 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> 32)) 237 238 #define pud_page(pud) (pfn_to_page(pud_val(pud) >> 32)) 238 239

-1

arch/arc/include/asm/hugepage.h

··· 31 31 32 32 #define pmd_write(pmd) pte_write(pmd_pte(pmd)) 33 33 #define pmd_young(pmd) pte_young(pmd_pte(pmd)) 34 - #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 35 34 #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 36 35 37 36 #define mk_pmd(page, prot) pte_pmd(mk_pte(page, prot))

+1

arch/arc/include/asm/pgtable-levels.h

··· 161 161 #define pmd_present(x) (pmd_val(x)) 162 162 #define pmd_clear(xp) do { pmd_val(*(xp)) = 0; } while (0) 163 163 #define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK) 164 + #define pmd_pfn(pmd) ((pmd_val(pmd) & PAGE_MASK) >> PAGE_SHIFT) 164 165 #define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd)) 165 166 #define set_pmd(pmdp, pmd) (*(pmdp) = pmd) 166 167 #define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))

+2

arch/arm/include/asm/pgtable-2level.h

··· 208 208 } 209 209 #define pmd_offset pmd_offset 210 210 211 + #define pmd_pfn(pmd) (__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) 212 + 211 213 #define pmd_large(pmd) (pmd_val(pmd) & 2) 212 214 #define pmd_leaf(pmd) (pmd_val(pmd) & 2) 213 215 #define pmd_bad(pmd) (pmd_val(pmd) & 2)

+1

arch/arm64/mm/mmu.c

··· 17 17 #include <linux/mman.h> 18 18 #include <linux/nodemask.h> 19 19 #include <linux/memblock.h> 20 + #include <linux/memremap.h> 20 21 #include <linux/memory.h> 21 22 #include <linux/fs.h> 22 23 #include <linux/io.h>

+1

arch/csky/include/asm/pgtable.h

··· 30 30 #define pgd_ERROR(e) \ 31 31 pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 32 32 33 + #define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT) 33 34 #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) 34 35 #define pte_clear(mm, addr, ptep) set_pte((ptep), \ 35 36 (((unsigned int) addr >= PAGE_OFFSET) ? __pte(_PAGE_GLOBAL) : __pte(0)))

+5

arch/hexagon/include/asm/pgtable.h

··· 236 236 } 237 237 238 238 /* 239 + * pmd_pfn - converts a PMD entry to a page frame number 240 + */ 241 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 242 + 243 + /* 239 244 * pmd_page - converts a PMD entry to a page pointer 240 245 */ 241 246 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))

+1

arch/ia64/include/asm/pgtable.h

··· 267 267 #define pmd_present(pmd) (pmd_val(pmd) != 0UL) 268 268 #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0UL) 269 269 #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & _PFN_MASK)) 270 + #define pmd_pfn(pmd) ((pmd_val(pmd) & _PFN_MASK) >> PAGE_SHIFT) 270 271 #define pmd_page(pmd) virt_to_page((pmd_val(pmd) + PAGE_OFFSET)) 271 272 272 273 #define pud_none(pud) (!pud_val(pud))

+1

arch/m68k/include/asm/mcf_pgtable.h

··· 322 322 #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) 323 323 #define __swp_entry_to_pte(x) (__pte((x).val)) 324 324 325 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 325 326 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) 326 327 327 328 #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))

+1

arch/m68k/include/asm/motorola_pgtable.h

··· 147 147 #define pmd_present(pmd) (pmd_val(pmd) & _PAGE_TABLE) 148 148 #define pmd_clear(pmdp) ({ pmd_val(*pmdp) = 0; }) 149 149 150 + #define pmd_pfn(pmd) ((pmd_val(pmd) & _TABLE_MASK) >> PAGE_SHIFT) 150 151 /* 151 152 * m68k does not have huge pages (020/030 actually could), but generic code 152 153 * expects pmd_page() to exists, only to then DCE it all. Provide a dummy to

+1

arch/m68k/include/asm/sun3_pgtable.h

··· 130 130 ({ pte_t __pte; pte_val(__pte) = pfn | pgprot_val(pgprot); __pte; }) 131 131 132 132 #define pte_page(pte) virt_to_page(__pte_page(pte)) 133 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 133 134 #define pmd_page(pmd) virt_to_page(pmd_page_vaddr(pmd)) 134 135 135 136

+3

arch/microblaze/include/asm/pgtable.h

··· 399 399 return ((unsigned long) (pmd_val(pmd) & PAGE_MASK)); 400 400 } 401 401 402 + /* returns pfn of the pmd entry*/ 403 + #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) 404 + 402 405 /* returns struct *page of the pmd entry*/ 403 406 #define pmd_page(pmd) (pfn_to_page(__pa(pmd_val(pmd)) >> PAGE_SHIFT)) 404 407

+5 -5

arch/mips/include/asm/pgtable.h

··· 86 86 */ 87 87 #define pmd_phys(pmd) virt_to_phys((void *)pmd_val(pmd)) 88 88 89 + static inline unsigned long pmd_pfn(pmd_t pmd) 90 + { 91 + return pmd_val(pmd) >> _PFN_SHIFT; 92 + } 93 + 89 94 #ifndef CONFIG_MIPS_HUGE_TLB_SUPPORT 90 95 #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) 91 96 #endif /* CONFIG_MIPS_HUGE_TLB_SUPPORT */ ··· 425 420 static inline int pmd_write(pmd_t pmd) 426 421 { 427 422 return !!(pmd_val(pmd) & _PAGE_WRITE); 428 - } 429 - 430 - static inline unsigned long pmd_pfn(pmd_t pmd) 431 - { 432 - return pmd_val(pmd) >> _PFN_SHIFT; 433 423 } 434 424 435 425 static inline struct page *pmd_page(pmd_t pmd)

+1

arch/nds32/include/asm/pgtable.h

··· 308 308 return pmd; 309 309 } 310 310 311 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 311 312 #define pmd_page(pmd) virt_to_page(__va(pmd_val(pmd))) 312 313 313 314 /*

+1

arch/nios2/include/asm/pgtable.h

··· 235 235 * and a page entry and page directory to the page they refer to. 236 236 */ 237 237 #define pmd_phys(pmd) virt_to_phys((void *)pmd_val(pmd)) 238 + #define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT) 238 239 #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) 239 240 240 241 static inline unsigned long pmd_page_vaddr(pmd_t pmd)

+1

arch/openrisc/include/asm/pgtable.h

··· 361 361 pmd_val(*pmdp) = _KERNPG_TABLE | (unsigned long) ptep; 362 362 } 363 363 364 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 364 365 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) 365 366 366 367 static inline unsigned long pmd_page_vaddr(pmd_t pmd)

+1

arch/parisc/include/asm/pgtable.h

··· 408 408 return ((unsigned long) __va(pmd_address(pmd))); 409 409 } 410 410 411 + #define pmd_pfn(pmd) (pmd_address(pmd) >> PAGE_SHIFT) 411 412 #define __pmd_page(pmd) ((unsigned long) __va(pmd_address(pmd))) 412 413 #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) 413 414

+2 -2

arch/powerpc/include/asm/book3s/32/pgtable.h

··· 372 372 #define __HAVE_ARCH_PTE_SAME 373 373 #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0) 374 374 375 - #define pmd_page(pmd) \ 376 - pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 375 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 376 + #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) 377 377 378 378 /* 379 379 * Encode and decode a swap entry.

-1

arch/powerpc/include/asm/mmu_context.h

··· 21 21 #ifdef CONFIG_SPAPR_TCE_IOMMU 22 22 struct mm_iommu_table_group_mem_t; 23 23 24 - extern int isolate_lru_page(struct page *page); /* from internal.h */ 25 24 extern bool mm_iommu_preregistered(struct mm_struct *mm); 26 25 extern long mm_iommu_new(struct mm_struct *mm, 27 26 unsigned long ua, unsigned long entries,

+3 -4

arch/powerpc/include/asm/nohash/32/pgtable.h

··· 349 349 * of the pte page. -- paulus 350 350 */ 351 351 #ifndef CONFIG_BOOKE 352 - #define pmd_page(pmd) \ 353 - pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 352 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 354 353 #else 355 354 #define pmd_page_vaddr(pmd) \ 356 355 ((unsigned long)(pmd_val(pmd) & ~(PTE_TABLE_SIZE - 1))) 357 - #define pmd_page(pmd) \ 358 - pfn_to_page((__pa(pmd_val(pmd)) >> PAGE_SHIFT)) 356 + #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) 359 357 #endif 360 358 359 + #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) 361 360 /* 362 361 * Encode and decode a swap entry. 363 362 * Note that the bits we use in a PTE for representing a swap entry

+1

arch/powerpc/include/asm/nohash/64/pgtable.h

··· 142 142 #define pmd_present(pmd) (!pmd_none(pmd)) 143 143 #define pmd_page_vaddr(pmd) (pmd_val(pmd) & ~PMD_MASKED_BITS) 144 144 extern struct page *pmd_page(pmd_t pmd); 145 + #define pmd_pfn(pmd) (page_to_pfn(pmd_page(pmd))) 145 146 146 147 static inline void pud_set(pud_t *pudp, unsigned long val) 147 148 {

+1 -1

arch/powerpc/kvm/book3s_hv_uvmem.c

··· 91 91 #include <linux/kvm_host.h> 92 92 #include <linux/ksm.h> 93 93 #include <linux/of.h> 94 + #include <linux/memremap.h> 94 95 #include <asm/ultravisor.h> 95 96 #include <asm/mman.h> 96 97 #include <asm/kvm_ppc.h> ··· 713 712 714 713 dpage = pfn_to_page(uvmem_pfn); 715 714 dpage->zone_device_data = pvt; 716 - get_page(dpage); 717 715 lock_page(dpage); 718 716 return dpage; 719 717 out_clear:

+1

arch/powerpc/mm/book3s64/pgtable.c

··· 6 6 #include <linux/sched.h> 7 7 #include <linux/mm_types.h> 8 8 #include <linux/memblock.h> 9 + #include <linux/memremap.h> 9 10 #include <linux/debugfs.h> 10 11 #include <misc/cxl-base.h> 11 12

+1

arch/sh/include/asm/pgtable_32.h

··· 406 406 return (unsigned long)pmd_val(pmd); 407 407 } 408 408 409 + #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) 409 410 #define pmd_page(pmd) (virt_to_page(pmd_val(pmd))) 410 411 411 412 #ifdef CONFIG_X2TLB

+6 -1

arch/sparc/include/asm/pgtable_32.h

··· 127 127 return ((x & 0xF0000000) != 0); 128 128 } 129 129 130 + static inline unsigned long pmd_pfn(pmd_t pmd) 131 + { 132 + return (pmd_val(pmd) & SRMMU_PTD_PMASK) >> (PAGE_SHIFT-4); 133 + } 134 + 130 135 static inline struct page *pmd_page(pmd_t pmd) 131 136 { 132 137 if (srmmu_device_memory(pmd_val(pmd))) 133 138 BUG(); 134 - return pfn_to_page((pmd_val(pmd) & SRMMU_PTD_PMASK) >> (PAGE_SHIFT-4)); 139 + return pfn_to_page(pmd_pfn(pmd)); 135 140 } 136 141 137 142 static inline unsigned long __pmd_page(pmd_t pmd)

+1

arch/um/include/asm/pgtable.h

··· 109 109 #define p4d_newpage(x) (p4d_val(x) & _PAGE_NEWPAGE) 110 110 #define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEWPAGE) 111 111 112 + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) 112 113 #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) 113 114 114 115 #define pte_page(x) pfn_to_page(pte_pfn(x))

+1

arch/xtensa/include/asm/pgtable.h

··· 241 241 * The pmd contains the kernel virtual address of the pte page. 242 242 */ 243 243 #define pmd_page_vaddr(pmd) ((unsigned long)(pmd_val(pmd) & PAGE_MASK)) 244 + #define pmd_pfn(pmd) (__pa(pmd_val(pmd)) >> PAGE_SHIFT) 244 245 #define pmd_page(pmd) virt_to_page(pmd_val(pmd)) 245 246 246 247 /*

+1

drivers/block/xen-blkback/xenbus.c

··· 10 10 11 11 #include <linux/module.h> 12 12 #include <linux/kthread.h> 13 + #include <linux/pagemap.h> 13 14 #include <xen/events.h> 14 15 #include <xen/grant_table.h> 15 16 #include "common.h"

+1 -1

drivers/gpu/drm/amd/amdkfd/kfd_migrate.c

··· 24 24 #include <linux/hmm.h> 25 25 #include <linux/dma-direction.h> 26 26 #include <linux/dma-mapping.h> 27 + #include <linux/migrate.h> 27 28 #include "amdgpu_sync.h" 28 29 #include "amdgpu_object.h" 29 30 #include "amdgpu_vm.h" ··· 225 224 page = pfn_to_page(pfn); 226 225 svm_range_bo_ref(prange->svm_bo); 227 226 page->zone_device_data = prange->svm_bo; 228 - get_page(page); 229 227 lock_page(page); 230 228 } 231 229

+1

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

··· 25 25 26 26 #include <linux/hashtable.h> 27 27 #include <linux/mmu_notifier.h> 28 + #include <linux/memremap.h> 28 29 #include <linux/mutex.h> 29 30 #include <linux/types.h> 30 31 #include <linux/atomic.h>

+1 -1

drivers/gpu/drm/drm_cache.c

··· 27 27 /* 28 28 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> 29 29 */ 30 - 31 30 #include <linux/dma-buf-map.h> 32 31 #include <linux/export.h> 33 32 #include <linux/highmem.h> 34 33 #include <linux/cc_platform.h> 34 + #include <linux/ioport.h> 35 35 #include <xen/xen.h> 36 36 37 37 #include <drm/drm_cache.h>

+2 -1

drivers/gpu/drm/nouveau/nouveau_dmem.c

··· 39 39 40 40 #include <linux/sched/mm.h> 41 41 #include <linux/hmm.h> 42 + #include <linux/memremap.h> 43 + #include <linux/migrate.h> 42 44 43 45 /* 44 46 * FIXME: this is ugly right now we are using TTM to allocate vram and we pin ··· 326 324 return NULL; 327 325 } 328 326 329 - get_page(page); 330 327 lock_page(page); 331 328 return page; 332 329 }

+1

drivers/gpu/drm/nouveau/nouveau_svm.c

··· 35 35 #include <linux/sched/mm.h> 36 36 #include <linux/sort.h> 37 37 #include <linux/hmm.h> 38 + #include <linux/memremap.h> 38 39 #include <linux/rmap.h> 39 40 40 41 struct nouveau_svm {

+1

drivers/infiniband/core/rw.c

··· 2 2 /* 3 3 * Copyright (c) 2016 HGST, a Western Digital Company. 4 4 */ 5 + #include <linux/memremap.h> 5 6 #include <linux/moduleparam.h> 6 7 #include <linux/slab.h> 7 8 #include <linux/pci-p2pdma.h>

+1

drivers/nvdimm/pmem.h

··· 3 3 #define __NVDIMM_PMEM_H__ 4 4 #include <linux/page-flags.h> 5 5 #include <linux/badblocks.h> 6 + #include <linux/memremap.h> 6 7 #include <linux/types.h> 7 8 #include <linux/pfn_t.h> 8 9 #include <linux/fs.h>

+1

drivers/nvme/host/pci.c

··· 15 15 #include <linux/init.h> 16 16 #include <linux/interrupt.h> 17 17 #include <linux/io.h> 18 + #include <linux/memremap.h> 18 19 #include <linux/mm.h> 19 20 #include <linux/module.h> 20 21 #include <linux/mutex.h>

+1

drivers/nvme/target/io-cmd-bdev.c

··· 6 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 7 #include <linux/blkdev.h> 8 8 #include <linux/blk-integrity.h> 9 + #include <linux/memremap.h> 9 10 #include <linux/module.h> 10 11 #include "nvmet.h" 11 12

+1

drivers/usb/gadget/function/f_mass_storage.c

··· 179 179 #include <linux/kthread.h> 180 180 #include <linux/sched/signal.h> 181 181 #include <linux/limits.h> 182 + #include <linux/pagemap.h> 182 183 #include <linux/rwsem.h> 183 184 #include <linux/slab.h> 184 185 #include <linux/spinlock.h>

+1 -1

fs/Kconfig

··· 48 48 bool "File system based Direct Access (DAX) support" 49 49 depends on MMU 50 50 depends on !(ARM || MIPS || SPARC) 51 - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) 51 + depends on ZONE_DEVICE || FS_DAX_LIMITED 52 52 select FS_IOMAP 53 53 select DAX 54 54 help

+1

fs/coda/file.c

··· 14 14 #include <linux/time.h> 15 15 #include <linux/file.h> 16 16 #include <linux/fs.h> 17 + #include <linux/pagemap.h> 17 18 #include <linux/stat.h> 18 19 #include <linux/cred.h> 19 20 #include <linux/errno.h>

+1

fs/fuse/virtio_fs.c

··· 8 8 #include <linux/dax.h> 9 9 #include <linux/pci.h> 10 10 #include <linux/pfn_t.h> 11 + #include <linux/memremap.h> 11 12 #include <linux/module.h> 12 13 #include <linux/virtio.h> 13 14 #include <linux/virtio_fs.h>

+1

fs/iomap/fiemap.c

··· 7 7 #include <linux/fs.h> 8 8 #include <linux/iomap.h> 9 9 #include <linux/fiemap.h> 10 + #include <linux/pagemap.h> 10 11 11 12 static int iomap_to_fiemap(struct fiemap_extent_info *fi, 12 13 const struct iomap *iomap, u32 flags)

+1

fs/nfsd/filecache.c

··· 7 7 #include <linux/hash.h> 8 8 #include <linux/slab.h> 9 9 #include <linux/file.h> 10 + #include <linux/pagemap.h> 10 11 #include <linux/sched.h> 11 12 #include <linux/list_lru.h> 12 13 #include <linux/fsnotify_backend.h>

+1

fs/nfsd/vfs.c

··· 26 26 #include <linux/xattr.h> 27 27 #include <linux/jhash.h> 28 28 #include <linux/ima.h> 29 + #include <linux/pagemap.h> 29 30 #include <linux/slab.h> 30 31 #include <linux/uaccess.h> 31 32 #include <linux/exportfs.h>

+1

fs/proc/page.c

··· 10 10 #include <linux/proc_fs.h> 11 11 #include <linux/seq_file.h> 12 12 #include <linux/hugetlb.h> 13 + #include <linux/memremap.h> 13 14 #include <linux/memcontrol.h> 14 15 #include <linux/mmu_notifier.h> 15 16 #include <linux/page_idle.h>

+12 -12

fs/splice.c

··· 46 46 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, 47 47 struct pipe_buffer *buf) 48 48 { 49 - struct page *page = buf->page; 49 + struct folio *folio = page_folio(buf->page); 50 50 struct address_space *mapping; 51 51 52 - lock_page(page); 52 + folio_lock(folio); 53 53 54 - mapping = page_mapping(page); 54 + mapping = folio_mapping(folio); 55 55 if (mapping) { 56 - WARN_ON(!PageUptodate(page)); 56 + WARN_ON(!folio_test_uptodate(folio)); 57 57 58 58 /* 59 59 * At least for ext2 with nobh option, we need to wait on 60 - * writeback completing on this page, since we'll remove it 60 + * writeback completing on this folio, since we'll remove it 61 61 * from the pagecache. Otherwise truncate wont wait on the 62 - * page, allowing the disk blocks to be reused by someone else 62 + * folio, allowing the disk blocks to be reused by someone else 63 63 * before we actually wrote our data to them. fs corruption 64 64 * ensues. 65 65 */ 66 - wait_on_page_writeback(page); 66 + folio_wait_writeback(folio); 67 67 68 - if (page_has_private(page) && 69 - !try_to_release_page(page, GFP_KERNEL)) 68 + if (folio_has_private(folio) && 69 + !filemap_release_folio(folio, GFP_KERNEL)) 70 70 goto out_unlock; 71 71 72 72 /* 73 73 * If we succeeded in removing the mapping, set LRU flag 74 74 * and return good. 75 75 */ 76 - if (remove_mapping(mapping, page)) { 76 + if (remove_mapping(mapping, folio)) { 77 77 buf->flags |= PIPE_BUF_FLAG_LRU; 78 78 return true; 79 79 } 80 80 } 81 81 82 82 /* 83 - * Raced with truncate or failed to remove page from current 83 + * Raced with truncate or failed to remove folio from current 84 84 * address space, unlock and return failure. 85 85 */ 86 86 out_unlock: 87 - unlock_page(page); 87 + folio_unlock(folio); 88 88 return false; 89 89 } 90 90

+1

fs/vboxsf/utils.c

··· 9 9 #include <linux/namei.h> 10 10 #include <linux/nls.h> 11 11 #include <linux/sizes.h> 12 + #include <linux/pagemap.h> 12 13 #include <linux/vfs.h> 13 14 #include "vfsmod.h" 14 15

-120

include/linux/fs.h

··· 2753 2753 extern void make_bad_inode(struct inode *); 2754 2754 extern bool is_bad_inode(struct inode *); 2755 2755 2756 - unsigned long invalidate_mapping_pages(struct address_space *mapping, 2757 - pgoff_t start, pgoff_t end); 2758 - 2759 - void invalidate_mapping_pagevec(struct address_space *mapping, 2760 - pgoff_t start, pgoff_t end, 2761 - unsigned long *nr_pagevec); 2762 - 2763 - static inline void invalidate_remote_inode(struct inode *inode) 2764 - { 2765 - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2766 - S_ISLNK(inode->i_mode)) 2767 - invalidate_mapping_pages(inode->i_mapping, 0, -1); 2768 - } 2769 - extern int invalidate_inode_pages2(struct address_space *mapping); 2770 - extern int invalidate_inode_pages2_range(struct address_space *mapping, 2771 - pgoff_t start, pgoff_t end); 2772 - extern int write_inode_now(struct inode *, int); 2773 - extern int filemap_fdatawrite(struct address_space *); 2774 - extern int filemap_flush(struct address_space *); 2775 - extern int filemap_fdatawait_keep_errors(struct address_space *mapping); 2776 - extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, 2777 - loff_t lend); 2778 - extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, 2779 - loff_t start_byte, loff_t end_byte); 2780 - 2781 - static inline int filemap_fdatawait(struct address_space *mapping) 2782 - { 2783 - return filemap_fdatawait_range(mapping, 0, LLONG_MAX); 2784 - } 2785 - 2786 - extern bool filemap_range_has_page(struct address_space *, loff_t lstart, 2787 - loff_t lend); 2788 - extern int filemap_write_and_wait_range(struct address_space *mapping, 2789 - loff_t lstart, loff_t lend); 2790 - extern int __filemap_fdatawrite_range(struct address_space *mapping, 2791 - loff_t start, loff_t end, int sync_mode); 2792 - extern int filemap_fdatawrite_range(struct address_space *mapping, 2793 - loff_t start, loff_t end); 2794 - extern int filemap_check_errors(struct address_space *mapping); 2795 - 
extern void __filemap_set_wb_err(struct address_space *mapping, int err); 2796 - int filemap_fdatawrite_wbc(struct address_space *mapping, 2797 - struct writeback_control *wbc); 2798 - 2799 - static inline int filemap_write_and_wait(struct address_space *mapping) 2800 - { 2801 - return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); 2802 - } 2803 - 2804 2756 extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, 2805 2757 loff_t lend); 2806 2758 extern int __must_check file_check_and_advance_wb_err(struct file *file); ··· 2762 2810 static inline int file_write_and_wait(struct file *file) 2763 2811 { 2764 2812 return file_write_and_wait_range(file, 0, LLONG_MAX); 2765 - } 2766 - 2767 - /** 2768 - * filemap_set_wb_err - set a writeback error on an address_space 2769 - * @mapping: mapping in which to set writeback error 2770 - * @err: error to be set in mapping 2771 - * 2772 - * When writeback fails in some way, we must record that error so that 2773 - * userspace can be informed when fsync and the like are called. We endeavor 2774 - * to report errors on any file that was open at the time of the error. Some 2775 - * internal callers also need to know when writeback errors have occurred. 2776 - * 2777 - * When a writeback error occurs, most filesystems will want to call 2778 - * filemap_set_wb_err to record the error in the mapping so that it will be 2779 - * automatically reported whenever fsync is called on the file. 2780 - */ 2781 - static inline void filemap_set_wb_err(struct address_space *mapping, int err) 2782 - { 2783 - /* Fastpath for common case of no error */ 2784 - if (unlikely(err)) 2785 - __filemap_set_wb_err(mapping, err); 2786 - } 2787 - 2788 - /** 2789 - * filemap_check_wb_err - has an error occurred since the mark was sampled? 
2790 - * @mapping: mapping to check for writeback errors 2791 - * @since: previously-sampled errseq_t 2792 - * 2793 - * Grab the errseq_t value from the mapping, and see if it has changed "since" 2794 - * the given value was sampled. 2795 - * 2796 - * If it has then report the latest error set, otherwise return 0. 2797 - */ 2798 - static inline int filemap_check_wb_err(struct address_space *mapping, 2799 - errseq_t since) 2800 - { 2801 - return errseq_check(&mapping->wb_err, since); 2802 - } 2803 - 2804 - /** 2805 - * filemap_sample_wb_err - sample the current errseq_t to test for later errors 2806 - * @mapping: mapping to be sampled 2807 - * 2808 - * Writeback errors are always reported relative to a particular sample point 2809 - * in the past. This function provides those sample points. 2810 - */ 2811 - static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) 2812 - { 2813 - return errseq_sample(&mapping->wb_err); 2814 - } 2815 - 2816 - /** 2817 - * file_sample_sb_err - sample the current errseq_t to test for later errors 2818 - * @file: file pointer to be sampled 2819 - * 2820 - * Grab the most current superblock-level errseq_t value for the given 2821 - * struct file. 2822 - */ 2823 - static inline errseq_t file_sample_sb_err(struct file *file) 2824 - { 2825 - return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); 2826 2813 } 2827 2814 2828 2815 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, ··· 3517 3626 int advice); 3518 3627 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, 3519 3628 int advice); 3520 - 3521 - /* 3522 - * Flush file data before changing attributes. Caller must hold any locks 3523 - * required to prevent further writes to this file until we're done setting 3524 - * flags. 
3525 - */ 3526 - static inline int inode_drain_writes(struct inode *inode) 3527 - { 3528 - inode_dio_wait(inode); 3529 - return filemap_write_and_wait(inode->i_mapping); 3530 - } 3531 3629 3532 3630 #endif /* _LINUX_FS_H */

+2 -7

include/linux/hmm.h

··· 9 9 #ifndef LINUX_HMM_H 10 10 #define LINUX_HMM_H 11 11 12 - #include <linux/kconfig.h> 13 - #include <linux/pgtable.h> 12 + #include <linux/mm.h> 14 13 15 - #include <linux/device.h> 16 - #include <linux/migrate.h> 17 - #include <linux/memremap.h> 18 - #include <linux/completion.h> 19 - #include <linux/mmu_notifier.h> 14 + struct mmu_interval_notifier; 20 15 21 16 /* 22 17 * On output:

+9 -50

include/linux/huge_mm.h

··· 185 185 void free_transhuge_page(struct page *page); 186 186 bool is_transparent_hugepage(struct page *page); 187 187 188 - bool can_split_huge_page(struct page *page, int *pextra_pins); 188 + bool can_split_folio(struct folio *folio, int *pextra_pins); 189 189 int split_huge_page_to_list(struct page *page, struct list_head *list); 190 190 static inline int split_huge_page(struct page *page) 191 191 { ··· 194 194 void deferred_split_huge_page(struct page *page); 195 195 196 196 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 197 - unsigned long address, bool freeze, struct page *page); 197 + unsigned long address, bool freeze, struct folio *folio); 198 198 199 199 #define split_huge_pmd(__vma, __pmd, __address) \ 200 200 do { \ ··· 207 207 208 208 209 209 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 210 - bool freeze, struct page *page); 210 + bool freeze, struct folio *folio); 211 211 212 212 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 213 213 unsigned long address); ··· 248 248 return __pud_trans_huge_lock(pud, vma); 249 249 else 250 250 return NULL; 251 - } 252 - 253 - /** 254 - * thp_order - Order of a transparent huge page. 255 - * @page: Head page of a transparent huge page. 256 - */ 257 - static inline unsigned int thp_order(struct page *page) 258 - { 259 - VM_BUG_ON_PGFLAGS(PageTail(page), page); 260 - if (PageHead(page)) 261 - return HPAGE_PMD_ORDER; 262 - return 0; 263 - } 264 - 265 - /** 266 - * thp_nr_pages - The number of regular pages in this huge page. 267 - * @page: The head page of a huge page. 
268 - */ 269 - static inline int thp_nr_pages(struct page *page) 270 - { 271 - VM_BUG_ON_PGFLAGS(PageTail(page), page); 272 - if (PageHead(page)) 273 - return HPAGE_PMD_NR; 274 - return 1; 275 251 } 276 252 277 253 /** ··· 312 336 #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) 313 337 #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) 314 338 315 - static inline unsigned int thp_order(struct page *page) 316 - { 317 - VM_BUG_ON_PGFLAGS(PageTail(page), page); 318 - return 0; 319 - } 320 - 321 - static inline int thp_nr_pages(struct page *page) 322 - { 323 - VM_BUG_ON_PGFLAGS(PageTail(page), page); 324 - return 1; 325 - } 326 - 327 339 static inline bool folio_test_pmd_mappable(struct folio *folio) 328 340 { 329 341 return false; ··· 351 387 #define thp_get_unmapped_area NULL 352 388 353 389 static inline bool 354 - can_split_huge_page(struct page *page, int *pextra_pins) 390 + can_split_folio(struct folio *folio, int *pextra_pins) 355 391 { 356 392 BUILD_BUG(); 357 393 return false; ··· 370 406 do { } while (0) 371 407 372 408 static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 373 - unsigned long address, bool freeze, struct page *page) {} 409 + unsigned long address, bool freeze, struct folio *folio) {} 374 410 static inline void split_huge_pmd_address(struct vm_area_struct *vma, 375 - unsigned long address, bool freeze, struct page *page) {} 411 + unsigned long address, bool freeze, struct folio *folio) {} 376 412 377 413 #define split_huge_pud(__vma, __pmd, __address) \ 378 414 do { } while (0) ··· 447 483 } 448 484 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 449 485 450 - /** 451 - * thp_size - Size of a transparent huge page. 452 - * @page: Head page of a transparent huge page. 453 - * 454 - * Return: Number of bytes in this page. 
455 - */ 456 - static inline unsigned long thp_size(struct page *page) 486 + static inline int split_folio_to_list(struct folio *folio, 487 + struct list_head *list) 457 488 { 458 - return PAGE_SIZE << thp_order(page); 489 + return split_huge_page_to_list(&folio->page, list); 459 490 } 460 491 461 492 #endif /* _LINUX_HUGE_MM_H */

+5

include/linux/hugetlb.h

··· 970 970 return NULL; 971 971 } 972 972 973 + static inline struct hstate *size_to_hstate(unsigned long size) 974 + { 975 + return NULL; 976 + } 977 + 973 978 static inline unsigned long huge_page_size(struct hstate *h) 974 979 { 975 980 return PAGE_SIZE;

+3 -3

include/linux/ksm.h

··· 51 51 struct page *ksm_might_need_to_copy(struct page *page, 52 52 struct vm_area_struct *vma, unsigned long address); 53 53 54 - void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 54 + void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc); 55 55 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); 56 56 57 57 #else /* !CONFIG_KSM */ ··· 78 78 return page; 79 79 } 80 80 81 - static inline void rmap_walk_ksm(struct page *page, 82 - struct rmap_walk_control *rwc) 81 + static inline void rmap_walk_ksm(struct folio *folio, 82 + const struct rmap_walk_control *rwc) 83 83 { 84 84 } 85 85

+24 -3

include/linux/memremap.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef _LINUX_MEMREMAP_H_ 3 3 #define _LINUX_MEMREMAP_H_ 4 + 5 + #include <linux/mm.h> 4 6 #include <linux/range.h> 5 7 #include <linux/ioport.h> 6 8 #include <linux/percpu-refcount.h> ··· 68 66 69 67 struct dev_pagemap_ops { 70 68 /* 71 - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never 72 - * reach 0 refcount unless there is a refcount bug. This allows the 73 - * device driver to implement its own memory management.) 69 + * Called once the page refcount reaches 0. The reference count will be 70 + * reset to one by the core code after the method is called to prepare 71 + * for handing out the page again. 74 72 */ 75 73 void (*page_free)(struct page *page); 76 74 ··· 129 127 static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) 130 128 { 131 129 return 1 << pgmap->vmemmap_shift; 130 + } 131 + 132 + static inline bool is_device_private_page(const struct page *page) 133 + { 134 + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && 135 + is_zone_device_page(page) && 136 + page->pgmap->type == MEMORY_DEVICE_PRIVATE; 137 + } 138 + 139 + static inline bool folio_is_device_private(const struct folio *folio) 140 + { 141 + return is_device_private_page(&folio->page); 142 + } 143 + 144 + static inline bool is_pci_p2pdma_page(const struct page *page) 145 + { 146 + return IS_ENABLED(CONFIG_PCI_P2PDMA) && 147 + is_zone_device_page(page) && 148 + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; 132 149 } 133 150 134 151 #ifdef CONFIG_ZONE_DEVICE

+152 -144

include/linux/mm.h

··· 3 3 #define _LINUX_MM_H 4 4 5 5 #include <linux/errno.h> 6 - 7 - #ifdef __KERNEL__ 8 - 9 6 #include <linux/mmdebug.h> 10 7 #include <linux/gfp.h> 11 8 #include <linux/bug.h> ··· 23 26 #include <linux/err.h> 24 27 #include <linux/page-flags.h> 25 28 #include <linux/page_ref.h> 26 - #include <linux/memremap.h> 27 29 #include <linux/overflow.h> 28 30 #include <linux/sizes.h> 29 31 #include <linux/sched.h> ··· 212 216 213 217 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 214 218 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) 219 + #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) 215 220 #else 216 221 #define nth_page(page,n) ((page) + (n)) 222 + #define folio_page_idx(folio, p) ((p) - &(folio)->page) 217 223 #endif 218 224 219 225 /* to align the pointer to the (next) page boundary */ ··· 225 227 #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) 226 228 227 229 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) 230 + static inline struct folio *lru_to_folio(struct list_head *head) 231 + { 232 + return list_entry((head)->prev, struct folio, lru); 233 + } 228 234 229 235 void setup_initial_init_mm(void *start_code, void *end_code, 230 236 void *end_data, void *brk); ··· 777 775 } 778 776 #endif 779 777 780 - static inline int head_compound_mapcount(struct page *head) 778 + /* 779 + * How many times the entire folio is mapped as a single unit (eg by a 780 + * PMD or PUD entry). This is probably not what you want, except for 781 + * debugging purposes; look at folio_mapcount() or page_mapcount() 782 + * instead. 
783 + */ 784 + static inline int folio_entire_mapcount(struct folio *folio) 781 785 { 782 - return atomic_read(compound_mapcount_ptr(head)) + 1; 786 + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); 787 + return atomic_read(folio_mapcount_ptr(folio)) + 1; 783 788 } 784 789 785 790 /* 786 791 * Mapcount of compound page as a whole, does not include mapped sub-pages. 787 792 * 788 - * Must be called only for compound pages or any their tail sub-pages. 793 + * Must be called only for compound pages. 789 794 */ 790 795 static inline int compound_mapcount(struct page *page) 791 796 { 792 - VM_BUG_ON_PAGE(!PageCompound(page), page); 793 - page = compound_head(page); 794 - return head_compound_mapcount(page); 797 + return folio_entire_mapcount(page_folio(page)); 795 798 } 796 799 797 800 /* ··· 826 819 return atomic_read(&page->_mapcount) + 1; 827 820 } 828 821 822 + int folio_mapcount(struct folio *folio); 823 + 829 824 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 830 - int total_mapcount(struct page *page); 825 + static inline int total_mapcount(struct page *page) 826 + { 827 + return folio_mapcount(page_folio(page)); 828 + } 829 + 831 830 int page_trans_huge_mapcount(struct page *page); 832 831 #else 833 832 static inline int total_mapcount(struct page *page) ··· 903 890 compound_page_dtors[page[1].compound_dtor](page); 904 891 } 905 892 906 - static inline bool hpage_pincount_available(struct page *page) 907 - { 908 - /* 909 - * Can the page->hpage_pinned_refcount field be used? That field is in 910 - * the 3rd page of the compound page, so the smallest (2-page) compound 911 - * pages cannot support it. 
912 - */ 913 - page = compound_head(page); 914 - return PageCompound(page) && compound_order(page) > 1; 915 - } 916 - 917 893 static inline int head_compound_pincount(struct page *head) 918 894 { 919 895 return atomic_read(compound_pincount_ptr(head)); 920 896 } 921 897 922 - static inline int compound_pincount(struct page *page) 923 - { 924 - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); 925 - page = compound_head(page); 926 - return head_compound_pincount(page); 927 - } 928 - 929 898 static inline void set_compound_order(struct page *page, unsigned int order) 930 899 { 931 900 page[1].compound_order = order; 901 + #ifdef CONFIG_64BIT 932 902 page[1].compound_nr = 1U << order; 903 + #endif 933 904 } 934 905 935 906 /* Returns the number of pages in this potentially compound page. */ ··· 921 924 { 922 925 if (!PageHead(page)) 923 926 return 1; 927 + #ifdef CONFIG_64BIT 924 928 return page[1].compound_nr; 929 + #else 930 + return 1UL << compound_order(page); 931 + #endif 925 932 } 926 933 927 934 /* Returns the number of bytes in this potentially compound page. */ ··· 938 937 static inline unsigned int page_shift(struct page *page) 939 938 { 940 939 return PAGE_SHIFT + compound_order(page); 940 + } 941 + 942 + /** 943 + * thp_order - Order of a transparent huge page. 944 + * @page: Head page of a transparent huge page. 945 + */ 946 + static inline unsigned int thp_order(struct page *page) 947 + { 948 + VM_BUG_ON_PGFLAGS(PageTail(page), page); 949 + return compound_order(page); 950 + } 951 + 952 + /** 953 + * thp_nr_pages - The number of regular pages in this huge page. 954 + * @page: The head page of a huge page. 955 + */ 956 + static inline int thp_nr_pages(struct page *page) 957 + { 958 + VM_BUG_ON_PGFLAGS(PageTail(page), page); 959 + return compound_nr(page); 960 + } 961 + 962 + /** 963 + * thp_size - Size of a transparent huge page. 964 + * @page: Head page of a transparent huge page. 965 + * 966 + * Return: Number of bytes in this page. 
967 + */ 968 + static inline unsigned long thp_size(struct page *page) 969 + { 970 + return PAGE_SIZE << thp_order(page); 941 971 } 942 972 943 973 void free_compound_page(struct page *page); ··· 1122 1090 } 1123 1091 #endif 1124 1092 1093 + static inline bool folio_is_zone_device(const struct folio *folio) 1094 + { 1095 + return is_zone_device_page(&folio->page); 1096 + } 1097 + 1125 1098 static inline bool is_zone_movable_page(const struct page *page) 1126 1099 { 1127 1100 return page_zonenum(page) == ZONE_MOVABLE; 1128 1101 } 1129 1102 1130 - #ifdef CONFIG_DEV_PAGEMAP_OPS 1131 - void free_devmap_managed_page(struct page *page); 1103 + #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) 1132 1104 DECLARE_STATIC_KEY_FALSE(devmap_managed_key); 1133 1105 1134 - static inline bool page_is_devmap_managed(struct page *page) 1106 + bool __put_devmap_managed_page(struct page *page); 1107 + static inline bool put_devmap_managed_page(struct page *page) 1135 1108 { 1136 1109 if (!static_branch_unlikely(&devmap_managed_key)) 1137 1110 return false; 1138 1111 if (!is_zone_device_page(page)) 1139 1112 return false; 1140 - switch (page->pgmap->type) { 1141 - case MEMORY_DEVICE_PRIVATE: 1142 - case MEMORY_DEVICE_FS_DAX: 1143 - return true; 1144 - default: 1145 - break; 1146 - } 1147 - return false; 1113 + return __put_devmap_managed_page(page); 1148 1114 } 1149 1115 1150 - void put_devmap_managed_page(struct page *page); 1151 - 1152 - #else /* CONFIG_DEV_PAGEMAP_OPS */ 1153 - static inline bool page_is_devmap_managed(struct page *page) 1116 + #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ 1117 + static inline bool put_devmap_managed_page(struct page *page) 1154 1118 { 1155 1119 return false; 1156 1120 } 1157 - 1158 - static inline void put_devmap_managed_page(struct page *page) 1159 - { 1160 - } 1161 - #endif /* CONFIG_DEV_PAGEMAP_OPS */ 1162 - 1163 - static inline bool is_device_private_page(const struct page *page) 1164 - { 1165 - return 
IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && 1166 - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && 1167 - is_zone_device_page(page) && 1168 - page->pgmap->type == MEMORY_DEVICE_PRIVATE; 1169 - } 1170 - 1171 - static inline bool is_pci_p2pdma_page(const struct page *page) 1172 - { 1173 - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && 1174 - IS_ENABLED(CONFIG_PCI_P2PDMA) && 1175 - is_zone_device_page(page) && 1176 - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; 1177 - } 1121 + #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ 1178 1122 1179 1123 /* 127: arbitrary random number, small enough to assemble well */ 1180 1124 #define folio_ref_zero_or_close_to_overflow(folio) \ ··· 1176 1168 } 1177 1169 1178 1170 bool __must_check try_grab_page(struct page *page, unsigned int flags); 1179 - struct page *try_grab_compound_head(struct page *page, int refs, 1180 - unsigned int flags); 1181 - 1182 1171 1183 1172 static inline __must_check bool try_get_page(struct page *page) 1184 1173 { ··· 1230 1225 struct folio *folio = page_folio(page); 1231 1226 1232 1227 /* 1233 - * For devmap managed pages we need to catch refcount transition from 1234 - * 2 to 1, when refcount reach one it means the page is free and we 1235 - * need to inform the device driver through callback. See 1236 - * include/linux/memremap.h and HMM for details. 1228 + * For some devmap managed pages we need to catch refcount transition 1229 + * from 2 to 1: 1237 1230 */ 1238 - if (page_is_devmap_managed(&folio->page)) { 1239 - put_devmap_managed_page(&folio->page); 1231 + if (put_devmap_managed_page(&folio->page)) 1240 1232 return; 1241 - } 1242 - 1243 1233 folio_put(folio); 1244 1234 } 1245 1235 ··· 1264 1264 * applications that don't have huge page reference counts, this won't be an 1265 1265 * issue. 
1266 1266 * 1267 - * Locking: the lockless algorithm described in page_cache_get_speculative() 1268 - * and page_cache_gup_pin_speculative() provides safe operation for 1269 - * get_user_pages and page_mkclean and other calls that race to set up page 1270 - * table entries. 1267 + * Locking: the lockless algorithm described in folio_try_get_rcu() 1268 + * provides safe operation for get_user_pages(), page_mkclean() and 1269 + * other calls that race to set up page table entries. 1271 1270 */ 1272 1271 #define GUP_PIN_COUNTING_BIAS (1U << 10) 1273 1272 ··· 1277 1278 bool make_dirty); 1278 1279 void unpin_user_pages(struct page **pages, unsigned long npages); 1279 1280 1280 - /** 1281 - * page_maybe_dma_pinned - Report if a page is pinned for DMA. 1282 - * @page: The page. 1283 - * 1284 - * This function checks if a page has been pinned via a call to 1285 - * a function in the pin_user_pages() family. 1286 - * 1287 - * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, 1288 - * because it means "definitely not pinned for DMA", but true means "probably 1289 - * pinned for DMA, but possibly a false positive due to having at least 1290 - * GUP_PIN_COUNTING_BIAS worth of normal page references". 1291 - * 1292 - * False positives are OK, because: a) it's unlikely for a page to get that many 1293 - * refcounts, and b) all the callers of this routine are expected to be able to 1294 - * deal gracefully with a false positive. 1295 - * 1296 - * For huge pages, the result will be exactly correct. That's because we have 1297 - * more tracking data available: the 3rd struct page in the compound page is 1298 - * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS 1299 - * scheme). 1300 - * 1301 - * For more information, please see Documentation/core-api/pin_user_pages.rst. 1302 - * 1303 - * Return: True, if it is likely that the page has been "dma-pinned". 1304 - * False, if the page is definitely not dma-pinned. 
1305 - */ 1306 - static inline bool page_maybe_dma_pinned(struct page *page) 1307 - { 1308 - if (hpage_pincount_available(page)) 1309 - return compound_pincount(page) > 0; 1310 - 1311 - /* 1312 - * page_ref_count() is signed. If that refcount overflows, then 1313 - * page_ref_count() returns a negative value, and callers will avoid 1314 - * further incrementing the refcount. 1315 - * 1316 - * Here, for that overflow case, use the signed bit to count a little 1317 - * bit higher via unsigned math, and thus still get an accurate result. 1318 - */ 1319 - return ((unsigned int)page_ref_count(compound_head(page))) >= 1320 - GUP_PIN_COUNTING_BIAS; 1321 - } 1322 - 1323 1281 static inline bool is_cow_mapping(vm_flags_t flags) 1324 1282 { 1325 1283 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1326 - } 1327 - 1328 - /* 1329 - * This should most likely only be called during fork() to see whether we 1330 - * should break the cow immediately for a page on the src mm. 1331 - */ 1332 - static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, 1333 - struct page *page) 1334 - { 1335 - if (!is_cow_mapping(vma->vm_flags)) 1336 - return false; 1337 - 1338 - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) 1339 - return false; 1340 - 1341 - return page_maybe_dma_pinned(page); 1342 1284 } 1343 1285 1344 1286 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) ··· 1526 1586 return page_to_pfn(&folio->page); 1527 1587 } 1528 1588 1589 + static inline atomic_t *folio_pincount_ptr(struct folio *folio) 1590 + { 1591 + return &folio_page(folio, 1)->compound_pincount; 1592 + } 1593 + 1594 + /** 1595 + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. 1596 + * @folio: The folio. 1597 + * 1598 + * This function checks if a folio has been pinned via a call to 1599 + * a function in the pin_user_pages() family. 
1600 + * 1601 + * For small folios, the return value is partially fuzzy: false is not fuzzy, 1602 + * because it means "definitely not pinned for DMA", but true means "probably 1603 + * pinned for DMA, but possibly a false positive due to having at least 1604 + * GUP_PIN_COUNTING_BIAS worth of normal folio references". 1605 + * 1606 + * False positives are OK, because: a) it's unlikely for a folio to 1607 + * get that many refcounts, and b) all the callers of this routine are 1608 + * expected to be able to deal gracefully with a false positive. 1609 + * 1610 + * For large folios, the result will be exactly correct. That's because 1611 + * we have more tracking data available: the compound_pincount is used 1612 + * instead of the GUP_PIN_COUNTING_BIAS scheme. 1613 + * 1614 + * For more information, please see Documentation/core-api/pin_user_pages.rst. 1615 + * 1616 + * Return: True, if it is likely that the page has been "dma-pinned". 1617 + * False, if the page is definitely not dma-pinned. 1618 + */ 1619 + static inline bool folio_maybe_dma_pinned(struct folio *folio) 1620 + { 1621 + if (folio_test_large(folio)) 1622 + return atomic_read(folio_pincount_ptr(folio)) > 0; 1623 + 1624 + /* 1625 + * folio_ref_count() is signed. If that refcount overflows, then 1626 + * folio_ref_count() returns a negative value, and callers will avoid 1627 + * further incrementing the refcount. 1628 + * 1629 + * Here, for that overflow case, use the sign bit to count a little 1630 + * bit higher via unsigned math, and thus still get an accurate result. 1631 + */ 1632 + return ((unsigned int)folio_ref_count(folio)) >= 1633 + GUP_PIN_COUNTING_BIAS; 1634 + } 1635 + 1636 + static inline bool page_maybe_dma_pinned(struct page *page) 1637 + { 1638 + return folio_maybe_dma_pinned(page_folio(page)); 1639 + } 1640 + 1641 + /* 1642 + * This should most likely only be called during fork() to see whether we 1643 + * should break the cow immediately for a page on the src mm. 
1644 + */ 1645 + static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, 1646 + struct page *page) 1647 + { 1648 + if (!is_cow_mapping(vma->vm_flags)) 1649 + return false; 1650 + 1651 + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) 1652 + return false; 1653 + 1654 + return page_maybe_dma_pinned(page); 1655 + } 1656 + 1529 1657 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ 1530 1658 #ifdef CONFIG_MIGRATION 1531 1659 static inline bool is_pinnable_page(struct page *page) ··· 1607 1599 return true; 1608 1600 } 1609 1601 #endif 1602 + 1603 + static inline bool folio_is_pinnable(struct folio *folio) 1604 + { 1605 + return is_pinnable_page(&folio->page); 1606 + } 1610 1607 1611 1608 static inline void set_page_zone(struct page *page, enum zone_type zone) 1612 1609 { ··· 1762 1749 } 1763 1750 1764 1751 extern void *page_rmapping(struct page *page); 1765 - extern struct anon_vma *page_anon_vma(struct page *page); 1766 1752 extern pgoff_t __page_file_index(struct page *page); 1767 1753 1768 1754 /* ··· 1867 1855 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); 1868 1856 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); 1869 1857 int generic_error_remove_page(struct address_space *mapping, struct page *page); 1870 - int invalidate_inode_page(struct page *page); 1871 1858 1872 1859 #ifdef CONFIG_MMU 1873 1860 extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, ··· 2932 2921 #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 2933 2922 #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO 2934 2923 * and return without waiting upon it */ 2935 - #define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ 2936 2924 #define FOLL_NOFAULT 0x80 /* do not fault in pages */ 2937 2925 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ 2938 2926 #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ 2939 2927 #define FOLL_MIGRATION 0x400 
/* wait for page to replace migration entry */ 2940 2928 #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ 2941 - #define FOLL_MLOCK 0x1000 /* lock present pages */ 2942 2929 #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ 2943 2930 #define FOLL_COW 0x4000 /* internal GUP flag */ 2944 2931 #define FOLL_ANON 0x8000 /* don't do file mappings */ ··· 3390 3381 } 3391 3382 #endif 3392 3383 3393 - #endif /* __KERNEL__ */ 3394 3384 #endif /* _LINUX_MM_H */

+8 -3

include/linux/mm_inline.h

··· 99 99 100 100 update_lru_size(lruvec, lru, folio_zonenum(folio), 101 101 folio_nr_pages(folio)); 102 - list_add(&folio->lru, &lruvec->lists[lru]); 102 + if (lru != LRU_UNEVICTABLE) 103 + list_add(&folio->lru, &lruvec->lists[lru]); 103 104 } 104 105 105 106 static __always_inline void add_page_to_lru_list(struct page *page, ··· 116 115 117 116 update_lru_size(lruvec, lru, folio_zonenum(folio), 118 117 folio_nr_pages(folio)); 118 + /* This is not expected to be used on LRU_UNEVICTABLE */ 119 119 list_add_tail(&folio->lru, &lruvec->lists[lru]); 120 120 } 121 121 ··· 129 127 static __always_inline 130 128 void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) 131 129 { 132 - list_del(&folio->lru); 133 - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), 130 + enum lru_list lru = folio_lru_list(folio); 131 + 132 + if (lru != LRU_UNEVICTABLE) 133 + list_del(&folio->lru); 134 + update_lru_size(lruvec, lru, folio_zonenum(folio), 134 135 -folio_nr_pages(folio)); 135 136 } 136 137

+22 -4

include/linux/mm_types.h

··· 85 85 * lruvec->lru_lock. Sometimes used as a generic list 86 86 * by the page owner. 87 87 */ 88 - struct list_head lru; 88 + union { 89 + struct list_head lru; 90 + /* Or, for the Unevictable "LRU list" slot */ 91 + struct { 92 + /* Always even, to negate PageTail */ 93 + void *__filler; 94 + /* Count page's or folio's mlocks */ 95 + unsigned int mlock_count; 96 + }; 97 + }; 89 98 /* See page-flags.h for PAGE_MAPPING_FLAGS */ 90 99 struct address_space *mapping; 91 100 pgoff_t index; /* Our offset within mapping. */ ··· 135 126 unsigned char compound_dtor; 136 127 unsigned char compound_order; 137 128 atomic_t compound_mapcount; 129 + atomic_t compound_pincount; 130 + #ifdef CONFIG_64BIT 138 131 unsigned int compound_nr; /* 1 << compound_order */ 132 + #endif 139 133 }; 140 134 struct { /* Second tail page of compound page */ 141 135 unsigned long _compound_pad_1; /* compound_head */ 142 - atomic_t hpage_pinned_refcount; 136 + unsigned long _compound_pad_2; 143 137 /* For both global and memcg */ 144 138 struct list_head deferred_list; 145 139 }; ··· 253 241 struct { 254 242 /* public: */ 255 243 unsigned long flags; 256 - struct list_head lru; 244 + union { 245 + struct list_head lru; 246 + struct { 247 + void *__filler; 248 + unsigned int mlock_count; 249 + }; 250 + }; 257 251 struct address_space *mapping; 258 252 pgoff_t index; 259 253 void *private; ··· 303 285 304 286 static inline atomic_t *compound_pincount_ptr(struct page *page) 305 287 { 306 - return &page[2].hpage_pinned_refcount; 288 + return &page[1].compound_pincount; 307 289 } 308 290 309 291 /*

+133 -13

include/linux/pagemap.h

··· 18 18 19 19 struct folio_batch; 20 20 21 + unsigned long invalidate_mapping_pages(struct address_space *mapping, 22 + pgoff_t start, pgoff_t end); 23 + 24 + static inline void invalidate_remote_inode(struct inode *inode) 25 + { 26 + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 27 + S_ISLNK(inode->i_mode)) 28 + invalidate_mapping_pages(inode->i_mapping, 0, -1); 29 + } 30 + int invalidate_inode_pages2(struct address_space *mapping); 31 + int invalidate_inode_pages2_range(struct address_space *mapping, 32 + pgoff_t start, pgoff_t end); 33 + int write_inode_now(struct inode *, int sync); 34 + int filemap_fdatawrite(struct address_space *); 35 + int filemap_flush(struct address_space *); 36 + int filemap_fdatawait_keep_errors(struct address_space *mapping); 37 + int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); 38 + int filemap_fdatawait_range_keep_errors(struct address_space *mapping, 39 + loff_t start_byte, loff_t end_byte); 40 + 41 + static inline int filemap_fdatawait(struct address_space *mapping) 42 + { 43 + return filemap_fdatawait_range(mapping, 0, LLONG_MAX); 44 + } 45 + 46 + bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); 47 + int filemap_write_and_wait_range(struct address_space *mapping, 48 + loff_t lstart, loff_t lend); 49 + int __filemap_fdatawrite_range(struct address_space *mapping, 50 + loff_t start, loff_t end, int sync_mode); 51 + int filemap_fdatawrite_range(struct address_space *mapping, 52 + loff_t start, loff_t end); 53 + int filemap_check_errors(struct address_space *mapping); 54 + void __filemap_set_wb_err(struct address_space *mapping, int err); 55 + int filemap_fdatawrite_wbc(struct address_space *mapping, 56 + struct writeback_control *wbc); 57 + 58 + static inline int filemap_write_and_wait(struct address_space *mapping) 59 + { 60 + return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); 61 + } 62 + 63 + /** 64 + * filemap_set_wb_err - set a writeback error on 
an address_space 65 + * @mapping: mapping in which to set writeback error 66 + * @err: error to be set in mapping 67 + * 68 + * When writeback fails in some way, we must record that error so that 69 + * userspace can be informed when fsync and the like are called. We endeavor 70 + * to report errors on any file that was open at the time of the error. Some 71 + * internal callers also need to know when writeback errors have occurred. 72 + * 73 + * When a writeback error occurs, most filesystems will want to call 74 + * filemap_set_wb_err to record the error in the mapping so that it will be 75 + * automatically reported whenever fsync is called on the file. 76 + */ 77 + static inline void filemap_set_wb_err(struct address_space *mapping, int err) 78 + { 79 + /* Fastpath for common case of no error */ 80 + if (unlikely(err)) 81 + __filemap_set_wb_err(mapping, err); 82 + } 83 + 84 + /** 85 + * filemap_check_wb_err - has an error occurred since the mark was sampled? 86 + * @mapping: mapping to check for writeback errors 87 + * @since: previously-sampled errseq_t 88 + * 89 + * Grab the errseq_t value from the mapping, and see if it has changed "since" 90 + * the given value was sampled. 91 + * 92 + * If it has then report the latest error set, otherwise return 0. 93 + */ 94 + static inline int filemap_check_wb_err(struct address_space *mapping, 95 + errseq_t since) 96 + { 97 + return errseq_check(&mapping->wb_err, since); 98 + } 99 + 100 + /** 101 + * filemap_sample_wb_err - sample the current errseq_t to test for later errors 102 + * @mapping: mapping to be sampled 103 + * 104 + * Writeback errors are always reported relative to a particular sample point 105 + * in the past. This function provides those sample points. 
106 + */ 107 + static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) 108 + { 109 + return errseq_sample(&mapping->wb_err); 110 + } 111 + 112 + /** 113 + * file_sample_sb_err - sample the current errseq_t to test for later errors 114 + * @file: file pointer to be sampled 115 + * 116 + * Grab the most current superblock-level errseq_t value for the given 117 + * struct file. 118 + */ 119 + static inline errseq_t file_sample_sb_err(struct file *file) 120 + { 121 + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); 122 + } 123 + 124 + /* 125 + * Flush file data before changing attributes. Caller must hold any locks 126 + * required to prevent further writes to this file until we're done setting 127 + * flags. 128 + */ 129 + static inline int inode_drain_writes(struct inode *inode) 130 + { 131 + inode_dio_wait(inode); 132 + return filemap_write_and_wait(inode->i_mapping); 133 + } 134 + 21 135 static inline bool mapping_empty(struct address_space *mapping) 22 136 { 23 137 return xa_empty(&mapping->i_pages); ··· 306 192 __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); 307 193 } 308 194 195 + /* 196 + * Large folio support currently depends on THP. These dependencies are 197 + * being worked on but are not yet fixed. 
198 + */ 309 199 static inline bool mapping_large_folio_support(struct address_space *mapping) 310 200 { 311 - return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); 201 + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 202 + test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); 312 203 } 313 204 314 205 static inline int filemap_nr_thps(struct address_space *mapping) ··· 331 212 if (!mapping_large_folio_support(mapping)) 332 213 atomic_inc(&mapping->nr_thps); 333 214 #else 334 - WARN_ON_ONCE(1); 215 + WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); 335 216 #endif 336 217 } 337 218 ··· 341 222 if (!mapping_large_folio_support(mapping)) 342 223 atomic_dec(&mapping->nr_thps); 343 224 #else 344 - WARN_ON_ONCE(1); 225 + WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0); 345 226 #endif 346 227 } 347 228 ··· 400 281 static inline struct inode *folio_inode(struct folio *folio) 401 282 { 402 283 return folio->mapping->host; 403 - } 404 - 405 - static inline bool page_cache_add_speculative(struct page *page, int count) 406 - { 407 - return folio_ref_try_add_rcu((struct folio *)page, count); 408 - } 409 - 410 - static inline bool page_cache_get_speculative(struct page *page) 411 - { 412 - return page_cache_add_speculative(page, 1); 413 284 } 414 285 415 286 /** ··· 813 704 static inline loff_t folio_file_pos(struct folio *folio) 814 705 { 815 706 return page_file_offset(&folio->page); 707 + } 708 + 709 + /* 710 + * Get the offset in PAGE_SIZE (even for hugetlb folios). 711 + * (TODO: hugetlb folios should have ->index in PAGE_SIZE) 712 + */ 713 + static inline pgoff_t folio_pgoff(struct folio *folio) 714 + { 715 + if (unlikely(folio_test_hugetlb(folio))) 716 + return hugetlb_basepage_index(&folio->page); 717 + return folio->index; 816 718 } 817 719 818 720 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,

+47 -29

include/linux/rmap.h

··· 11 11 #include <linux/rwsem.h> 12 12 #include <linux/memcontrol.h> 13 13 #include <linux/highmem.h> 14 + #include <linux/pagemap.h> 14 15 15 16 /* 16 17 * The anon_vma heads a list of private "related" vmas, to scan if ··· 168 167 */ 169 168 void page_move_anon_rmap(struct page *, struct vm_area_struct *); 170 169 void page_add_anon_rmap(struct page *, struct vm_area_struct *, 171 - unsigned long, bool); 170 + unsigned long address, bool compound); 172 171 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, 173 - unsigned long, int); 172 + unsigned long address, int flags); 174 173 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, 175 - unsigned long, bool); 176 - void page_add_file_rmap(struct page *, bool); 177 - void page_remove_rmap(struct page *, bool); 178 - 174 + unsigned long address, bool compound); 175 + void page_add_file_rmap(struct page *, struct vm_area_struct *, 176 + bool compound); 177 + void page_remove_rmap(struct page *, struct vm_area_struct *, 178 + bool compound); 179 179 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, 180 - unsigned long); 180 + unsigned long address); 181 181 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, 182 - unsigned long); 182 + unsigned long address); 183 183 184 184 static inline void page_dup_rmap(struct page *page, bool compound) 185 185 { ··· 190 188 /* 191 189 * Called from mm/vmscan.c to handle paging out 192 190 */ 193 - int page_referenced(struct page *, int is_locked, 191 + int folio_referenced(struct folio *, int is_locked, 194 192 struct mem_cgroup *memcg, unsigned long *vm_flags); 195 193 196 - void try_to_migrate(struct page *page, enum ttu_flags flags); 197 - void try_to_unmap(struct page *, enum ttu_flags flags); 194 + void try_to_migrate(struct folio *folio, enum ttu_flags flags); 195 + void try_to_unmap(struct folio *, enum ttu_flags flags); 198 196 199 197 int make_device_exclusive_range(struct mm_struct *mm, unsigned 
long start, 200 198 unsigned long end, struct page **pages, ··· 202 200 203 201 /* Avoid racy checks */ 204 202 #define PVMW_SYNC (1 << 0) 205 - /* Look for migarion entries rather than present PTEs */ 203 + /* Look for migration entries rather than present PTEs */ 206 204 #define PVMW_MIGRATION (1 << 1) 207 205 208 206 struct page_vma_mapped_walk { 209 - struct page *page; 207 + unsigned long pfn; 208 + unsigned long nr_pages; 209 + pgoff_t pgoff; 210 210 struct vm_area_struct *vma; 211 211 unsigned long address; 212 212 pmd_t *pmd; ··· 217 213 unsigned int flags; 218 214 }; 219 215 216 + #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ 217 + struct page_vma_mapped_walk name = { \ 218 + .pfn = page_to_pfn(_page), \ 219 + .nr_pages = compound_nr(_page), \ 220 + .pgoff = page_to_pgoff(_page), \ 221 + .vma = _vma, \ 222 + .address = _address, \ 223 + .flags = _flags, \ 224 + } 225 + 226 + #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ 227 + struct page_vma_mapped_walk name = { \ 228 + .pfn = folio_pfn(_folio), \ 229 + .nr_pages = folio_nr_pages(_folio), \ 230 + .pgoff = folio_pgoff(_folio), \ 231 + .vma = _vma, \ 232 + .address = _address, \ 233 + .flags = _flags, \ 234 + } 235 + 220 236 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) 221 237 { 222 238 /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ 223 - if (pvmw->pte && !PageHuge(pvmw->page)) 239 + if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma)) 240 pte_unmap(pvmw->pte); 241 if (pvmw->ptl) 242 spin_unlock(pvmw->ptl); ··· 261 237 */ 262 238 int folio_mkclean(struct folio *); 263 239 264 - /* 265 - * called in munlock()/munmap() path to check for other vmas holding 266 - * the page mlocked. 
267 - */ 268 - void page_mlock(struct page *page); 269 - 270 - void remove_migration_ptes(struct page *old, struct page *new, bool locked); 240 + void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); 271 241 272 242 /* 273 243 * Called by memory-failure.c to kill processes. 274 244 */ 275 - struct anon_vma *page_lock_anon_vma_read(struct page *page); 245 + struct anon_vma *folio_lock_anon_vma_read(struct folio *folio); 276 246 void page_unlock_anon_vma_read(struct anon_vma *anon_vma); 277 247 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); 278 248 ··· 285 267 * Return false if page table scanning in rmap_walk should be stopped. 286 268 * Otherwise, return true. 287 269 */ 288 - bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, 270 + bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, 289 271 unsigned long addr, void *arg); 290 - int (*done)(struct page *page); 291 - struct anon_vma *(*anon_lock)(struct page *page); 272 + int (*done)(struct folio *folio); 273 + struct anon_vma *(*anon_lock)(struct folio *folio); 292 274 bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); 293 275 }; 294 276 295 - void rmap_walk(struct page *page, struct rmap_walk_control *rwc); 296 - void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); 277 + void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc); 278 + void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc); 297 279 298 280 #else /* !CONFIG_MMU */ 299 281 ··· 301 283 #define anon_vma_prepare(vma) (0) 302 284 #define anon_vma_link(vma) do {} while (0) 303 285 304 - static inline int page_referenced(struct page *page, int is_locked, 286 + static inline int folio_referenced(struct folio *folio, int is_locked, 305 287 struct mem_cgroup *memcg, 306 288 unsigned long *vm_flags) 307 289 { ··· 309 291 return 0; 310 292 } 311 293 312 - static inline void try_to_unmap(struct page *page, enum 
ttu_flags flags) 294 + static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags) 313 295 { 314 296 } 315 297

+4 -5

include/linux/swap.h

··· 328 328 329 329 /* linux/mm/workingset.c */ 330 330 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); 331 - void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); 331 + void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); 332 332 void workingset_refault(struct folio *folio, void *shadow); 333 333 void workingset_activation(struct folio *folio); 334 334 ··· 375 375 extern void lru_add_drain_cpu(int cpu); 376 376 extern void lru_add_drain_cpu_zone(struct zone *zone); 377 377 extern void lru_add_drain_all(void); 378 - extern void deactivate_file_page(struct page *page); 379 378 extern void deactivate_page(struct page *page); 380 379 extern void mark_page_lazyfree(struct page *page); 381 380 extern void swap_setup(void); ··· 396 397 unsigned long *nr_scanned); 397 398 extern unsigned long shrink_all_memory(unsigned long nr_pages); 398 399 extern int vm_swappiness; 399 - extern int remove_mapping(struct address_space *mapping, struct page *page); 400 + long remove_mapping(struct address_space *mapping, struct folio *folio); 400 401 401 402 extern unsigned long reclaim_pages(struct list_head *page_list); 402 403 #ifdef CONFIG_NUMA ··· 742 743 #endif 743 744 744 745 #ifdef CONFIG_MEMCG_SWAP 745 - extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); 746 + void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); 746 747 extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); 747 748 static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) 748 749 { ··· 762 763 extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); 763 764 extern bool mem_cgroup_swap_full(struct page *page); 764 765 #else 765 - static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 766 + static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 766 767 { 767 768 } 768 769

+5 -5

include/trace/events/vmscan.h

··· 327 327 __print_symbolic(__entry->lru, LRU_NAMES)) 328 328 ); 329 329 330 - TRACE_EVENT(mm_vmscan_writepage, 330 + TRACE_EVENT(mm_vmscan_write_folio, 331 331 332 - TP_PROTO(struct page *page), 332 + TP_PROTO(struct folio *folio), 333 333 334 - TP_ARGS(page), 334 + TP_ARGS(folio), 335 335 336 336 TP_STRUCT__entry( 337 337 __field(unsigned long, pfn) ··· 339 339 ), 340 340 341 341 TP_fast_assign( 342 - __entry->pfn = page_to_pfn(page); 342 + __entry->pfn = folio_pfn(folio); 343 343 __entry->reclaim_flags = trace_reclaim_flags( 344 - page_is_file_lru(page)); 344 + folio_is_file_lru(folio)); 345 345 ), 346 346 347 347 TP_printk("page=%p pfn=0x%lx flags=%s",

+3 -10

kernel/events/uprobes.c

··· 155 155 struct page *old_page, struct page *new_page) 156 156 { 157 157 struct mm_struct *mm = vma->vm_mm; 158 - struct page_vma_mapped_walk pvmw = { 159 - .page = compound_head(old_page), 160 - .vma = vma, 161 - .address = addr, 162 - }; 158 + DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); 163 159 int err; 164 160 struct mmu_notifier_range range; 165 161 ··· 169 173 return err; 170 174 } 171 175 172 - /* For try_to_free_swap() and munlock_vma_page() below */ 176 + /* For try_to_free_swap() below */ 173 177 lock_page(old_page); 174 178 175 179 mmu_notifier_invalidate_range_start(&range); ··· 197 201 set_pte_at_notify(mm, addr, pvmw.pte, 198 202 mk_pte(new_page, vma->vm_page_prot)); 199 203 200 - page_remove_rmap(old_page, false); 204 + page_remove_rmap(old_page, vma, false); 201 205 if (!page_mapped(old_page)) 202 206 try_to_free_swap(old_page); 203 207 page_vma_mapped_walk_done(&pvmw); 204 - 205 - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) 206 - munlock_vma_page(old_page); 207 208 put_page(old_page); 208 209 209 210 err = 0;

+1 -1

kernel/futex/core.c

··· 302 302 * found it, but truncated or holepunched or subjected to 303 303 * invalidate_complete_page2 before we got the page lock (also 304 304 * cases which we are happy to fail). And we hold a reference, 305 - * so refcount care in invalidate_complete_page's remove_mapping 305 + * so refcount care in invalidate_inode_page's remove_mapping 306 306 * prevents drop_caches from setting mapping to NULL beneath us. 307 307 * 308 308 * The case we do have to guard against is when memory pressure made

+3 -1

lib/test_hmm.c

··· 12 12 #include <linux/kernel.h> 13 13 #include <linux/cdev.h> 14 14 #include <linux/device.h> 15 + #include <linux/memremap.h> 15 16 #include <linux/mutex.h> 16 17 #include <linux/rwsem.h> 17 18 #include <linux/sched.h> ··· 27 26 #include <linux/sched/mm.h> 28 27 #include <linux/platform_device.h> 29 28 #include <linux/rmap.h> 29 + #include <linux/mmu_notifier.h> 30 + #include <linux/migrate.h> 30 31 31 32 #include "test_hmm_uapi.h" 32 33 ··· 566 563 } 567 564 568 565 dpage->zone_device_data = rpage; 569 - get_page(dpage); 570 566 lock_page(dpage); 571 567 return dpage; 572 568

+3 -4

mm/Kconfig

··· 249 249 pages as migration can relocate pages to satisfy a huge page 250 250 allocation instead of reclaiming. 251 251 252 + config DEVICE_MIGRATION 253 + def_bool MIGRATION && ZONE_DEVICE 254 + 252 255 config ARCH_ENABLE_HUGEPAGE_MIGRATION 253 256 bool 254 257 ··· 794 791 795 792 If FS_DAX is enabled, then say Y. 796 793 797 - config DEV_PAGEMAP_OPS 798 - bool 799 - 800 794 # 801 795 # Helpers to mirror range of the CPU page tables of a process into device page 802 796 # tables. ··· 805 805 config DEVICE_PRIVATE 806 806 bool "Unaddressable device memory (GPU memory, ...)" 807 807 depends on ZONE_DEVICE 808 - select DEV_PAGEMAP_OPS 809 808 810 809 help 811 810 Allows creation of struct pages to represent unaddressable device

+1

mm/Makefile

··· 92 92 obj-$(CONFIG_FAILSLAB) += failslab.o 93 93 obj-$(CONFIG_MEMTEST) += memtest.o 94 94 obj-$(CONFIG_MIGRATION) += migrate.o 95 + obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o 95 96 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o 96 97 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o 97 98 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o

+29 -33

mm/damon/paddr.c

··· 16 16 #include "../internal.h" 17 17 #include "ops-common.h" 18 18 19 - static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma, 19 + static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, 20 20 unsigned long addr, void *arg) 21 21 { 22 - struct page_vma_mapped_walk pvmw = { 23 - .page = page, 24 - .vma = vma, 25 - .address = addr, 26 - }; 22 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); 27 23 28 24 while (page_vma_mapped_walk(&pvmw)) { 29 25 addr = pvmw.address; ··· 33 37 34 38 static void damon_pa_mkold(unsigned long paddr) 35 39 { 40 + struct folio *folio; 36 41 struct page *page = damon_get_page(PHYS_PFN(paddr)); 37 42 struct rmap_walk_control rwc = { 38 43 .rmap_one = __damon_pa_mkold, 39 - .anon_lock = page_lock_anon_vma_read, 44 + .anon_lock = folio_lock_anon_vma_read, 40 45 }; 41 46 bool need_lock; 42 47 43 48 if (!page) 44 49 return; 50 + folio = page_folio(page); 45 51 46 - if (!page_mapped(page) || !page_rmapping(page)) { 47 - set_page_idle(page); 52 + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { 53 + folio_set_idle(folio); 48 54 goto out; 49 55 } 50 56 51 - need_lock = !PageAnon(page) || PageKsm(page); 52 - if (need_lock && !trylock_page(page)) 57 + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); 58 + if (need_lock && !folio_trylock(folio)) 53 59 goto out; 54 60 55 - rmap_walk(page, &rwc); 61 + rmap_walk(folio, &rwc); 56 62 57 63 if (need_lock) 58 - unlock_page(page); 64 + folio_unlock(folio); 59 65 60 66 out: 61 - put_page(page); 67 + folio_put(folio); 62 68 } 63 69 64 70 static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, ··· 87 89 bool accessed; 88 90 }; 89 91 90 - static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma, 92 + static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, 91 93 unsigned long addr, void *arg) 92 94 { 93 95 struct damon_pa_access_chk_result *result = arg; 94 - struct page_vma_mapped_walk pvmw = 
{ 95 - .page = page, 96 - .vma = vma, 97 - .address = addr, 98 - }; 96 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); 99 97 100 98 result->accessed = false; 101 99 result->page_sz = PAGE_SIZE; ··· 99 105 addr = pvmw.address; 100 106 if (pvmw.pte) { 101 107 result->accessed = pte_young(*pvmw.pte) || 102 - !page_is_idle(page) || 108 + !folio_test_idle(folio) || 103 109 mmu_notifier_test_young(vma->vm_mm, addr); 104 110 } else { 105 111 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 106 112 result->accessed = pmd_young(*pvmw.pmd) || 107 - !page_is_idle(page) || 113 + !folio_test_idle(folio) || 108 114 mmu_notifier_test_young(vma->vm_mm, addr); 109 115 result->page_sz = ((1UL) << HPAGE_PMD_SHIFT); 110 116 #else ··· 123 129 124 130 static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) 125 131 { 132 + struct folio *folio; 126 133 struct page *page = damon_get_page(PHYS_PFN(paddr)); 127 134 struct damon_pa_access_chk_result result = { 128 135 .page_sz = PAGE_SIZE, ··· 132 137 struct rmap_walk_control rwc = { 133 138 .arg = &result, 134 139 .rmap_one = __damon_pa_young, 135 - .anon_lock = page_lock_anon_vma_read, 140 + .anon_lock = folio_lock_anon_vma_read, 136 141 }; 137 142 bool need_lock; 138 143 139 144 if (!page) 140 145 return false; 146 + folio = page_folio(page); 141 147 142 - if (!page_mapped(page) || !page_rmapping(page)) { 143 - if (page_is_idle(page)) 148 + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { 149 + if (folio_test_idle(folio)) 144 150 result.accessed = false; 145 151 else 146 152 result.accessed = true; 147 - put_page(page); 153 + folio_put(folio); 148 154 goto out; 149 155 } 150 156 151 - need_lock = !PageAnon(page) || PageKsm(page); 152 - if (need_lock && !trylock_page(page)) { 153 - put_page(page); 154 - return NULL; 157 + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); 158 + if (need_lock && !folio_trylock(folio)) { 159 + folio_put(folio); 160 + return false; 155 161 } 156 162 157 - rmap_walk(page, &rwc); 163 + 
rmap_walk(folio, &rwc); 158 164 159 165 if (need_lock) 160 - unlock_page(page); 161 - put_page(page); 166 + folio_unlock(folio); 167 + folio_put(folio); 162 168 163 169 out: 164 170 *page_sz = result.page_sz;

+7 -11

mm/debug.c

··· 48 48 49 49 static void __dump_page(struct page *page) 50 50 { 51 - struct page *head = compound_head(page); 51 + struct folio *folio = page_folio(page); 52 + struct page *head = &folio->page; 52 53 struct address_space *mapping; 53 54 bool compound = PageCompound(page); 54 55 /* ··· 77 76 else 78 77 mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); 79 78 head = page; 79 + folio = (struct folio *)page; 80 80 compound = false; 81 81 } else { 82 82 mapping = page_mapping(page); ··· 94 92 page, page_ref_count(head), mapcount, mapping, 95 93 page_to_pgoff(page), page_to_pfn(page)); 96 94 if (compound) { 97 - if (hpage_pincount_available(page)) { 98 - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", 99 - head, compound_order(head), 100 - head_compound_mapcount(head), 101 - head_compound_pincount(head)); 102 - } else { 103 - pr_warn("head:%p order:%u compound_mapcount:%d\n", 104 - head, compound_order(head), 105 - head_compound_mapcount(head)); 106 - } 95 + pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", 96 + head, compound_order(head), 97 + folio_entire_mapcount(folio), 98 + head_compound_pincount(head)); 107 99 } 108 100 109 101 #ifdef CONFIG_MEMCG

+41 -18

mm/filemap.c

··· 842 842 { 843 843 XA_STATE(xas, &mapping->i_pages, index); 844 844 int huge = folio_test_hugetlb(folio); 845 - int error; 846 845 bool charged = false; 846 + long nr = 1; 847 847 848 848 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 849 849 VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); 850 850 mapping_set_update(&xas, mapping); 851 851 852 - folio_get(folio); 853 - folio->mapping = mapping; 854 - folio->index = index; 855 - 856 852 if (!huge) { 857 - error = mem_cgroup_charge(folio, NULL, gfp); 853 + int error = mem_cgroup_charge(folio, NULL, gfp); 858 854 VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); 859 855 if (error) 860 - goto error; 856 + return error; 861 857 charged = true; 858 + xas_set_order(&xas, index, folio_order(folio)); 859 + nr = folio_nr_pages(folio); 862 860 } 863 861 864 862 gfp &= GFP_RECLAIM_MASK; 863 + folio_ref_add(folio, nr); 864 + folio->mapping = mapping; 865 + folio->index = xas.xa_index; 865 866 866 867 do { 867 868 unsigned int order = xa_get_order(xas.xa, xas.xa_index); ··· 886 885 /* entry may have been split before we acquired lock */ 887 886 order = xa_get_order(xas.xa, xas.xa_index); 888 887 if (order > folio_order(folio)) { 888 + /* How to handle large swap entries? 
*/ 889 + BUG_ON(shmem_mapping(mapping)); 889 890 xas_split(&xas, old, order); 890 891 xas_reset(&xas); 891 892 } ··· 897 894 if (xas_error(&xas)) 898 895 goto unlock; 899 896 900 - mapping->nrpages++; 897 + mapping->nrpages += nr; 901 898 902 899 /* hugetlb pages do not participate in page cache accounting */ 903 - if (!huge) 904 - __lruvec_stat_add_folio(folio, NR_FILE_PAGES); 900 + if (!huge) { 901 + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); 902 + if (folio_test_pmd_mappable(folio)) 903 + __lruvec_stat_mod_folio(folio, 904 + NR_FILE_THPS, nr); 905 + } 905 906 unlock: 906 907 xas_unlock_irq(&xas); 907 908 } while (xas_nomem(&xas, gfp)); 908 909 909 - if (xas_error(&xas)) { 910 - error = xas_error(&xas); 911 - if (charged) 912 - mem_cgroup_uncharge(folio); 910 + if (xas_error(&xas)) 913 911 goto error; 914 - } 915 912 916 913 trace_mm_filemap_add_to_page_cache(folio); 917 914 return 0; 918 915 error: 916 + if (charged) 917 + mem_cgroup_uncharge(folio); 919 918 folio->mapping = NULL; 920 919 /* Leave page->index set: truncation relies upon it */ 921 - folio_put(folio); 922 - return error; 920 + folio_put_refs(folio, nr); 921 + return xas_error(&xas); 923 922 } 924 923 ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); 925 924 ··· 3002 2997 struct file *fpin = NULL; 3003 2998 unsigned int mmap_miss; 3004 2999 3000 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3001 + /* Use the readahead code, even if readahead is disabled */ 3002 + if (vmf->vma->vm_flags & VM_HUGEPAGE) { 3003 + fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3004 + ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); 3005 + ra->size = HPAGE_PMD_NR; 3006 + /* 3007 + * Fetch two PMD folios, so we get the chance to actually 3008 + * readahead, unless we've been told not to. 
3009 + */ 3010 + if (!(vmf->vma->vm_flags & VM_RAND_READ)) 3011 + ra->size *= 2; 3012 + ra->async_size = HPAGE_PMD_NR; 3013 + page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); 3014 + return fpin; 3015 + } 3016 + #endif 3017 + 3005 3018 /* If we don't want any read-ahead, don't bother */ 3006 3019 if (vmf->vma->vm_flags & VM_RAND_READ) 3007 3020 return fpin; ··· 3052 3029 ra->size = ra->ra_pages; 3053 3030 ra->async_size = ra->ra_pages / 4; 3054 3031 ractl._index = ra->start; 3055 - do_page_cache_ra(&ractl, ra->size, ra->async_size); 3032 + page_cache_ra_order(&ractl, ra, 0); 3056 3033 return fpin; 3057 3034 } 3058 3035

+13

mm/folio-compat.c

··· 7 7 #include <linux/migrate.h> 8 8 #include <linux/pagemap.h> 9 9 #include <linux/swap.h> 10 + #include "internal.h" 10 11 11 12 struct address_space *page_mapping(struct page *page) 12 13 { ··· 152 151 return filemap_release_folio(page_folio(page), gfp); 153 152 } 154 153 EXPORT_SYMBOL(try_to_release_page); 154 + 155 + int isolate_lru_page(struct page *page) 156 + { 157 + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) 158 + return -EBUSY; 159 + return folio_isolate_lru((struct folio *)page); 160 + } 161 + 162 + void putback_lru_page(struct page *page) 163 + { 164 + folio_putback_lru(page_folio(page)); 165 + }

+208 -284

mm/gup.c

··· 29 29 unsigned int page_mask; 30 30 }; 31 31 32 - static void hpage_pincount_add(struct page *page, int refs) 33 - { 34 - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); 35 - VM_BUG_ON_PAGE(page != compound_head(page), page); 36 - 37 - atomic_add(refs, compound_pincount_ptr(page)); 38 - } 39 - 40 - static void hpage_pincount_sub(struct page *page, int refs) 41 - { 42 - VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); 43 - VM_BUG_ON_PAGE(page != compound_head(page), page); 44 - 45 - atomic_sub(refs, compound_pincount_ptr(page)); 46 - } 47 - 48 - /* Equivalent to calling put_page() @refs times. */ 49 - static void put_page_refs(struct page *page, int refs) 50 - { 51 - #ifdef CONFIG_DEBUG_VM 52 - if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page)) 53 - return; 54 - #endif 55 - 56 - /* 57 - * Calling put_page() for each ref is unnecessarily slow. Only the last 58 - * ref needs a put_page(). 59 - */ 60 - if (refs > 1) 61 - page_ref_sub(page, refs - 1); 62 - put_page(page); 63 - } 64 - 65 32 /* 66 - * Return the compound head page with ref appropriately incremented, 33 + * Return the folio with ref appropriately incremented, 67 34 * or NULL if that failed. 
68 35 */ 69 - static inline struct page *try_get_compound_head(struct page *page, int refs) 36 + static inline struct folio *try_get_folio(struct page *page, int refs) 70 37 { 71 - struct page *head = compound_head(page); 38 + struct folio *folio; 72 39 73 - if (WARN_ON_ONCE(page_ref_count(head) < 0)) 40 + retry: 41 + folio = page_folio(page); 42 + if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) 74 43 return NULL; 75 - if (unlikely(!page_cache_add_speculative(head, refs))) 44 + if (unlikely(!folio_ref_try_add_rcu(folio, refs))) 76 45 return NULL; 77 46 78 47 /* 79 - * At this point we have a stable reference to the head page; but it 80 - * could be that between the compound_head() lookup and the refcount 81 - * increment, the compound page was split, in which case we'd end up 82 - * holding a reference on a page that has nothing to do with the page 48 + * At this point we have a stable reference to the folio; but it 49 + * could be that between calling page_folio() and the refcount 50 + * increment, the folio was split, in which case we'd end up 51 + * holding a reference on a folio that has nothing to do with the page 83 52 * we were given anymore. 84 - * So now that the head page is stable, recheck that the pages still 85 - * belong together. 53 + * So now that the folio is stable, recheck that the page still 54 + * belongs to this folio. 86 55 */ 87 - if (unlikely(compound_head(page) != head)) { 88 - put_page_refs(head, refs); 89 - return NULL; 56 + if (unlikely(page_folio(page) != folio)) { 57 + folio_put_refs(folio, refs); 58 + goto retry; 90 59 } 91 60 92 - return head; 61 + return folio; 93 62 } 94 63 95 64 /** 96 - * try_grab_compound_head() - attempt to elevate a page's refcount, by a 97 - * flags-dependent amount. 98 - * 99 - * Even though the name includes "compound_head", this function is still 100 - * appropriate for callers that have a non-compound @page to get. 101 - * 65 + * try_grab_folio() - Attempt to get or pin a folio. 
102 66 * @page: pointer to page to be grabbed 103 - * @refs: the value to (effectively) add to the page's refcount 67 + * @refs: the value to (effectively) add to the folio's refcount 104 68 * @flags: gup flags: these are the FOLL_* flag values. 105 69 * 106 70 * "grab" names in this file mean, "look at flags to decide whether to use 107 - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. 71 + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 108 72 * 109 73 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the 110 74 * same time. (That's true throughout the get_user_pages*() and 111 75 * pin_user_pages*() APIs.) Cases: 112 76 * 113 - * FOLL_GET: page's refcount will be incremented by @refs. 77 + * FOLL_GET: folio's refcount will be incremented by @refs. 114 78 * 115 - * FOLL_PIN on compound pages that are > two pages long: page's refcount will 116 - * be incremented by @refs, and page[2].hpage_pinned_refcount will be 117 - * incremented by @refs * GUP_PIN_COUNTING_BIAS. 79 + * FOLL_PIN on large folios: folio's refcount will be incremented by 80 + * @refs, and its compound_pincount will be incremented by @refs. 118 81 * 119 - * FOLL_PIN on normal pages, or compound pages that are two pages long: 120 - * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. 82 + * FOLL_PIN on single-page folios: folio's refcount will be incremented by 83 + * @refs * GUP_PIN_COUNTING_BIAS. 121 84 * 122 - * Return: head page (with refcount appropriately incremented) for success, or 123 - * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's 124 - * considered failure, and furthermore, a likely bug in the caller, so a warning 125 - * is also emitted. 85 + * Return: The folio containing @page (with refcount appropriately 86 + * incremented) for success, or NULL upon failure. 
If neither FOLL_GET 87 + * nor FOLL_PIN was set, that's considered failure, and furthermore, 88 + * a likely bug in the caller, so a warning is also emitted. 126 89 */ 127 - __maybe_unused struct page *try_grab_compound_head(struct page *page, 128 - int refs, unsigned int flags) 90 + struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) 129 91 { 130 92 if (flags & FOLL_GET) 131 - return try_get_compound_head(page, refs); 93 + return try_get_folio(page, refs); 132 94 else if (flags & FOLL_PIN) { 95 + struct folio *folio; 96 + 133 97 /* 134 98 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a 135 99 * right zone, so fail and let the caller fall back to the slow ··· 107 143 * CAUTION: Don't use compound_head() on the page before this 108 144 * point, the result won't be stable. 109 145 */ 110 - page = try_get_compound_head(page, refs); 111 - if (!page) 146 + folio = try_get_folio(page, refs); 147 + if (!folio) 112 148 return NULL; 113 149 114 150 /* 115 - * When pinning a compound page of order > 1 (which is what 116 - * hpage_pincount_available() checks for), use an exact count to 117 - * track it, via hpage_pincount_add/_sub(). 151 + * When pinning a large folio, use an exact count to track it. 118 152 * 119 - * However, be sure to *also* increment the normal page refcount 120 - * field at least once, so that the page really is pinned. 121 - * That's why the refcount from the earlier 122 - * try_get_compound_head() is left intact. 153 + * However, be sure to *also* increment the normal folio 154 + * refcount field at least once, so that the folio really 155 + * is pinned. That's why the refcount from the earlier 156 + * try_get_folio() is left intact. 
123 157 */ 124 - if (hpage_pincount_available(page)) 125 - hpage_pincount_add(page, refs); 158 + if (folio_test_large(folio)) 159 + atomic_add(refs, folio_pincount_ptr(folio)); 126 160 else 127 - page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); 161 + folio_ref_add(folio, 162 + refs * (GUP_PIN_COUNTING_BIAS - 1)); 163 + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); 128 164 129 - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 130 - refs); 131 - 132 - return page; 165 + return folio; 133 166 } 134 167 135 168 WARN_ON_ONCE(1); 136 169 return NULL; 137 170 } 138 171 139 - static void put_compound_head(struct page *page, int refs, unsigned int flags) 172 + static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) 140 173 { 141 174 if (flags & FOLL_PIN) { 142 - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 143 - refs); 144 - 145 - if (hpage_pincount_available(page)) 146 - hpage_pincount_sub(page, refs); 175 + node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); 176 + if (folio_test_large(folio)) 177 + atomic_sub(refs, folio_pincount_ptr(folio)); 147 178 else 148 179 refs *= GUP_PIN_COUNTING_BIAS; 149 180 } 150 181 151 - put_page_refs(page, refs); 182 + folio_put_refs(folio, refs); 152 183 } 153 184 154 185 /** 155 186 * try_grab_page() - elevate a page's refcount by a flag-dependent amount 187 + * @page: pointer to page to be grabbed 188 + * @flags: gup flags: these are the FOLL_* flag values. 156 189 * 157 190 * This might not do anything at all, depending on the flags argument. 158 191 * 159 192 * "grab" names in this file mean, "look at flags to decide whether to use 160 193 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. 161 194 * 162 - * @page: pointer to page to be grabbed 163 - * @flags: gup flags: these are the FOLL_* flag values. 164 - * 165 195 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same 166 - * time. 
Cases: please see the try_grab_compound_head() documentation, with 196 + * time. Cases: please see the try_grab_folio() documentation, with 167 197 * "refs=1". 168 198 * 169 199 * Return: true for success, or if no action was required (if neither FOLL_PIN ··· 166 208 */ 167 209 bool __must_check try_grab_page(struct page *page, unsigned int flags) 168 210 { 211 + struct folio *folio = page_folio(page); 212 + 169 213 WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); 214 + if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) 215 + return false; 170 216 171 217 if (flags & FOLL_GET) 172 - return try_get_page(page); 218 + folio_ref_inc(folio); 173 219 else if (flags & FOLL_PIN) { 174 - int refs = 1; 175 - 176 - page = compound_head(page); 177 - 178 - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) 179 - return false; 180 - 181 - if (hpage_pincount_available(page)) 182 - hpage_pincount_add(page, 1); 183 - else 184 - refs = GUP_PIN_COUNTING_BIAS; 185 - 186 220 /* 187 - * Similar to try_grab_compound_head(): even if using the 188 - * hpage_pincount_add/_sub() routines, be sure to 189 - * *also* increment the normal page refcount field at least 190 - * once, so that the page really is pinned. 221 + * Similar to try_grab_folio(): be sure to *also* 222 + * increment the normal page refcount field at least once, 223 + * so that the page really is pinned. 
191 224 */ 192 - page_ref_add(page, refs); 225 + if (folio_test_large(folio)) { 226 + folio_ref_add(folio, 1); 227 + atomic_add(1, folio_pincount_ptr(folio)); 228 + } else { 229 + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); 230 + } 193 231 194 - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); 232 + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); 195 233 } 196 234 197 235 return true; ··· 204 250 */ 205 251 void unpin_user_page(struct page *page) 206 252 { 207 - put_compound_head(compound_head(page), 1, FOLL_PIN); 253 + gup_put_folio(page_folio(page), 1, FOLL_PIN); 208 254 } 209 255 EXPORT_SYMBOL(unpin_user_page); 210 256 211 - static inline void compound_range_next(unsigned long i, unsigned long npages, 212 - struct page **list, struct page **head, 213 - unsigned int *ntails) 257 + static inline struct folio *gup_folio_range_next(struct page *start, 258 + unsigned long npages, unsigned long i, unsigned int *ntails) 214 259 { 215 - struct page *next, *page; 260 + struct page *next = nth_page(start, i); 261 + struct folio *folio = page_folio(next); 216 262 unsigned int nr = 1; 217 263 218 - if (i >= npages) 219 - return; 264 + if (folio_test_large(folio)) 265 + nr = min_t(unsigned int, npages - i, 266 + folio_nr_pages(folio) - folio_page_idx(folio, next)); 220 267 221 - next = *list + i; 222 - page = compound_head(next); 223 - if (PageCompound(page) && compound_order(page) >= 1) 224 - nr = min_t(unsigned int, 225 - page + compound_nr(page) - next, npages - i); 226 - 227 - *head = page; 228 268 *ntails = nr; 269 + return folio; 229 270 } 230 271 231 - #define for_each_compound_range(__i, __list, __npages, __head, __ntails) \ 232 - for (__i = 0, \ 233 - compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \ 234 - __i < __npages; __i += __ntails, \ 235 - compound_range_next(__i, __npages, __list, &(__head), &(__ntails))) 236 - 237 - static inline void compound_next(unsigned long i, unsigned long npages, 238 - struct page **list, 
struct page **head, 239 - unsigned int *ntails) 272 + static inline struct folio *gup_folio_next(struct page **list, 273 + unsigned long npages, unsigned long i, unsigned int *ntails) 240 274 { 241 - struct page *page; 275 + struct folio *folio = page_folio(list[i]); 242 276 unsigned int nr; 243 277 244 - if (i >= npages) 245 - return; 246 - 247 - page = compound_head(list[i]); 248 278 for (nr = i + 1; nr < npages; nr++) { 249 - if (compound_head(list[nr]) != page) 279 + if (page_folio(list[nr]) != folio) 250 280 break; 251 281 } 252 282 253 - *head = page; 254 283 *ntails = nr - i; 284 + return folio; 255 285 } 256 - 257 - #define for_each_compound_head(__i, __list, __npages, __head, __ntails) \ 258 - for (__i = 0, \ 259 - compound_next(__i, __npages, __list, &(__head), &(__ntails)); \ 260 - __i < __npages; __i += __ntails, \ 261 - compound_next(__i, __npages, __list, &(__head), &(__ntails))) 262 286 263 287 /** 264 288 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages ··· 263 331 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, 264 332 bool make_dirty) 265 333 { 266 - unsigned long index; 267 - struct page *head; 268 - unsigned int ntails; 334 + unsigned long i; 335 + struct folio *folio; 336 + unsigned int nr; 269 337 270 338 if (!make_dirty) { 271 339 unpin_user_pages(pages, npages); 272 340 return; 273 341 } 274 342 275 - for_each_compound_head(index, pages, npages, head, ntails) { 343 + for (i = 0; i < npages; i += nr) { 344 + folio = gup_folio_next(pages, npages, i, &nr); 276 345 /* 277 346 * Checking PageDirty at this point may race with 278 347 * clear_page_dirty_for_io(), but that's OK. Two key ··· 294 361 * written back, so it gets written back again in the 295 362 * next writeback cycle. This is harmless. 
296 363 */ 297 - if (!PageDirty(head)) 298 - set_page_dirty_lock(head); 299 - put_compound_head(head, ntails, FOLL_PIN); 364 + if (!folio_test_dirty(folio)) { 365 + folio_lock(folio); 366 + folio_mark_dirty(folio); 367 + folio_unlock(folio); 368 + } 369 + gup_put_folio(folio, nr, FOLL_PIN); 300 370 } 301 371 } 302 372 EXPORT_SYMBOL(unpin_user_pages_dirty_lock); ··· 328 392 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, 329 393 bool make_dirty) 330 394 { 331 - unsigned long index; 332 - struct page *head; 333 - unsigned int ntails; 395 + unsigned long i; 396 + struct folio *folio; 397 + unsigned int nr; 334 398 335 - for_each_compound_range(index, &page, npages, head, ntails) { 336 - if (make_dirty && !PageDirty(head)) 337 - set_page_dirty_lock(head); 338 - put_compound_head(head, ntails, FOLL_PIN); 399 + for (i = 0; i < npages; i += nr) { 400 + folio = gup_folio_range_next(page, npages, i, &nr); 401 + if (make_dirty && !folio_test_dirty(folio)) { 402 + folio_lock(folio); 403 + folio_mark_dirty(folio); 404 + folio_unlock(folio); 405 + } 406 + gup_put_folio(folio, nr, FOLL_PIN); 339 407 } 340 408 } 341 409 EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); ··· 355 415 */ 356 416 void unpin_user_pages(struct page **pages, unsigned long npages) 357 417 { 358 - unsigned long index; 359 - struct page *head; 360 - unsigned int ntails; 418 + unsigned long i; 419 + struct folio *folio; 420 + unsigned int nr; 361 421 362 422 /* 363 423 * If this WARN_ON() fires, then the system *might* be leaking pages (by ··· 367 427 if (WARN_ON(IS_ERR_VALUE(npages))) 368 428 return; 369 429 370 - for_each_compound_head(index, pages, npages, head, ntails) 371 - put_compound_head(head, ntails, FOLL_PIN); 430 + for (i = 0; i < npages; i += nr) { 431 + folio = gup_folio_next(pages, npages, i, &nr); 432 + gup_put_folio(folio, nr, FOLL_PIN); 433 + } 372 434 } 373 435 EXPORT_SYMBOL(unpin_user_pages); 374 436 ··· 534 592 * mark_page_accessed(). 
535 593 */ 536 594 mark_page_accessed(page); 537 - } 538 - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 539 - /* Do not mlock pte-mapped THP */ 540 - if (PageTransCompound(page)) 541 - goto out; 542 - 543 - /* 544 - * The preliminary mapping check is mainly to avoid the 545 - * pointless overhead of lock_page on the ZERO_PAGE 546 - * which might bounce very badly if there is contention. 547 - * 548 - * If the page is already locked, we don't need to 549 - * handle it now - vmscan will handle it later if and 550 - * when it attempts to reclaim the page. 551 - */ 552 - if (page->mapping && trylock_page(page)) { 553 - lru_add_drain(); /* push cached pages to LRU */ 554 - /* 555 - * Because we lock page here, and migration is 556 - * blocked by the pte's page reference, and we 557 - * know the page is still mapped, we don't even 558 - * need to check for file-cache page truncation. 559 - */ 560 - mlock_vma_page(page); 561 - unlock_page(page); 562 - } 563 595 } 564 596 out: 565 597 pte_unmap_unlock(ptep, ptl); ··· 857 941 unsigned int fault_flags = 0; 858 942 vm_fault_t ret; 859 943 860 - /* mlock all present pages, but do not fault in new pages */ 861 - if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) 862 - return -ENOENT; 863 944 if (*flags & FOLL_NOFAULT) 864 945 return -EFAULT; 865 946 if (*flags & FOLL_WRITE) ··· 1107 1194 case -ENOMEM: 1108 1195 case -EHWPOISON: 1109 1196 goto out; 1110 - case -ENOENT: 1111 - goto next_page; 1112 1197 } 1113 1198 BUG(); 1114 1199 } else if (PTR_ERR(page) == -EEXIST) { ··· 1411 1500 VM_BUG_ON_VMA(end > vma->vm_end, vma); 1412 1501 mmap_assert_locked(mm); 1413 1502 1414 - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; 1503 + /* 1504 + * Rightly or wrongly, the VM_LOCKONFAULT case has never used 1505 + * faultin_page() to break COW, so it has no work to do here. 
1506 + */ 1415 1507 if (vma->vm_flags & VM_LOCKONFAULT) 1416 - gup_flags &= ~FOLL_POPULATE; 1508 + return nr_pages; 1509 + 1510 + gup_flags = FOLL_TOUCH; 1417 1511 /* 1418 1512 * We want to touch writable mappings with a write fault in order 1419 1513 * to break COW, except for shared mappings because these don't COW ··· 1485 1569 * in the page table. 1486 1570 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit 1487 1571 * a poisoned page. 1488 - * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. 1489 1572 * !FOLL_FORCE: Require proper access permissions. 1490 1573 */ 1491 - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; 1574 + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; 1492 1575 if (write) 1493 1576 gup_flags |= FOLL_WRITE; 1494 1577 ··· 1767 1852 struct page **pages, 1768 1853 unsigned int gup_flags) 1769 1854 { 1770 - unsigned long i; 1771 - unsigned long isolation_error_count = 0; 1772 - bool drain_allow = true; 1855 + unsigned long isolation_error_count = 0, i; 1856 + struct folio *prev_folio = NULL; 1773 1857 LIST_HEAD(movable_page_list); 1774 - long ret = 0; 1775 - struct page *prev_head = NULL; 1776 - struct page *head; 1777 - struct migration_target_control mtc = { 1778 - .nid = NUMA_NO_NODE, 1779 - .gfp_mask = GFP_USER | __GFP_NOWARN, 1780 - }; 1858 + bool drain_allow = true; 1859 + int ret = 0; 1781 1860 1782 1861 for (i = 0; i < nr_pages; i++) { 1783 - head = compound_head(pages[i]); 1784 - if (head == prev_head) 1785 - continue; 1786 - prev_head = head; 1787 - /* 1788 - * If we get a movable page, since we are going to be pinning 1789 - * these entries, try to move them out if possible. 
1790 - */ 1791 - if (!is_pinnable_page(head)) { 1792 - if (PageHuge(head)) { 1793 - if (!isolate_huge_page(head, &movable_page_list)) 1794 - isolation_error_count++; 1795 - } else { 1796 - if (!PageLRU(head) && drain_allow) { 1797 - lru_add_drain_all(); 1798 - drain_allow = false; 1799 - } 1862 + struct folio *folio = page_folio(pages[i]); 1800 1863 1801 - if (isolate_lru_page(head)) { 1802 - isolation_error_count++; 1803 - continue; 1804 - } 1805 - list_add_tail(&head->lru, &movable_page_list); 1806 - mod_node_page_state(page_pgdat(head), 1807 - NR_ISOLATED_ANON + 1808 - page_is_file_lru(head), 1809 - thp_nr_pages(head)); 1810 - } 1864 + if (folio == prev_folio) 1865 + continue; 1866 + prev_folio = folio; 1867 + 1868 + if (folio_is_pinnable(folio)) 1869 + continue; 1870 + 1871 + /* 1872 + * Try to move out any movable page before pinning the range. 1873 + */ 1874 + if (folio_test_hugetlb(folio)) { 1875 + if (!isolate_huge_page(&folio->page, 1876 + &movable_page_list)) 1877 + isolation_error_count++; 1878 + continue; 1811 1879 } 1880 + 1881 + if (!folio_test_lru(folio) && drain_allow) { 1882 + lru_add_drain_all(); 1883 + drain_allow = false; 1884 + } 1885 + 1886 + if (folio_isolate_lru(folio)) { 1887 + isolation_error_count++; 1888 + continue; 1889 + } 1890 + list_add_tail(&folio->lru, &movable_page_list); 1891 + node_stat_mod_folio(folio, 1892 + NR_ISOLATED_ANON + folio_is_file_lru(folio), 1893 + folio_nr_pages(folio)); 1812 1894 } 1895 + 1896 + if (!list_empty(&movable_page_list) || isolation_error_count) 1897 + goto unpin_pages; 1813 1898 1814 1899 /* 1815 1900 * If list is empty, and no isolation errors, means that all pages are 1816 1901 * in the correct zone. 
1817 1902 */ 1818 - if (list_empty(&movable_page_list) && !isolation_error_count) 1819 - return nr_pages; 1903 + return nr_pages; 1820 1904 1905 + unpin_pages: 1821 1906 if (gup_flags & FOLL_PIN) { 1822 1907 unpin_user_pages(pages, nr_pages); 1823 1908 } else { 1824 1909 for (i = 0; i < nr_pages; i++) 1825 1910 put_page(pages[i]); 1826 1911 } 1912 + 1827 1913 if (!list_empty(&movable_page_list)) { 1914 + struct migration_target_control mtc = { 1915 + .nid = NUMA_NO_NODE, 1916 + .gfp_mask = GFP_USER | __GFP_NOWARN, 1917 + }; 1918 + 1828 1919 ret = migrate_pages(&movable_page_list, alloc_migration_target, 1829 1920 NULL, (unsigned long)&mtc, MIGRATE_SYNC, 1830 1921 MR_LONGTERM_PIN, NULL); 1831 - if (ret && !list_empty(&movable_page_list)) 1832 - putback_movable_pages(&movable_page_list); 1922 + if (ret > 0) /* number of pages not migrated */ 1923 + ret = -ENOMEM; 1833 1924 } 1834 1925 1835 - return ret > 0 ? -ENOMEM : ret; 1926 + if (ret && !list_empty(&movable_page_list)) 1927 + putback_movable_pages(&movable_page_list); 1928 + return ret; 1836 1929 } 1837 1930 #else 1838 1931 static long check_and_migrate_movable_pages(unsigned long nr_pages, ··· 2150 2227 ptem = ptep = pte_offset_map(&pmd, addr); 2151 2228 do { 2152 2229 pte_t pte = ptep_get_lockless(ptep); 2153 - struct page *head, *page; 2230 + struct page *page; 2231 + struct folio *folio; 2154 2232 2155 2233 /* 2156 2234 * Similar to the PMD case below, NUMA hinting must take slow ··· 2178 2254 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2179 2255 page = pte_page(pte); 2180 2256 2181 - head = try_grab_compound_head(page, 1, flags); 2182 - if (!head) 2257 + folio = try_grab_folio(page, 1, flags); 2258 + if (!folio) 2183 2259 goto pte_unmap; 2184 2260 2185 2261 if (unlikely(page_is_secretmem(page))) { 2186 - put_compound_head(head, 1, flags); 2262 + gup_put_folio(folio, 1, flags); 2187 2263 goto pte_unmap; 2188 2264 } 2189 2265 2190 2266 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 2191 - put_compound_head(head, 1, 
flags); 2267 + gup_put_folio(folio, 1, flags); 2192 2268 goto pte_unmap; 2193 2269 } 2194 - 2195 - VM_BUG_ON_PAGE(compound_head(page) != head, page); 2196 2270 2197 2271 /* 2198 2272 * We need to make the page accessible if and only if we are ··· 2201 2279 if (flags & FOLL_PIN) { 2202 2280 ret = arch_make_page_accessible(page); 2203 2281 if (ret) { 2204 - unpin_user_page(page); 2282 + gup_put_folio(folio, 1, flags); 2205 2283 goto pte_unmap; 2206 2284 } 2207 2285 } 2208 - SetPageReferenced(page); 2286 + folio_set_referenced(folio); 2209 2287 pages[*nr] = page; 2210 2288 (*nr)++; 2211 - 2212 2289 } while (ptep++, addr += PAGE_SIZE, addr != end); 2213 2290 2214 2291 ret = 1; ··· 2324 2403 { 2325 2404 int nr; 2326 2405 2327 - for (nr = 0; addr != end; addr += PAGE_SIZE) 2328 - pages[nr++] = page++; 2406 + for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) 2407 + pages[nr] = nth_page(page, nr); 2329 2408 2330 2409 return nr; 2331 2410 } ··· 2343 2422 struct page **pages, int *nr) 2344 2423 { 2345 2424 unsigned long pte_end; 2346 - struct page *head, *page; 2425 + struct page *page; 2426 + struct folio *folio; 2347 2427 pte_t pte; 2348 2428 int refs; 2349 2429 ··· 2360 2438 /* hugepages are never "special" */ 2361 2439 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2362 2440 2363 - head = pte_page(pte); 2364 - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 2441 + page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); 2365 2442 refs = record_subpages(page, addr, end, pages + *nr); 2366 2443 2367 - head = try_grab_compound_head(head, refs, flags); 2368 - if (!head) 2444 + folio = try_grab_folio(page, refs, flags); 2445 + if (!folio) 2369 2446 return 0; 2370 2447 2371 2448 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 2372 - put_compound_head(head, refs, flags); 2449 + gup_put_folio(folio, refs, flags); 2373 2450 return 0; 2374 2451 } 2375 2452 2376 2453 *nr += refs; 2377 - SetPageReferenced(head); 2454 + folio_set_referenced(folio); 2378 2455 return 1; 2379 2456 } 2380 
2457 ··· 2407 2486 unsigned long end, unsigned int flags, 2408 2487 struct page **pages, int *nr) 2409 2488 { 2410 - struct page *head, *page; 2489 + struct page *page; 2490 + struct folio *folio; 2411 2491 int refs; 2412 2492 2413 2493 if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) ··· 2421 2499 pages, nr); 2422 2500 } 2423 2501 2424 - page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 2502 + page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); 2425 2503 refs = record_subpages(page, addr, end, pages + *nr); 2426 2504 2427 - head = try_grab_compound_head(pmd_page(orig), refs, flags); 2428 - if (!head) 2505 + folio = try_grab_folio(page, refs, flags); 2506 + if (!folio) 2429 2507 return 0; 2430 2508 2431 2509 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 2432 - put_compound_head(head, refs, flags); 2510 + gup_put_folio(folio, refs, flags); 2433 2511 return 0; 2434 2512 } 2435 2513 2436 2514 *nr += refs; 2437 - SetPageReferenced(head); 2515 + folio_set_referenced(folio); 2438 2516 return 1; 2439 2517 } 2440 2518 ··· 2442 2520 unsigned long end, unsigned int flags, 2443 2521 struct page **pages, int *nr) 2444 2522 { 2445 - struct page *head, *page; 2523 + struct page *page; 2524 + struct folio *folio; 2446 2525 int refs; 2447 2526 2448 2527 if (!pud_access_permitted(orig, flags & FOLL_WRITE)) ··· 2456 2533 pages, nr); 2457 2534 } 2458 2535 2459 - page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 2536 + page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); 2460 2537 refs = record_subpages(page, addr, end, pages + *nr); 2461 2538 2462 - head = try_grab_compound_head(pud_page(orig), refs, flags); 2463 - if (!head) 2539 + folio = try_grab_folio(page, refs, flags); 2540 + if (!folio) 2464 2541 return 0; 2465 2542 2466 2543 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2467 - put_compound_head(head, refs, flags); 2544 + gup_put_folio(folio, refs, flags); 2468 2545 return 0; 2469 2546 } 2470 2547 2471 2548 *nr += 
refs; 2472 - SetPageReferenced(head); 2549 + folio_set_referenced(folio); 2473 2550 return 1; 2474 2551 } 2475 2552 ··· 2478 2555 struct page **pages, int *nr) 2479 2556 { 2480 2557 int refs; 2481 - struct page *head, *page; 2558 + struct page *page; 2559 + struct folio *folio; 2482 2560 2483 2561 if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 2484 2562 return 0; 2485 2563 2486 2564 BUILD_BUG_ON(pgd_devmap(orig)); 2487 2565 2488 - page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 2566 + page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); 2489 2567 refs = record_subpages(page, addr, end, pages + *nr); 2490 2568 2491 - head = try_grab_compound_head(pgd_page(orig), refs, flags); 2492 - if (!head) 2569 + folio = try_grab_folio(page, refs, flags); 2570 + if (!folio) 2493 2571 return 0; 2494 2572 2495 2573 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { 2496 - put_compound_head(head, refs, flags); 2574 + gup_put_folio(folio, refs, flags); 2497 2575 return 0; 2498 2576 } 2499 2577 2500 2578 *nr += refs; 2501 - SetPageReferenced(head); 2579 + folio_set_referenced(folio); 2502 2580 return 1; 2503 2581 } 2504 2582

+62 -116

mm/huge_memory.c

··· 583 583 unsigned long ret; 584 584 loff_t off = (loff_t)pgoff << PAGE_SHIFT; 585 585 586 - if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) 587 - goto out; 588 - 589 586 ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); 590 587 if (ret) 591 588 return ret; 592 - out: 589 + 593 590 return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); 594 591 } 595 592 EXPORT_SYMBOL_GPL(thp_get_unmapped_area); ··· 1378 1381 if (flags & FOLL_TOUCH) 1379 1382 touch_pmd(vma, addr, pmd, flags); 1380 1383 1381 - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1382 - /* 1383 - * We don't mlock() pte-mapped THPs. This way we can avoid 1384 - * leaking mlocked pages into non-VM_LOCKED VMAs. 1385 - * 1386 - * For anon THP: 1387 - * 1388 - * In most cases the pmd is the only mapping of the page as we 1389 - * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for 1390 - * writable private mappings in populate_vma_page_range(). 1391 - * 1392 - * The only scenario when we have the page shared here is if we 1393 - * mlocking read-only mapping shared over fork(). We skip 1394 - * mlocking such pages. 1395 - * 1396 - * For file THP: 1397 - * 1398 - * We can expect PageDoubleMap() to be stable under page lock: 1399 - * for file pages we set it in page_add_file_rmap(), which 1400 - * requires page to be locked. 
1401 - */ 1402 - 1403 - if (PageAnon(page) && compound_mapcount(page) != 1) 1404 - goto skip_mlock; 1405 - if (PageDoubleMap(page) || !page->mapping) 1406 - goto skip_mlock; 1407 - if (!trylock_page(page)) 1408 - goto skip_mlock; 1409 - if (page->mapping && !PageDoubleMap(page)) 1410 - mlock_vma_page(page); 1411 - unlock_page(page); 1412 - } 1413 - skip_mlock: 1414 1384 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1415 1385 VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); 1416 1386 ··· 1575 1611 1576 1612 if (pmd_present(orig_pmd)) { 1577 1613 page = pmd_page(orig_pmd); 1578 - page_remove_rmap(page, true); 1614 + page_remove_rmap(page, vma, true); 1579 1615 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1580 1616 VM_BUG_ON_PAGE(!PageHead(page), page); 1581 1617 } else if (thp_migration_supported()) { ··· 1971 2007 set_page_dirty(page); 1972 2008 if (!PageReferenced(page) && pmd_young(old_pmd)) 1973 2009 SetPageReferenced(page); 1974 - page_remove_rmap(page, true); 2010 + page_remove_rmap(page, vma, true); 1975 2011 put_page(page); 1976 2012 } 1977 2013 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); ··· 2105 2141 } 2106 2142 } 2107 2143 unlock_page_memcg(page); 2144 + 2145 + /* Above is effectively page_remove_rmap(page, vma, true) */ 2146 + munlock_vma_page(page, vma, true); 2108 2147 } 2109 2148 2110 2149 smp_wmb(); /* make pte visible before pmd */ ··· 2115 2148 2116 2149 if (freeze) { 2117 2150 for (i = 0; i < HPAGE_PMD_NR; i++) { 2118 - page_remove_rmap(page + i, false); 2151 + page_remove_rmap(page + i, vma, false); 2119 2152 put_page(page + i); 2120 2153 } 2121 2154 } 2122 2155 } 2123 2156 2124 2157 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2125 - unsigned long address, bool freeze, struct page *page) 2158 + unsigned long address, bool freeze, struct folio *folio) 2126 2159 { 2127 2160 spinlock_t *ptl; 2128 2161 struct mmu_notifier_range range; 2129 - bool do_unlock_page = false; 2162 + bool 
do_unlock_folio = false; 2130 2163 pmd_t _pmd; 2131 2164 2132 2165 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, ··· 2136 2169 ptl = pmd_lock(vma->vm_mm, pmd); 2137 2170 2138 2171 /* 2139 - * If caller asks to setup a migration entries, we need a page to check 2140 - * pmd against. Otherwise we can end up replacing wrong page. 2172 + * If caller asks to setup a migration entry, we need a folio to check 2173 + * pmd against. Otherwise we can end up replacing wrong folio. 2141 2174 */ 2142 - VM_BUG_ON(freeze && !page); 2143 - if (page) { 2144 - VM_WARN_ON_ONCE(!PageLocked(page)); 2145 - if (page != pmd_page(*pmd)) 2175 + VM_BUG_ON(freeze && !folio); 2176 + if (folio) { 2177 + VM_WARN_ON_ONCE(!folio_test_locked(folio)); 2178 + if (folio != page_folio(pmd_page(*pmd))) 2146 2179 goto out; 2147 2180 } 2148 2181 2149 2182 repeat: 2150 2183 if (pmd_trans_huge(*pmd)) { 2151 - if (!page) { 2152 - page = pmd_page(*pmd); 2184 + if (!folio) { 2185 + folio = page_folio(pmd_page(*pmd)); 2153 2186 /* 2154 2187 * An anonymous page must be locked, to ensure that a 2155 2188 * concurrent reuse_swap_page() sees stable mapcount; ··· 2157 2190 * and page lock must not be taken when zap_pmd_range() 2158 2191 * calls __split_huge_pmd() while i_mmap_lock is held. 
2159 2192 */ 2160 - if (PageAnon(page)) { 2161 - if (unlikely(!trylock_page(page))) { 2162 - get_page(page); 2193 + if (folio_test_anon(folio)) { 2194 + if (unlikely(!folio_trylock(folio))) { 2195 + folio_get(folio); 2163 2196 _pmd = *pmd; 2164 2197 spin_unlock(ptl); 2165 - lock_page(page); 2198 + folio_lock(folio); 2166 2199 spin_lock(ptl); 2167 2200 if (unlikely(!pmd_same(*pmd, _pmd))) { 2168 - unlock_page(page); 2169 - put_page(page); 2170 - page = NULL; 2201 + folio_unlock(folio); 2202 + folio_put(folio); 2203 + folio = NULL; 2171 2204 goto repeat; 2172 2205 } 2173 - put_page(page); 2206 + folio_put(folio); 2174 2207 } 2175 - do_unlock_page = true; 2208 + do_unlock_folio = true; 2176 2209 } 2177 2210 } 2178 - if (PageMlocked(page)) 2179 - clear_page_mlock(page); 2180 2211 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) 2181 2212 goto out; 2182 2213 __split_huge_pmd_locked(vma, pmd, range.start, freeze); 2183 2214 out: 2184 2215 spin_unlock(ptl); 2185 - if (do_unlock_page) 2186 - unlock_page(page); 2216 + if (do_unlock_folio) 2217 + folio_unlock(folio); 2187 2218 /* 2188 2219 * No need to double call mmu_notifier->invalidate_range() callback. 
2189 2220 * They are 3 cases to consider inside __split_huge_pmd_locked(): ··· 2199 2234 } 2200 2235 2201 2236 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2202 - bool freeze, struct page *page) 2237 + bool freeze, struct folio *folio) 2203 2238 { 2204 2239 pgd_t *pgd; 2205 2240 p4d_t *p4d; ··· 2220 2255 2221 2256 pmd = pmd_offset(pud, address); 2222 2257 2223 - __split_huge_pmd(vma, pmd, address, freeze, page); 2258 + __split_huge_pmd(vma, pmd, address, freeze, folio); 2224 2259 } 2225 2260 2226 2261 static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) ··· 2260 2295 2261 2296 static void unmap_page(struct page *page) 2262 2297 { 2298 + struct folio *folio = page_folio(page); 2263 2299 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | 2264 2300 TTU_SYNC; 2265 2301 ··· 2271 2305 * pages can simply be left unmapped, then faulted back on demand. 2272 2306 * If that is ever changed (perhaps for mlock), update remap_page(). 
2273 2307 */ 2274 - if (PageAnon(page)) 2275 - try_to_migrate(page, ttu_flags); 2308 + if (folio_test_anon(folio)) 2309 + try_to_migrate(folio, ttu_flags); 2276 2310 else 2277 - try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK); 2311 + try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); 2278 2312 2279 2313 VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); 2280 2314 } 2281 2315 2282 - static void remap_page(struct page *page, unsigned int nr) 2316 + static void remap_page(struct folio *folio, unsigned long nr) 2283 2317 { 2284 - int i; 2318 + int i = 0; 2285 2319 2286 2320 /* If unmap_page() uses try_to_migrate() on file, remove this check */ 2287 - if (!PageAnon(page)) 2321 + if (!folio_test_anon(folio)) 2288 2322 return; 2289 - if (PageTransHuge(page)) { 2290 - remove_migration_ptes(page, page, true); 2291 - } else { 2292 - for (i = 0; i < nr; i++) 2293 - remove_migration_ptes(page + i, page + i, true); 2323 + for (;;) { 2324 + remove_migration_ptes(folio, folio, true); 2325 + i += folio_nr_pages(folio); 2326 + if (i >= nr) 2327 + break; 2328 + folio = folio_next(folio); 2294 2329 } 2295 2330 } 2296 2331 ··· 2311 2344 } else { 2312 2345 /* head is still on lru (and we have it frozen) */ 2313 2346 VM_WARN_ON(!PageLRU(head)); 2347 + if (PageUnevictable(tail)) 2348 + tail->mlock_count = 0; 2349 + else 2350 + list_add_tail(&tail->lru, &head->lru); 2314 2351 SetPageLRU(tail); 2315 - list_add_tail(&tail->lru, &head->lru); 2316 2352 } 2317 2353 } 2318 2354 ··· 2451 2481 } 2452 2482 local_irq_enable(); 2453 2483 2454 - remap_page(head, nr); 2484 + remap_page(folio, nr); 2455 2485 2456 2486 if (PageSwapCache(head)) { 2457 2487 swp_entry_t entry = { .val = page_private(head) }; ··· 2474 2504 */ 2475 2505 put_page(subpage); 2476 2506 } 2477 - } 2478 - 2479 - int total_mapcount(struct page *page) 2480 - { 2481 - int i, compound, nr, ret; 2482 - 2483 - VM_BUG_ON_PAGE(PageTail(page), page); 2484 - 2485 - if (likely(!PageCompound(page))) 2486 - return atomic_read(&page->_mapcount) + 
1; 2487 - 2488 - compound = compound_mapcount(page); 2489 - nr = compound_nr(page); 2490 - if (PageHuge(page)) 2491 - return compound; 2492 - ret = compound; 2493 - for (i = 0; i < nr; i++) 2494 - ret += atomic_read(&page[i]._mapcount) + 1; 2495 - /* File pages has compound_mapcount included in _mapcount */ 2496 - if (!PageAnon(page)) 2497 - return ret - compound * nr; 2498 - if (PageDoubleMap(page)) 2499 - ret -= nr; 2500 - return ret; 2501 2507 } 2502 2508 2503 2509 /* ··· 2525 2579 } 2526 2580 2527 2581 /* Racy check whether the huge page can be split */ 2528 - bool can_split_huge_page(struct page *page, int *pextra_pins) 2582 + bool can_split_folio(struct folio *folio, int *pextra_pins) 2529 2583 { 2530 2584 int extra_pins; 2531 2585 2532 2586 /* Additional pins from page cache */ 2533 - if (PageAnon(page)) 2534 - extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0; 2587 + if (folio_test_anon(folio)) 2588 + extra_pins = folio_test_swapcache(folio) ? 2589 + folio_nr_pages(folio) : 0; 2535 2590 else 2536 - extra_pins = thp_nr_pages(page); 2591 + extra_pins = folio_nr_pages(folio); 2537 2592 if (pextra_pins) 2538 2593 *pextra_pins = extra_pins; 2539 - return total_mapcount(page) == page_count(page) - extra_pins - 1; 2594 + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; 2540 2595 } 2541 2596 2542 2597 /* ··· 2561 2614 */ 2562 2615 int split_huge_page_to_list(struct page *page, struct list_head *list) 2563 2616 { 2564 - struct page *head = compound_head(page); 2617 + struct folio *folio = page_folio(page); 2618 + struct page *head = &folio->page; 2565 2619 struct deferred_split *ds_queue = get_deferred_split_queue(head); 2566 2620 XA_STATE(xas, &head->mapping->i_pages, head->index); 2567 2621 struct anon_vma *anon_vma = NULL; ··· 2582 2634 * The caller does not necessarily hold an mmap_lock that would 2583 2635 * prevent the anon_vma disappearing so we first we take a 2584 2636 * reference to it and then lock the anon_vma for write. 
This 2585 - * is similar to page_lock_anon_vma_read except the write lock 2637 + * is similar to folio_lock_anon_vma_read except the write lock 2586 2638 * is taken to serialise against parallel split or collapse 2587 2639 * operations. 2588 2640 */ ··· 2629 2681 * Racy check if we can split the page, before unmap_page() will 2630 2682 * split PMDs 2631 2683 */ 2632 - if (!can_split_huge_page(head, &extra_pins)) { 2684 + if (!can_split_folio(folio, &extra_pins)) { 2633 2685 ret = -EBUSY; 2634 2686 goto out_unlock; 2635 2687 } ··· 2679 2731 if (mapping) 2680 2732 xas_unlock(&xas); 2681 2733 local_irq_enable(); 2682 - remap_page(head, thp_nr_pages(head)); 2734 + remap_page(folio, folio_nr_pages(folio)); 2683 2735 ret = -EBUSY; 2684 2736 } 2685 2737 ··· 2936 2988 goto next; 2937 2989 2938 2990 total++; 2939 - if (!can_split_huge_page(compound_head(page), NULL)) 2991 + if (!can_split_folio(page_folio(page), NULL)) 2940 2992 goto next; 2941 2993 2942 2994 if (!trylock_page(page)) ··· 3129 3181 if (pmd_soft_dirty(pmdval)) 3130 3182 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 3131 3183 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 3132 - page_remove_rmap(page, true); 3184 + page_remove_rmap(page, vma, true); 3133 3185 put_page(page); 3134 3186 } 3135 3187 ··· 3158 3210 if (PageAnon(new)) 3159 3211 page_add_anon_rmap(new, vma, mmun_start, true); 3160 3212 else 3161 - page_add_file_rmap(new, true); 3213 + page_add_file_rmap(new, vma, true); 3162 3214 set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); 3163 - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) 3164 - mlock_vma_page(new); 3165 3215 3166 3216 /* No need to invalidate - it was non-present before */ 3167 3217 update_mmu_cache_pmd(vma, address, pvmw->pmd);

+9 -6

mm/hugetlb.c

··· 1321 1321 } 1322 1322 1323 1323 set_compound_order(page, 0); 1324 + #ifdef CONFIG_64BIT 1324 1325 page[1].compound_nr = 0; 1326 + #endif 1325 1327 __ClearPageHead(page); 1326 1328 } 1327 1329 ··· 1815 1813 for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) 1816 1814 __ClearPageReserved(p); 1817 1815 set_compound_order(page, 0); 1816 + #ifdef CONFIG_64BIT 1818 1817 page[1].compound_nr = 0; 1818 + #endif 1819 1819 __ClearPageHead(page); 1820 1820 return false; 1821 1821 } ··· 5017 5013 set_page_dirty(page); 5018 5014 5019 5015 hugetlb_count_sub(pages_per_huge_page(h), mm); 5020 - page_remove_rmap(page, true); 5016 + page_remove_rmap(page, vma, true); 5021 5017 5022 5018 spin_unlock(ptl); 5023 5019 tlb_remove_page_size(tlb, page, huge_page_size(h)); ··· 5262 5258 /* Break COW */ 5263 5259 huge_ptep_clear_flush(vma, haddr, ptep); 5264 5260 mmu_notifier_invalidate_range(mm, range.start, range.end); 5265 - page_remove_rmap(old_page, true); 5261 + page_remove_rmap(old_page, vma, true); 5266 5262 hugepage_add_new_anon_rmap(new_page, vma, haddr); 5267 5263 set_huge_pte_at(mm, haddr, ptep, 5268 5264 make_huge_pte(vma, new_page, 1)); ··· 6078 6074 6079 6075 if (pages) { 6080 6076 /* 6081 - * try_grab_compound_head() should always succeed here, 6077 + * try_grab_folio() should always succeed here, 6082 6078 * because: a) we hold the ptl lock, and b) we've just 6083 6079 * checked that the huge page is present in the page 6084 6080 * tables. If the huge page is present, then the tail ··· 6087 6083 * any way. So this page must be available at this 6088 6084 * point, unless the page refcount overflowed: 6089 6085 */ 6090 - if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], 6091 - refs, 6092 - flags))) { 6086 + if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, 6087 + flags))) { 6093 6088 spin_unlock(ptl); 6094 6089 remainder = 0; 6095 6090 err = -ENOMEM;

+79 -38

mm/internal.h

··· 10 10 #include <linux/fs.h> 11 11 #include <linux/mm.h> 12 12 #include <linux/pagemap.h> 13 + #include <linux/rmap.h> 13 14 #include <linux/tracepoint-defs.h> 14 15 15 16 struct folio_batch; ··· 67 66 vm_fault_t do_swap_page(struct vm_fault *vmf); 68 67 void folio_rotate_reclaimable(struct folio *folio); 69 68 bool __folio_end_writeback(struct folio *folio); 69 + void deactivate_file_folio(struct folio *folio); 70 70 71 71 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 72 72 unsigned long floor, unsigned long ceiling); 73 73 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); 74 - 75 - static inline bool can_madv_lru_vma(struct vm_area_struct *vma) 76 - { 77 - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); 78 - } 79 74 80 75 struct zap_details; 81 76 void unmap_page_range(struct mmu_gather *tlb, ··· 79 82 unsigned long addr, unsigned long end, 80 83 struct zap_details *details); 81 84 82 - void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, 83 - unsigned long lookahead_size); 85 + void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, 86 + unsigned int order); 84 87 void force_page_cache_ra(struct readahead_control *, unsigned long nr); 85 88 static inline void force_page_cache_readahead(struct address_space *mapping, 86 89 struct file *file, pgoff_t index, unsigned long nr_to_read) ··· 97 100 int truncate_inode_folio(struct address_space *mapping, struct folio *folio); 98 101 bool truncate_inode_partial_folio(struct folio *folio, loff_t start, 99 102 loff_t end); 103 + long invalidate_inode_page(struct page *page); 104 + unsigned long invalidate_mapping_pagevec(struct address_space *mapping, 105 + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); 100 106 101 107 /** 102 108 * folio_evictable - Test whether a folio is evictable. 
··· 163 163 /* 164 164 * in mm/vmscan.c: 165 165 */ 166 - extern int isolate_lru_page(struct page *page); 167 - extern void putback_lru_page(struct page *page); 166 + int isolate_lru_page(struct page *page); 167 + int folio_isolate_lru(struct folio *folio); 168 + void putback_lru_page(struct page *page); 169 + void folio_putback_lru(struct folio *folio); 168 170 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); 169 171 170 172 /* ··· 398 396 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 399 397 struct vm_area_struct *prev); 400 398 void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); 399 + struct anon_vma *folio_anon_vma(struct folio *folio); 401 400 402 401 #ifdef CONFIG_MMU 403 402 void unmap_mapping_folio(struct folio *folio); ··· 407 404 extern long faultin_vma_page_range(struct vm_area_struct *vma, 408 405 unsigned long start, unsigned long end, 409 406 bool write, int *locked); 410 - extern void munlock_vma_pages_range(struct vm_area_struct *vma, 411 - unsigned long start, unsigned long end); 412 - static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 413 - { 414 - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 415 - } 416 - 417 - /* 418 - * must be called with vma's mmap_lock held for read or write, and page locked. 419 - */ 420 - extern void mlock_vma_page(struct page *page); 421 - extern unsigned int munlock_vma_page(struct page *page); 422 - 423 407 extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, 424 408 unsigned long len); 425 - 426 409 /* 427 - * Clear the page's PageMlocked(). This can be useful in a situation where 428 - * we want to unconditionally remove a page from the pagecache -- e.g., 429 - * on truncation or freeing. 410 + * mlock_vma_page() and munlock_vma_page(): 411 + * should be called with vma's mmap_lock held for read or write, 412 + * under page table lock for the pte/pmd being added or removed. 
430 413 * 431 - * It is legal to call this function for any page, mlocked or not. 432 - * If called for a page that is still mapped by mlocked vmas, all we do 433 - * is revert to lazy LRU behaviour -- semantics are not broken. 414 + * mlock is usually called at the end of page_add_*_rmap(), 415 + * munlock at the end of page_remove_rmap(); but new anon 416 + * pages are managed by lru_cache_add_inactive_or_unevictable() 417 + * calling mlock_new_page(). 418 + * 419 + * @compound is used to include pmd mappings of THPs, but filter out 420 + * pte mappings of THPs, which cannot be consistently counted: a pte 421 + * mapping of the THP head cannot be distinguished by the page alone. 434 422 */ 435 - extern void clear_page_mlock(struct page *page); 423 + void mlock_folio(struct folio *folio); 424 + static inline void mlock_vma_folio(struct folio *folio, 425 + struct vm_area_struct *vma, bool compound) 426 + { 427 + /* 428 + * The VM_SPECIAL check here serves two purposes. 429 + * 1) VM_IO check prevents migration from double-counting during mlock. 430 + * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED 431 + * is never left set on a VM_SPECIAL vma, there is an interval while 432 + * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may 433 + * still be set while VM_SPECIAL bits are added: so ignore it then. 
434 + */ 435 + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && 436 + (compound || !folio_test_large(folio))) 437 + mlock_folio(folio); 438 + } 439 + 440 + static inline void mlock_vma_page(struct page *page, 441 + struct vm_area_struct *vma, bool compound) 442 + { 443 + mlock_vma_folio(page_folio(page), vma, compound); 444 + } 445 + 446 + void munlock_page(struct page *page); 447 + static inline void munlock_vma_page(struct page *page, 448 + struct vm_area_struct *vma, bool compound) 449 + { 450 + if (unlikely(vma->vm_flags & VM_LOCKED) && 451 + (compound || !PageTransCompound(page))) 452 + munlock_page(page); 453 + } 454 + void mlock_new_page(struct page *page); 455 + bool need_mlock_page_drain(int cpu); 456 + void mlock_page_drain(int cpu); 436 457 437 458 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); 438 459 ··· 490 463 } 491 464 492 465 /* 493 - * Then at what user virtual address will none of the page be found in vma? 466 + * Then at what user virtual address will none of the range be found in vma? 494 467 * Assumes that vma_address() already returned a good starting address. 495 - * If page is a compound head, the entire compound page is considered. 496 468 */ 497 - static inline unsigned long 498 - vma_address_end(struct page *page, struct vm_area_struct *vma) 469 + static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) 499 470 { 471 + struct vm_area_struct *vma = pvmw->vma; 500 472 pgoff_t pgoff; 501 473 unsigned long address; 502 474 503 - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ 504 - pgoff = page_to_pgoff(page) + compound_nr(page); 475 + /* Common case, plus ->pgoff is invalid for KSM */ 476 + if (pvmw->nr_pages == 1) 477 + return pvmw->address + PAGE_SIZE; 478 + 479 + pgoff = pvmw->pgoff + pvmw->nr_pages; 505 480 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 506 481 /* Check for address beyond vma (or wrapped through 0?) 
*/ 507 482 if (address < vma->vm_start || address > vma->vm_end) ··· 533 504 } 534 505 #else /* !CONFIG_MMU */ 535 506 static inline void unmap_mapping_folio(struct folio *folio) { } 536 - static inline void clear_page_mlock(struct page *page) { } 537 - static inline void mlock_vma_page(struct page *page) { } 507 + static inline void mlock_vma_page(struct page *page, 508 + struct vm_area_struct *vma, bool compound) { } 509 + static inline void munlock_vma_page(struct page *page, 510 + struct vm_area_struct *vma, bool compound) { } 511 + static inline void mlock_new_page(struct page *page) { } 512 + static inline bool need_mlock_page_drain(int cpu) { return false; } 513 + static inline void mlock_page_drain(int cpu) { } 538 514 static inline void vunmap_range_noflush(unsigned long start, unsigned long end) 539 515 { 540 516 } ··· 746 712 747 713 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 748 714 unsigned long addr, int page_nid, int *flags); 715 + 716 + void free_zone_device_page(struct page *page); 717 + 718 + /* 719 + * mm/gup.c 720 + */ 721 + struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); 749 722 750 723 DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 751 724

+11 -4

mm/khugepaged.c

··· 774 774 */ 775 775 spin_lock(ptl); 776 776 ptep_clear(vma->vm_mm, address, _pte); 777 - page_remove_rmap(src_page, false); 777 + page_remove_rmap(src_page, vma, false); 778 778 spin_unlock(ptl); 779 779 free_page_and_swap_cache(src_page); 780 780 } ··· 1513 1513 if (pte_none(*pte)) 1514 1514 continue; 1515 1515 page = vm_normal_page(vma, addr, *pte); 1516 - page_remove_rmap(page, false); 1516 + page_remove_rmap(page, vma, false); 1517 1517 } 1518 1518 1519 1519 pte_unmap_unlock(start_pte, ptl); ··· 1834 1834 } 1835 1835 1836 1836 if (page_mapped(page)) 1837 - unmap_mapping_pages(mapping, index, 1, false); 1837 + try_to_unmap(page_folio(page), 1838 + TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); 1838 1839 1839 1840 xas_lock_irq(&xas); 1840 1841 xas_set(&xas, index); 1841 1842 1842 1843 VM_BUG_ON_PAGE(page != xas_load(&xas), page); 1843 - VM_BUG_ON_PAGE(page_mapped(page), page); 1844 1844 1845 1845 /* 1846 1846 * The page is expected to have page_count() == 3: ··· 1903 1903 xa_locked: 1904 1904 xas_unlock_irq(&xas); 1905 1905 xa_unlocked: 1906 + 1907 + /* 1908 + * If collapse is successful, flush must be done now before copying. 1909 + * If collapse is unsuccessful, does flush actually need to be done? 1910 + * Do it anyway, to clear the state. 1911 + */ 1912 + try_to_unmap_flush(); 1906 1913 1907 1914 if (result == SCAN_SUCCEED) { 1908 1915 struct page *page, *tmp;

+10 -22

mm/ksm.c

··· 1034 1034 pte_t *orig_pte) 1035 1035 { 1036 1036 struct mm_struct *mm = vma->vm_mm; 1037 - struct page_vma_mapped_walk pvmw = { 1038 - .page = page, 1039 - .vma = vma, 1040 - }; 1037 + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); 1041 1038 int swapped; 1042 1039 int err = -EFAULT; 1043 1040 struct mmu_notifier_range range; ··· 1174 1177 ptep_clear_flush(vma, addr, ptep); 1175 1178 set_pte_at_notify(mm, addr, ptep, newpte); 1176 1179 1177 - page_remove_rmap(page, false); 1180 + page_remove_rmap(page, vma, false); 1178 1181 if (!page_mapped(page)) 1179 1182 try_to_free_swap(page); 1180 1183 put_page(page); ··· 1247 1250 err = 0; 1248 1251 } else if (pages_identical(page, kpage)) 1249 1252 err = replace_page(vma, page, kpage, orig_pte); 1250 - } 1251 - 1252 - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { 1253 - munlock_vma_page(page); 1254 - if (!PageMlocked(kpage)) { 1255 - unlock_page(page); 1256 - lock_page(kpage); 1257 - mlock_vma_page(kpage); 1258 - page = kpage; /* for final unlock */ 1259 - } 1260 1253 } 1261 1254 1262 1255 out_unlock: ··· 2554 2567 struct page *ksm_might_need_to_copy(struct page *page, 2555 2568 struct vm_area_struct *vma, unsigned long address) 2556 2569 { 2557 - struct anon_vma *anon_vma = page_anon_vma(page); 2570 + struct folio *folio = page_folio(page); 2571 + struct anon_vma *anon_vma = folio_anon_vma(folio); 2558 2572 struct page *new_page; 2559 2573 2560 2574 if (PageKsm(page)) { ··· 2591 2603 return new_page; 2592 2604 } 2593 2605 2594 - void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) 2606 + void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc) 2595 2607 { 2596 2608 struct stable_node *stable_node; 2597 2609 struct rmap_item *rmap_item; 2598 2610 int search_new_forks = 0; 2599 2611 2600 - VM_BUG_ON_PAGE(!PageKsm(page), page); 2612 + VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); 2601 2613 2602 2614 /* 2603 2615 * Rely on the page lock to protect against concurrent modifications 
2604 2616 * to that page's node of the stable tree. 2605 2617 */ 2606 - VM_BUG_ON_PAGE(!PageLocked(page), page); 2618 + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2607 2619 2608 - stable_node = page_stable_node(page); 2620 + stable_node = folio_stable_node(folio); 2609 2621 if (!stable_node) 2610 2622 return; 2611 2623 again: ··· 2640 2652 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 2641 2653 continue; 2642 2654 2643 - if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { 2655 + if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { 2644 2656 anon_vma_unlock_read(anon_vma); 2645 2657 return; 2646 2658 } 2647 - if (rwc->done && rwc->done(page)) { 2659 + if (rwc->done && rwc->done(folio)) { 2648 2660 anon_vma_unlock_read(anon_vma); 2649 2661 return; 2650 2662 }

+5

mm/madvise.c

··· 502 502 tlb_end_vma(tlb, vma); 503 503 } 504 504 505 + static inline bool can_madv_lru_vma(struct vm_area_struct *vma) 506 + { 507 + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); 508 + } 509 + 505 510 static long madvise_cold(struct vm_area_struct *vma, 506 511 struct vm_area_struct **prev, 507 512 unsigned long start_addr, unsigned long end_addr)

+16 -21

mm/memcontrol.c

··· 53 53 #include <linux/fs.h> 54 54 #include <linux/seq_file.h> 55 55 #include <linux/vmpressure.h> 56 + #include <linux/memremap.h> 56 57 #include <linux/mm_inline.h> 57 58 #include <linux/swap_cgroup.h> 58 59 #include <linux/cpu.h> ··· 1272 1271 * @nr_pages: positive when adding or negative when removing 1273 1272 * 1274 1273 * This function must be called under lru_lock, just before a page is added 1275 - * to or just after a page is removed from an lru list (that ordering being 1276 - * so as to allow it to check that lru_size 0 is consistent with list_empty). 1274 + * to or just after a page is removed from an lru list. 1277 1275 */ 1278 1276 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1279 1277 int zid, int nr_pages) ··· 5436 5436 return NULL; 5437 5437 5438 5438 /* 5439 - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to 5440 - * a device and because they are not accessible by CPU they are store 5441 - * as special swap entry in the CPU page table. 5439 + * Handle device private pages that are not accessible by the CPU, but 5440 + * stored as special swap entries in the page table. 5442 5441 */ 5443 5442 if (is_device_private_entry(ent)) { 5444 5443 page = pfn_swap_entry_to_page(ent); 5445 - /* 5446 - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have 5447 - * a refcount of 1 when free (unlike normal page) 5448 - */ 5449 - if (!page_ref_add_unless(page, 1, 1)) 5444 + if (!get_page_unless_zero(page)) 5450 5445 return NULL; 5451 5446 return page; 5452 5447 } ··· 7048 7053 7049 7054 /** 7050 7055 * mem_cgroup_swapout - transfer a memsw charge to swap 7051 - * @page: page whose memsw charge to transfer 7056 + * @folio: folio whose memsw charge to transfer 7052 7057 * @entry: swap entry to move the charge to 7053 7058 * 7054 - * Transfer the memsw charge of @page to @entry. 7059 + * Transfer the memsw charge of @folio to @entry. 
7055 7060 */ 7056 - void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 7061 + void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) 7057 7062 { 7058 7063 struct mem_cgroup *memcg, *swap_memcg; 7059 7064 unsigned int nr_entries; 7060 7065 unsigned short oldid; 7061 7066 7062 - VM_BUG_ON_PAGE(PageLRU(page), page); 7063 - VM_BUG_ON_PAGE(page_count(page), page); 7067 + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 7068 + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 7064 7069 7065 7070 if (mem_cgroup_disabled()) 7066 7071 return; ··· 7068 7073 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 7069 7074 return; 7070 7075 7071 - memcg = page_memcg(page); 7076 + memcg = folio_memcg(folio); 7072 7077 7073 - VM_WARN_ON_ONCE_PAGE(!memcg, page); 7078 + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 7074 7079 if (!memcg) 7075 7080 return; 7076 7081 ··· 7080 7085 * ancestor for the swap instead and transfer the memory+swap charge. 7081 7086 */ 7082 7087 swap_memcg = mem_cgroup_id_get_online(memcg); 7083 - nr_entries = thp_nr_pages(page); 7088 + nr_entries = folio_nr_pages(folio); 7084 7089 /* Get references for the tail pages, too */ 7085 7090 if (nr_entries > 1) 7086 7091 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 7087 7092 oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 7088 7093 nr_entries); 7089 - VM_BUG_ON_PAGE(oldid, page); 7094 + VM_BUG_ON_FOLIO(oldid, folio); 7090 7095 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 7091 7096 7092 - page->memcg_data = 0; 7097 + folio->memcg_data = 0; 7093 7098 7094 7099 if (!mem_cgroup_is_root(memcg)) 7095 7100 page_counter_uncharge(&memcg->memory, nr_entries); ··· 7109 7114 memcg_stats_lock(); 7110 7115 mem_cgroup_charge_statistics(memcg, -nr_entries); 7111 7116 memcg_stats_unlock(); 7112 - memcg_check_events(memcg, page_to_nid(page)); 7117 + memcg_check_events(memcg, folio_nid(folio)); 7113 7118 7114 7119 css_put(&memcg->css); 7115 7120 }

+7 -5

mm/memory-failure.c

··· 478 478 static void collect_procs_anon(struct page *page, struct list_head *to_kill, 479 479 int force_early) 480 480 { 481 + struct folio *folio = page_folio(page); 481 482 struct vm_area_struct *vma; 482 483 struct task_struct *tsk; 483 484 struct anon_vma *av; 484 485 pgoff_t pgoff; 485 486 486 - av = page_lock_anon_vma_read(page); 487 + av = folio_lock_anon_vma_read(folio); 487 488 if (av == NULL) /* Not actually mapped anymore */ 488 489 return; 489 490 ··· 1348 1347 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, 1349 1348 int flags, struct page *hpage) 1350 1349 { 1350 + struct folio *folio = page_folio(hpage); 1351 1351 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; 1352 1352 struct address_space *mapping; 1353 1353 LIST_HEAD(tokill); ··· 1423 1421 */ 1424 1422 mapping = hugetlb_page_mapping_lock_write(hpage); 1425 1423 if (mapping) { 1426 - try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED); 1424 + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); 1427 1425 i_mmap_unlock_write(mapping); 1428 1426 } else 1429 1427 pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); 1430 1428 } else { 1431 - try_to_unmap(hpage, ttu); 1429 + try_to_unmap(folio, ttu); 1432 1430 } 1433 1431 1434 1432 unmap_success = !page_mapped(hpage); ··· 2171 2169 */ 2172 2170 static int __soft_offline_page(struct page *page) 2173 2171 { 2174 - int ret = 0; 2172 + long ret = 0; 2175 2173 unsigned long pfn = page_to_pfn(page); 2176 2174 struct page *hpage = compound_head(page); 2177 2175 char const *msg_page[] = {"page", "hugepage"}; ··· 2218 2216 if (!list_empty(&pagelist)) 2219 2217 putback_movable_pages(&pagelist); 2220 2218 2221 - pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n", 2219 + pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", 2222 2220 pfn, msg_page[huge], ret, &page->flags); 2223 2221 if (ret > 0) 2224 2222 ret = -EBUSY;

+15 -28

mm/memory.c

··· 735 735 736 736 set_pte_at(vma->vm_mm, address, ptep, pte); 737 737 738 - if (vma->vm_flags & VM_LOCKED) 739 - mlock_vma_page(page); 740 - 741 738 /* 742 739 * No need to invalidate - it was non-present before. However 743 740 * secondary CPUs may have mappings that need invalidating. ··· 1386 1389 mark_page_accessed(page); 1387 1390 } 1388 1391 rss[mm_counter(page)]--; 1389 - page_remove_rmap(page, false); 1392 + page_remove_rmap(page, vma, false); 1390 1393 if (unlikely(page_mapcount(page) < 0)) 1391 1394 print_bad_pte(vma, addr, ptent, page); 1392 1395 if (unlikely(__tlb_remove_page(tlb, page))) { ··· 1405 1408 continue; 1406 1409 rss[mm_counter(page)]--; 1407 1410 if (is_device_private_entry(entry)) 1408 - page_remove_rmap(page, false); 1411 + page_remove_rmap(page, vma, false); 1409 1412 put_page(page); 1410 1413 } else if (!non_swap_entry(entry)) { 1411 1414 /* Genuine swap entry, hence a private anon page */ ··· 1760 1763 return 0; 1761 1764 } 1762 1765 1763 - static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, 1766 + static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, 1764 1767 unsigned long addr, struct page *page, pgprot_t prot) 1765 1768 { 1766 1769 if (!pte_none(*pte)) 1767 1770 return -EBUSY; 1768 1771 /* Ok, finally just insert the thing.. 
*/ 1769 1772 get_page(page); 1770 - inc_mm_counter_fast(mm, mm_counter_file(page)); 1771 - page_add_file_rmap(page, false); 1772 - set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1773 + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 1774 + page_add_file_rmap(page, vma, false); 1775 + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); 1773 1776 return 0; 1774 1777 } 1775 1778 ··· 1783 1786 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 1784 1787 struct page *page, pgprot_t prot) 1785 1788 { 1786 - struct mm_struct *mm = vma->vm_mm; 1787 1789 int retval; 1788 1790 pte_t *pte; 1789 1791 spinlock_t *ptl; ··· 1791 1795 if (retval) 1792 1796 goto out; 1793 1797 retval = -ENOMEM; 1794 - pte = get_locked_pte(mm, addr, &ptl); 1798 + pte = get_locked_pte(vma->vm_mm, addr, &ptl); 1795 1799 if (!pte) 1796 1800 goto out; 1797 - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); 1801 + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); 1798 1802 pte_unmap_unlock(pte, ptl); 1799 1803 out: 1800 1804 return retval; 1801 1805 } 1802 1806 1803 1807 #ifdef pte_index 1804 - static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, 1808 + static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, 1805 1809 unsigned long addr, struct page *page, pgprot_t prot) 1806 1810 { 1807 1811 int err; ··· 1811 1815 err = validate_page_before_insert(page); 1812 1816 if (err) 1813 1817 return err; 1814 - return insert_page_into_pte_locked(mm, pte, addr, page, prot); 1818 + return insert_page_into_pte_locked(vma, pte, addr, page, prot); 1815 1819 } 1816 1820 1817 1821 /* insert_pages() amortizes the cost of spinlock operations ··· 1848 1852 1849 1853 start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); 1850 1854 for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { 1851 - int err = insert_page_in_batch_locked(mm, pte, 1855 + int err = insert_page_in_batch_locked(vma, pte, 1852 1856 addr, 
pages[curr_page_idx], prot); 1853 1857 if (unlikely(err)) { 1854 1858 pte_unmap_unlock(start_pte, pte_lock); ··· 3104 3108 * mapcount is visible. So transitively, TLBs to 3105 3109 * old page will be flushed before it can be reused. 3106 3110 */ 3107 - page_remove_rmap(old_page, false); 3111 + page_remove_rmap(old_page, vma, false); 3108 3112 } 3109 3113 3110 3114 /* Free the old page.. */ ··· 3124 3128 */ 3125 3129 mmu_notifier_invalidate_range_only_end(&range); 3126 3130 if (old_page) { 3127 - /* 3128 - * Don't let another task, with possibly unlocked vma, 3129 - * keep the mlocked page. 3130 - */ 3131 - if (page_copied && (vma->vm_flags & VM_LOCKED)) { 3132 - lock_page(old_page); /* LRU manipulation */ 3133 - if (PageMlocked(old_page)) 3134 - munlock_vma_page(old_page); 3135 - unlock_page(old_page); 3136 - } 3137 3131 if (page_copied) 3138 3132 free_swap_cache(old_page); 3139 3133 put_page(old_page); ··· 3944 3958 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 3945 3959 3946 3960 add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); 3947 - page_add_file_rmap(page, true); 3961 + page_add_file_rmap(page, vma, true); 3962 + 3948 3963 /* 3949 3964 * deposit and withdraw with pmd lock held 3950 3965 */ ··· 3994 4007 lru_cache_add_inactive_or_unevictable(page, vma); 3995 4008 } else { 3996 4009 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 3997 - page_add_file_rmap(page, false); 4010 + page_add_file_rmap(page, vma, false); 3998 4011 } 3999 4012 set_pte_at(vma->vm_mm, addr, vmf->pte, entry); 4000 4013 }

+8 -5

mm/memory_hotplug.c

··· 1617 1617 DEFAULT_RATELIMIT_BURST); 1618 1618 1619 1619 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1620 + struct folio *folio; 1621 + 1620 1622 if (!pfn_valid(pfn)) 1621 1623 continue; 1622 1624 page = pfn_to_page(pfn); 1623 - head = compound_head(page); 1625 + folio = page_folio(page); 1626 + head = &folio->page; 1624 1627 1625 1628 if (PageHuge(page)) { 1626 1629 pfn = page_to_pfn(head) + compound_nr(head) - 1; ··· 1640 1637 * the unmap as the catch all safety net). 1641 1638 */ 1642 1639 if (PageHWPoison(page)) { 1643 - if (WARN_ON(PageLRU(page))) 1644 - isolate_lru_page(page); 1645 - if (page_mapped(page)) 1646 - try_to_unmap(page, TTU_IGNORE_MLOCK); 1640 + if (WARN_ON(folio_test_lru(folio))) 1641 + folio_isolate_lru(folio); 1642 + if (folio_mapped(folio)) 1643 + try_to_unmap(folio, TTU_IGNORE_MLOCK); 1647 1644 continue; 1648 1645 } 1649 1646

+34 -31

mm/memremap.c

··· 4 4 #include <linux/io.h> 5 5 #include <linux/kasan.h> 6 6 #include <linux/memory_hotplug.h> 7 - #include <linux/mm.h> 7 + #include <linux/memremap.h> 8 8 #include <linux/pfn_t.h> 9 9 #include <linux/swap.h> 10 10 #include <linux/mmzone.h> ··· 12 12 #include <linux/types.h> 13 13 #include <linux/wait_bit.h> 14 14 #include <linux/xarray.h> 15 + #include "internal.h" 15 16 16 17 static DEFINE_XARRAY(pgmap_array); 17 18 ··· 38 37 EXPORT_SYMBOL_GPL(memremap_compat_align); 39 38 #endif 40 39 41 - #ifdef CONFIG_DEV_PAGEMAP_OPS 40 + #ifdef CONFIG_FS_DAX 42 41 DEFINE_STATIC_KEY_FALSE(devmap_managed_key); 43 42 EXPORT_SYMBOL(devmap_managed_key); 44 43 45 44 static void devmap_managed_enable_put(struct dev_pagemap *pgmap) 46 45 { 47 - if (pgmap->type == MEMORY_DEVICE_PRIVATE || 48 - pgmap->type == MEMORY_DEVICE_FS_DAX) 46 + if (pgmap->type == MEMORY_DEVICE_FS_DAX) 49 47 static_branch_dec(&devmap_managed_key); 50 48 } 51 49 52 50 static void devmap_managed_enable_get(struct dev_pagemap *pgmap) 53 51 { 54 - if (pgmap->type == MEMORY_DEVICE_PRIVATE || 55 - pgmap->type == MEMORY_DEVICE_FS_DAX) 52 + if (pgmap->type == MEMORY_DEVICE_FS_DAX) 56 53 static_branch_inc(&devmap_managed_key); 57 54 } 58 55 #else ··· 60 61 static void devmap_managed_enable_put(struct dev_pagemap *pgmap) 61 62 { 62 63 } 63 - #endif /* CONFIG_DEV_PAGEMAP_OPS */ 64 + #endif /* CONFIG_FS_DAX */ 64 65 65 66 static void pgmap_array_delete(struct range *range) 66 67 { ··· 101 102 return (range->start + range_len(range)) >> PAGE_SHIFT; 102 103 } 103 104 104 - static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) 105 - { 106 - if (pfn % (1024 << pgmap->vmemmap_shift)) 107 - cond_resched(); 108 - return pfn + pgmap_vmemmap_nr(pgmap); 109 - } 110 - 111 105 static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) 112 106 { 113 107 return (pfn_end(pgmap, range_id) - 114 108 pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; 115 109 } 116 - 117 - #define 
for_each_device_pfn(pfn, map, i) \ 118 - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ 119 - pfn = pfn_next(map, pfn)) 120 110 121 111 static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) 122 112 { ··· 135 147 136 148 void memunmap_pages(struct dev_pagemap *pgmap) 137 149 { 138 - unsigned long pfn; 139 150 int i; 140 151 141 152 percpu_ref_kill(&pgmap->ref); 142 153 for (i = 0; i < pgmap->nr_range; i++) 143 - for_each_device_pfn(pfn, pgmap, i) 144 - put_page(pfn_to_page(pfn)); 154 + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); 145 155 wait_for_completion(&pgmap->done); 146 156 percpu_ref_exit(&pgmap->ref); 147 157 ··· 315 329 } 316 330 break; 317 331 case MEMORY_DEVICE_FS_DAX: 318 - if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || 319 - IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { 332 + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { 320 333 WARN(1, "File system DAX not supported\n"); 321 334 return ERR_PTR(-EINVAL); 322 335 } ··· 451 466 } 452 467 EXPORT_SYMBOL_GPL(get_dev_pagemap); 453 468 454 - #ifdef CONFIG_DEV_PAGEMAP_OPS 455 - void free_devmap_managed_page(struct page *page) 469 + void free_zone_device_page(struct page *page) 456 470 { 457 - /* notify page idle for dax */ 458 - if (!is_device_private_page(page)) { 459 - wake_up_var(&page->_refcount); 471 + if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free)) 460 472 return; 461 - } 462 473 463 474 __ClearPageWaiters(page); 464 475 465 476 mem_cgroup_uncharge(page_folio(page)); 466 477 467 478 /* 468 - * When a device_private page is freed, the page->mapping field 479 + * When a device managed page is freed, the page->mapping field 469 480 * may still contain a (stale) mapping value. For example, the 470 481 * lower bits of page->mapping may still identify the page as an 471 482 * anonymous page. 
Ultimately, this entire field is just stale ··· 483 502 */ 484 503 page->mapping = NULL; 485 504 page->pgmap->ops->page_free(page); 505 + 506 + /* 507 + * Reset the page count to 1 to prepare for handing out the page again. 508 + */ 509 + set_page_count(page, 1); 486 510 } 487 - #endif /* CONFIG_DEV_PAGEMAP_OPS */ 511 + 512 + #ifdef CONFIG_FS_DAX 513 + bool __put_devmap_managed_page(struct page *page) 514 + { 515 + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) 516 + return false; 517 + 518 + /* 519 + * fsdax page refcounts are 1-based, rather than 0-based: if 520 + * refcount is 1, then the page is free and the refcount is 521 + * stable because nobody holds a reference on the page. 522 + */ 523 + if (page_ref_dec_return(page) == 1) 524 + wake_up_var(&page->_refcount); 525 + return true; 526 + } 527 + EXPORT_SYMBOL(__put_devmap_managed_page); 528 + #endif /* CONFIG_FS_DAX */

+56 -814

mm/migrate.c

··· 38 38 #include <linux/hugetlb.h> 39 39 #include <linux/hugetlb_cgroup.h> 40 40 #include <linux/gfp.h> 41 - #include <linux/pagewalk.h> 42 41 #include <linux/pfn_t.h> 43 42 #include <linux/memremap.h> 44 43 #include <linux/userfaultfd_k.h> 45 44 #include <linux/balloon_compaction.h> 46 - #include <linux/mmu_notifier.h> 47 45 #include <linux/page_idle.h> 48 46 #include <linux/page_owner.h> 49 47 #include <linux/sched/mm.h> ··· 172 174 /* 173 175 * Restore a potential migration pte to a working pte entry 174 176 */ 175 - static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, 176 - unsigned long addr, void *old) 177 + static bool remove_migration_pte(struct folio *folio, 178 + struct vm_area_struct *vma, unsigned long addr, void *old) 177 179 { 178 - struct page_vma_mapped_walk pvmw = { 179 - .page = old, 180 - .vma = vma, 181 - .address = addr, 182 - .flags = PVMW_SYNC | PVMW_MIGRATION, 183 - }; 184 - struct page *new; 185 - pte_t pte; 186 - swp_entry_t entry; 180 + DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); 187 181 188 - VM_BUG_ON_PAGE(PageTail(page), page); 189 182 while (page_vma_mapped_walk(&pvmw)) { 190 - if (PageKsm(page)) 191 - new = page; 192 - else 193 - new = page - pvmw.page->index + 194 - linear_page_index(vma, pvmw.address); 183 + pte_t pte; 184 + swp_entry_t entry; 185 + struct page *new; 186 + unsigned long idx = 0; 187 + 188 + /* pgoff is invalid for ksm pages, but they are never large */ 189 + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) 190 + idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; 191 + new = folio_page(folio, idx); 195 192 196 193 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 197 194 /* PMD-mapped THP migration entry */ 198 195 if (!pvmw.pte) { 199 - VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 196 + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || 197 + !folio_test_pmd_mappable(folio), folio); 200 198 remove_migration_pmd(&pvmw, new); 201 199 
continue; 202 200 } 203 201 #endif 204 202 205 - get_page(new); 203 + folio_get(folio); 206 204 pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); 207 205 if (pte_swp_soft_dirty(*pvmw.pte)) 208 206 pte = pte_mksoft_dirty(pte); ··· 227 233 } 228 234 229 235 #ifdef CONFIG_HUGETLB_PAGE 230 - if (PageHuge(new)) { 236 + if (folio_test_hugetlb(folio)) { 231 237 unsigned int shift = huge_page_shift(hstate_vma(vma)); 232 238 233 239 pte = pte_mkhuge(pte); 234 240 pte = arch_make_huge_pte(pte, shift, vma->vm_flags); 235 - if (PageAnon(new)) 241 + if (folio_test_anon(folio)) 236 242 hugepage_add_anon_rmap(new, vma, pvmw.address); 237 243 else 238 244 page_dup_rmap(new, true); ··· 240 246 } else 241 247 #endif 242 248 { 243 - if (PageAnon(new)) 249 + if (folio_test_anon(folio)) 244 250 page_add_anon_rmap(new, vma, pvmw.address, false); 245 251 else 246 - page_add_file_rmap(new, false); 252 + page_add_file_rmap(new, vma, false); 247 253 set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 248 254 } 249 - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 250 - mlock_vma_page(new); 251 - 252 - if (PageTransHuge(page) && PageMlocked(page)) 253 - clear_page_mlock(page); 255 + if (vma->vm_flags & VM_LOCKED) 256 + mlock_page_drain(smp_processor_id()); 254 257 255 258 /* No need to invalidate - it was non-present before */ 256 259 update_mmu_cache(vma, pvmw.address, pvmw.pte); ··· 260 269 * Get rid of all migration entries and replace them by 261 270 * references to the indicated page. 
262 271 */ 263 - void remove_migration_ptes(struct page *old, struct page *new, bool locked) 272 + void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) 264 273 { 265 274 struct rmap_walk_control rwc = { 266 275 .rmap_one = remove_migration_pte, 267 - .arg = old, 276 + .arg = src, 268 277 }; 269 278 270 279 if (locked) 271 - rmap_walk_locked(new, &rwc); 280 + rmap_walk_locked(dst, &rwc); 272 281 else 273 - rmap_walk(new, &rwc); 282 + rmap_walk(dst, &rwc); 274 283 } 275 284 276 285 /* ··· 333 342 { 334 343 int expected_count = 1; 335 344 336 - /* 337 - * Device private pages have an extra refcount as they are 338 - * ZONE_DEVICE pages. 339 - */ 340 - expected_count += is_device_private_page(page); 341 345 if (mapping) 342 346 expected_count += compound_nr(page) + page_has_private(page); 343 - 344 347 return expected_count; 345 348 } 346 349 ··· 757 772 */ 758 773 static int writeout(struct address_space *mapping, struct page *page) 759 774 { 775 + struct folio *folio = page_folio(page); 760 776 struct writeback_control wbc = { 761 777 .sync_mode = WB_SYNC_NONE, 762 778 .nr_to_write = 1, ··· 783 797 * At this point we know that the migration attempt cannot 784 798 * be successful. 
785 799 */ 786 - remove_migration_ptes(page, page, false); 800 + remove_migration_ptes(folio, folio, false); 787 801 788 802 rc = mapping->a_ops->writepage(page, &wbc); 789 803 ··· 913 927 static int __unmap_and_move(struct page *page, struct page *newpage, 914 928 int force, enum migrate_mode mode) 915 929 { 930 + struct folio *folio = page_folio(page); 931 + struct folio *dst = page_folio(newpage); 916 932 int rc = -EAGAIN; 917 933 bool page_was_mapped = false; 918 934 struct anon_vma *anon_vma = NULL; ··· 1018 1030 /* Establish migration ptes */ 1019 1031 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, 1020 1032 page); 1021 - try_to_migrate(page, 0); 1033 + try_to_migrate(folio, 0); 1022 1034 page_was_mapped = true; 1023 1035 } 1024 1036 1025 1037 if (!page_mapped(page)) 1026 1038 rc = move_to_new_page(newpage, page, mode); 1027 1039 1040 + /* 1041 + * When successful, push newpage to LRU immediately: so that if it 1042 + * turns out to be an mlocked page, remove_migration_ptes() will 1043 + * automatically build up the correct newpage->mlock_count for it. 1044 + * 1045 + * We would like to do something similar for the old page, when 1046 + * unsuccessful, and other cases when a page has been temporarily 1047 + * isolated from the unevictable LRU: but this case is the easiest. 1048 + */ 1049 + if (rc == MIGRATEPAGE_SUCCESS) { 1050 + lru_cache_add(newpage); 1051 + if (page_was_mapped) 1052 + lru_add_drain(); 1053 + } 1054 + 1028 1055 if (page_was_mapped) 1029 - remove_migration_ptes(page, 1030 - rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); 1056 + remove_migration_ptes(folio, 1057 + rc == MIGRATEPAGE_SUCCESS ? 
dst : folio, false); 1031 1058 1032 1059 out_unlock_both: 1033 1060 unlock_page(newpage); ··· 1053 1050 unlock_page(page); 1054 1051 out: 1055 1052 /* 1056 - * If migration is successful, decrease refcount of the newpage 1053 + * If migration is successful, decrease refcount of the newpage, 1057 1054 * which will not free the page because new page owner increased 1058 - * refcounter. As well, if it is LRU page, add the page to LRU 1059 - * list in here. Use the old state of the isolated source page to 1060 - * determine if we migrated a LRU page. newpage was already unlocked 1061 - * and possibly modified by its owner - don't rely on the page 1062 - * state. 1055 + * refcounter. 1063 1056 */ 1064 - if (rc == MIGRATEPAGE_SUCCESS) { 1065 - if (unlikely(!is_lru)) 1066 - put_page(newpage); 1067 - else 1068 - putback_lru_page(newpage); 1069 - } 1057 + if (rc == MIGRATEPAGE_SUCCESS) 1058 + put_page(newpage); 1070 1059 1071 1060 return rc; 1072 1061 } ··· 1168 1173 enum migrate_mode mode, int reason, 1169 1174 struct list_head *ret) 1170 1175 { 1176 + struct folio *dst, *src = page_folio(hpage); 1171 1177 int rc = -EAGAIN; 1172 1178 int page_was_mapped = 0; 1173 1179 struct page *new_hpage; ··· 1196 1200 new_hpage = get_new_page(hpage, private); 1197 1201 if (!new_hpage) 1198 1202 return -ENOMEM; 1203 + dst = page_folio(new_hpage); 1199 1204 1200 1205 if (!trylock_page(hpage)) { 1201 1206 if (!force) ··· 1246 1249 ttu |= TTU_RMAP_LOCKED; 1247 1250 } 1248 1251 1249 - try_to_migrate(hpage, ttu); 1252 + try_to_migrate(src, ttu); 1250 1253 page_was_mapped = 1; 1251 1254 1252 1255 if (mapping_locked) ··· 1257 1260 rc = move_to_new_page(new_hpage, hpage, mode); 1258 1261 1259 1262 if (page_was_mapped) 1260 - remove_migration_ptes(hpage, 1261 - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); 1263 + remove_migration_ptes(src, 1264 + rc == MIGRATEPAGE_SUCCESS ? 
dst : src, false); 1262 1265 1263 1266 unlock_put_anon: 1264 1267 unlock_page(new_hpage); ··· 2143 2146 } 2144 2147 #endif /* CONFIG_NUMA_BALANCING */ 2145 2148 #endif /* CONFIG_NUMA */ 2146 - 2147 - #ifdef CONFIG_DEVICE_PRIVATE 2148 - static int migrate_vma_collect_skip(unsigned long start, 2149 - unsigned long end, 2150 - struct mm_walk *walk) 2151 - { 2152 - struct migrate_vma *migrate = walk->private; 2153 - unsigned long addr; 2154 - 2155 - for (addr = start; addr < end; addr += PAGE_SIZE) { 2156 - migrate->dst[migrate->npages] = 0; 2157 - migrate->src[migrate->npages++] = 0; 2158 - } 2159 - 2160 - return 0; 2161 - } 2162 - 2163 - static int migrate_vma_collect_hole(unsigned long start, 2164 - unsigned long end, 2165 - __always_unused int depth, 2166 - struct mm_walk *walk) 2167 - { 2168 - struct migrate_vma *migrate = walk->private; 2169 - unsigned long addr; 2170 - 2171 - /* Only allow populating anonymous memory. */ 2172 - if (!vma_is_anonymous(walk->vma)) 2173 - return migrate_vma_collect_skip(start, end, walk); 2174 - 2175 - for (addr = start; addr < end; addr += PAGE_SIZE) { 2176 - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 2177 - migrate->dst[migrate->npages] = 0; 2178 - migrate->npages++; 2179 - migrate->cpages++; 2180 - } 2181 - 2182 - return 0; 2183 - } 2184 - 2185 - static int migrate_vma_collect_pmd(pmd_t *pmdp, 2186 - unsigned long start, 2187 - unsigned long end, 2188 - struct mm_walk *walk) 2189 - { 2190 - struct migrate_vma *migrate = walk->private; 2191 - struct vm_area_struct *vma = walk->vma; 2192 - struct mm_struct *mm = vma->vm_mm; 2193 - unsigned long addr = start, unmapped = 0; 2194 - spinlock_t *ptl; 2195 - pte_t *ptep; 2196 - 2197 - again: 2198 - if (pmd_none(*pmdp)) 2199 - return migrate_vma_collect_hole(start, end, -1, walk); 2200 - 2201 - if (pmd_trans_huge(*pmdp)) { 2202 - struct page *page; 2203 - 2204 - ptl = pmd_lock(mm, pmdp); 2205 - if (unlikely(!pmd_trans_huge(*pmdp))) { 2206 - spin_unlock(ptl); 2207 - goto again; 
2208 - } 2209 - 2210 - page = pmd_page(*pmdp); 2211 - if (is_huge_zero_page(page)) { 2212 - spin_unlock(ptl); 2213 - split_huge_pmd(vma, pmdp, addr); 2214 - if (pmd_trans_unstable(pmdp)) 2215 - return migrate_vma_collect_skip(start, end, 2216 - walk); 2217 - } else { 2218 - int ret; 2219 - 2220 - get_page(page); 2221 - spin_unlock(ptl); 2222 - if (unlikely(!trylock_page(page))) 2223 - return migrate_vma_collect_skip(start, end, 2224 - walk); 2225 - ret = split_huge_page(page); 2226 - unlock_page(page); 2227 - put_page(page); 2228 - if (ret) 2229 - return migrate_vma_collect_skip(start, end, 2230 - walk); 2231 - if (pmd_none(*pmdp)) 2232 - return migrate_vma_collect_hole(start, end, -1, 2233 - walk); 2234 - } 2235 - } 2236 - 2237 - if (unlikely(pmd_bad(*pmdp))) 2238 - return migrate_vma_collect_skip(start, end, walk); 2239 - 2240 - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2241 - arch_enter_lazy_mmu_mode(); 2242 - 2243 - for (; addr < end; addr += PAGE_SIZE, ptep++) { 2244 - unsigned long mpfn = 0, pfn; 2245 - struct page *page; 2246 - swp_entry_t entry; 2247 - pte_t pte; 2248 - 2249 - pte = *ptep; 2250 - 2251 - if (pte_none(pte)) { 2252 - if (vma_is_anonymous(vma)) { 2253 - mpfn = MIGRATE_PFN_MIGRATE; 2254 - migrate->cpages++; 2255 - } 2256 - goto next; 2257 - } 2258 - 2259 - if (!pte_present(pte)) { 2260 - /* 2261 - * Only care about unaddressable device page special 2262 - * page table entry. Other special swap entries are not 2263 - * migratable, and we ignore regular swapped page. 
2264 - */ 2265 - entry = pte_to_swp_entry(pte); 2266 - if (!is_device_private_entry(entry)) 2267 - goto next; 2268 - 2269 - page = pfn_swap_entry_to_page(entry); 2270 - if (!(migrate->flags & 2271 - MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || 2272 - page->pgmap->owner != migrate->pgmap_owner) 2273 - goto next; 2274 - 2275 - mpfn = migrate_pfn(page_to_pfn(page)) | 2276 - MIGRATE_PFN_MIGRATE; 2277 - if (is_writable_device_private_entry(entry)) 2278 - mpfn |= MIGRATE_PFN_WRITE; 2279 - } else { 2280 - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) 2281 - goto next; 2282 - pfn = pte_pfn(pte); 2283 - if (is_zero_pfn(pfn)) { 2284 - mpfn = MIGRATE_PFN_MIGRATE; 2285 - migrate->cpages++; 2286 - goto next; 2287 - } 2288 - page = vm_normal_page(migrate->vma, addr, pte); 2289 - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 2290 - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 2291 - } 2292 - 2293 - /* FIXME support THP */ 2294 - if (!page || !page->mapping || PageTransCompound(page)) { 2295 - mpfn = 0; 2296 - goto next; 2297 - } 2298 - 2299 - /* 2300 - * By getting a reference on the page we pin it and that blocks 2301 - * any kind of migration. Side effect is that it "freezes" the 2302 - * pte. 2303 - * 2304 - * We drop this reference after isolating the page from the lru 2305 - * for non device page (device page are not on the lru and thus 2306 - * can't be dropped from it). 2307 - */ 2308 - get_page(page); 2309 - 2310 - /* 2311 - * Optimize for the common case where page is only mapped once 2312 - * in one process. If we can lock the page, then we can safely 2313 - * set up a special migration page table entry now. 
2314 - */ 2315 - if (trylock_page(page)) { 2316 - pte_t swp_pte; 2317 - 2318 - migrate->cpages++; 2319 - ptep_get_and_clear(mm, addr, ptep); 2320 - 2321 - /* Setup special migration page table entry */ 2322 - if (mpfn & MIGRATE_PFN_WRITE) 2323 - entry = make_writable_migration_entry( 2324 - page_to_pfn(page)); 2325 - else 2326 - entry = make_readable_migration_entry( 2327 - page_to_pfn(page)); 2328 - swp_pte = swp_entry_to_pte(entry); 2329 - if (pte_present(pte)) { 2330 - if (pte_soft_dirty(pte)) 2331 - swp_pte = pte_swp_mksoft_dirty(swp_pte); 2332 - if (pte_uffd_wp(pte)) 2333 - swp_pte = pte_swp_mkuffd_wp(swp_pte); 2334 - } else { 2335 - if (pte_swp_soft_dirty(pte)) 2336 - swp_pte = pte_swp_mksoft_dirty(swp_pte); 2337 - if (pte_swp_uffd_wp(pte)) 2338 - swp_pte = pte_swp_mkuffd_wp(swp_pte); 2339 - } 2340 - set_pte_at(mm, addr, ptep, swp_pte); 2341 - 2342 - /* 2343 - * This is like regular unmap: we remove the rmap and 2344 - * drop page refcount. Page won't be freed, as we took 2345 - * a reference just above. 2346 - */ 2347 - page_remove_rmap(page, false); 2348 - put_page(page); 2349 - 2350 - if (pte_present(pte)) 2351 - unmapped++; 2352 - } else { 2353 - put_page(page); 2354 - mpfn = 0; 2355 - } 2356 - 2357 - next: 2358 - migrate->dst[migrate->npages] = 0; 2359 - migrate->src[migrate->npages++] = mpfn; 2360 - } 2361 - arch_leave_lazy_mmu_mode(); 2362 - pte_unmap_unlock(ptep - 1, ptl); 2363 - 2364 - /* Only flush the TLB if we actually modified any entries */ 2365 - if (unmapped) 2366 - flush_tlb_range(walk->vma, start, end); 2367 - 2368 - return 0; 2369 - } 2370 - 2371 - static const struct mm_walk_ops migrate_vma_walk_ops = { 2372 - .pmd_entry = migrate_vma_collect_pmd, 2373 - .pte_hole = migrate_vma_collect_hole, 2374 - }; 2375 - 2376 - /* 2377 - * migrate_vma_collect() - collect pages over a range of virtual addresses 2378 - * @migrate: migrate struct containing all migration information 2379 - * 2380 - * This will walk the CPU page table. 
For each virtual address backed by a 2381 - * valid page, it updates the src array and takes a reference on the page, in 2382 - * order to pin the page until we lock it and unmap it. 2383 - */ 2384 - static void migrate_vma_collect(struct migrate_vma *migrate) 2385 - { 2386 - struct mmu_notifier_range range; 2387 - 2388 - /* 2389 - * Note that the pgmap_owner is passed to the mmu notifier callback so 2390 - * that the registered device driver can skip invalidating device 2391 - * private page mappings that won't be migrated. 2392 - */ 2393 - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, 2394 - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, 2395 - migrate->pgmap_owner); 2396 - mmu_notifier_invalidate_range_start(&range); 2397 - 2398 - walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, 2399 - &migrate_vma_walk_ops, migrate); 2400 - 2401 - mmu_notifier_invalidate_range_end(&range); 2402 - migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2403 - } 2404 - 2405 - /* 2406 - * migrate_vma_check_page() - check if page is pinned or not 2407 - * @page: struct page to check 2408 - * 2409 - * Pinned pages cannot be migrated. This is the same test as in 2410 - * folio_migrate_mapping(), except that here we allow migration of a 2411 - * ZONE_DEVICE page. 2412 - */ 2413 - static bool migrate_vma_check_page(struct page *page) 2414 - { 2415 - /* 2416 - * One extra ref because caller holds an extra reference, either from 2417 - * isolate_lru_page() for a regular page, or migrate_vma_collect() for 2418 - * a device page. 2419 - */ 2420 - int extra = 1; 2421 - 2422 - /* 2423 - * FIXME support THP (transparent huge page), it is bit more complex to 2424 - * check them than regular pages, because they can be mapped with a pmd 2425 - * or with a pte (split pte mapping). 
2426 - */ 2427 - if (PageCompound(page)) 2428 - return false; 2429 - 2430 - /* Page from ZONE_DEVICE have one extra reference */ 2431 - if (is_zone_device_page(page)) 2432 - extra++; 2433 - 2434 - /* For file back page */ 2435 - if (page_mapping(page)) 2436 - extra += 1 + page_has_private(page); 2437 - 2438 - if ((page_count(page) - extra) > page_mapcount(page)) 2439 - return false; 2440 - 2441 - return true; 2442 - } 2443 - 2444 - /* 2445 - * migrate_vma_unmap() - replace page mapping with special migration pte entry 2446 - * @migrate: migrate struct containing all migration information 2447 - * 2448 - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a 2449 - * special migration pte entry and check if it has been pinned. Pinned pages are 2450 - * restored because we cannot migrate them. 2451 - * 2452 - * This is the last step before we call the device driver callback to allocate 2453 - * destination memory and copy contents of original page over to new page. 2454 - */ 2455 - static void migrate_vma_unmap(struct migrate_vma *migrate) 2456 - { 2457 - const unsigned long npages = migrate->npages; 2458 - unsigned long i, restore = 0; 2459 - bool allow_drain = true; 2460 - 2461 - lru_add_drain(); 2462 - 2463 - for (i = 0; i < npages; i++) { 2464 - struct page *page = migrate_pfn_to_page(migrate->src[i]); 2465 - 2466 - if (!page) 2467 - continue; 2468 - 2469 - /* ZONE_DEVICE pages are not on LRU */ 2470 - if (!is_zone_device_page(page)) { 2471 - if (!PageLRU(page) && allow_drain) { 2472 - /* Drain CPU's pagevec */ 2473 - lru_add_drain_all(); 2474 - allow_drain = false; 2475 - } 2476 - 2477 - if (isolate_lru_page(page)) { 2478 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2479 - migrate->cpages--; 2480 - restore++; 2481 - continue; 2482 - } 2483 - 2484 - /* Drop the reference we took in collect */ 2485 - put_page(page); 2486 - } 2487 - 2488 - if (page_mapped(page)) 2489 - try_to_migrate(page, 0); 2490 - 2491 - if (page_mapped(page) || 
!migrate_vma_check_page(page)) { 2492 - if (!is_zone_device_page(page)) { 2493 - get_page(page); 2494 - putback_lru_page(page); 2495 - } 2496 - 2497 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2498 - migrate->cpages--; 2499 - restore++; 2500 - continue; 2501 - } 2502 - } 2503 - 2504 - for (i = 0; i < npages && restore; i++) { 2505 - struct page *page = migrate_pfn_to_page(migrate->src[i]); 2506 - 2507 - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2508 - continue; 2509 - 2510 - remove_migration_ptes(page, page, false); 2511 - 2512 - migrate->src[i] = 0; 2513 - unlock_page(page); 2514 - put_page(page); 2515 - restore--; 2516 - } 2517 - } 2518 - 2519 - /** 2520 - * migrate_vma_setup() - prepare to migrate a range of memory 2521 - * @args: contains the vma, start, and pfns arrays for the migration 2522 - * 2523 - * Returns: negative errno on failures, 0 when 0 or more pages were migrated 2524 - * without an error. 2525 - * 2526 - * Prepare to migrate a range of memory virtual address range by collecting all 2527 - * the pages backing each virtual address in the range, saving them inside the 2528 - * src array. Then lock those pages and unmap them. Once the pages are locked 2529 - * and unmapped, check whether each page is pinned or not. Pages that aren't 2530 - * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the 2531 - * corresponding src array entry. Then restores any pages that are pinned, by 2532 - * remapping and unlocking those pages. 2533 - * 2534 - * The caller should then allocate destination memory and copy source memory to 2535 - * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE 2536 - * flag set). Once these are allocated and copied, the caller must update each 2537 - * corresponding entry in the dst array with the pfn value of the destination 2538 - * page and with MIGRATE_PFN_VALID. Destination pages must be locked via 2539 - * lock_page(). 
2540 - * 2541 - * Note that the caller does not have to migrate all the pages that are marked 2542 - * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from 2543 - * device memory to system memory. If the caller cannot migrate a device page 2544 - * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe 2545 - * consequences for the userspace process, so it must be avoided if at all 2546 - * possible. 2547 - * 2548 - * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we 2549 - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus 2550 - * allowing the caller to allocate device memory for those unbacked virtual 2551 - * addresses. For this the caller simply has to allocate device memory and 2552 - * properly set the destination entry like for regular migration. Note that 2553 - * this can still fail, and thus inside the device driver you must check if the 2554 - * migration was successful for those entries after calling migrate_vma_pages(), 2555 - * just like for regular migration. 2556 - * 2557 - * After that, the callers must call migrate_vma_pages() to go over each entry 2558 - * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 2559 - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 2560 - * then migrate_vma_pages() to migrate struct page information from the source 2561 - * struct page to the destination struct page. If it fails to migrate the 2562 - * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the 2563 - * src array. 2564 - * 2565 - * At this point all successfully migrated pages have an entry in the src 2566 - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 2567 - * array entry with MIGRATE_PFN_VALID flag set. 2568 - * 2569 - * Once migrate_vma_pages() returns the caller may inspect which pages were 2570 - * successfully migrated, and which were not. 
Successfully migrated pages will 2571 - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 2572 - * 2573 - * It is safe to update device page table after migrate_vma_pages() because 2574 - * both destination and source page are still locked, and the mmap_lock is held 2575 - * in read mode (hence no one can unmap the range being migrated). 2576 - * 2577 - * Once the caller is done cleaning up things and updating its page table (if it 2578 - * chose to do so, this is not an obligation) it finally calls 2579 - * migrate_vma_finalize() to update the CPU page table to point to new pages 2580 - * for successfully migrated pages or otherwise restore the CPU page table to 2581 - * point to the original source pages. 2582 - */ 2583 - int migrate_vma_setup(struct migrate_vma *args) 2584 - { 2585 - long nr_pages = (args->end - args->start) >> PAGE_SHIFT; 2586 - 2587 - args->start &= PAGE_MASK; 2588 - args->end &= PAGE_MASK; 2589 - if (!args->vma || is_vm_hugetlb_page(args->vma) || 2590 - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) 2591 - return -EINVAL; 2592 - if (nr_pages <= 0) 2593 - return -EINVAL; 2594 - if (args->start < args->vma->vm_start || 2595 - args->start >= args->vma->vm_end) 2596 - return -EINVAL; 2597 - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) 2598 - return -EINVAL; 2599 - if (!args->src || !args->dst) 2600 - return -EINVAL; 2601 - 2602 - memset(args->src, 0, sizeof(*args->src) * nr_pages); 2603 - args->cpages = 0; 2604 - args->npages = 0; 2605 - 2606 - migrate_vma_collect(args); 2607 - 2608 - if (args->cpages) 2609 - migrate_vma_unmap(args); 2610 - 2611 - /* 2612 - * At this point pages are locked and unmapped, and thus they have 2613 - * stable content and can safely be copied to destination memory that 2614 - * is allocated by the drivers. 
2615 - */ 2616 - return 0; 2617 - 2618 - } 2619 - EXPORT_SYMBOL(migrate_vma_setup); 2620 - 2621 - /* 2622 - * This code closely matches the code in: 2623 - * __handle_mm_fault() 2624 - * handle_pte_fault() 2625 - * do_anonymous_page() 2626 - * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE 2627 - * private page. 2628 - */ 2629 - static void migrate_vma_insert_page(struct migrate_vma *migrate, 2630 - unsigned long addr, 2631 - struct page *page, 2632 - unsigned long *src) 2633 - { 2634 - struct vm_area_struct *vma = migrate->vma; 2635 - struct mm_struct *mm = vma->vm_mm; 2636 - bool flush = false; 2637 - spinlock_t *ptl; 2638 - pte_t entry; 2639 - pgd_t *pgdp; 2640 - p4d_t *p4dp; 2641 - pud_t *pudp; 2642 - pmd_t *pmdp; 2643 - pte_t *ptep; 2644 - 2645 - /* Only allow populating anonymous memory */ 2646 - if (!vma_is_anonymous(vma)) 2647 - goto abort; 2648 - 2649 - pgdp = pgd_offset(mm, addr); 2650 - p4dp = p4d_alloc(mm, pgdp, addr); 2651 - if (!p4dp) 2652 - goto abort; 2653 - pudp = pud_alloc(mm, p4dp, addr); 2654 - if (!pudp) 2655 - goto abort; 2656 - pmdp = pmd_alloc(mm, pudp, addr); 2657 - if (!pmdp) 2658 - goto abort; 2659 - 2660 - if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 2661 - goto abort; 2662 - 2663 - /* 2664 - * Use pte_alloc() instead of pte_alloc_map(). We can't run 2665 - * pte_offset_map() on pmds where a huge pmd might be created 2666 - * from a different thread. 2667 - * 2668 - * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when 2669 - * parallel threads are excluded by other means. 2670 - * 2671 - * Here we only have mmap_read_lock(mm). 
2672 - */ 2673 - if (pte_alloc(mm, pmdp)) 2674 - goto abort; 2675 - 2676 - /* See the comment in pte_alloc_one_map() */ 2677 - if (unlikely(pmd_trans_unstable(pmdp))) 2678 - goto abort; 2679 - 2680 - if (unlikely(anon_vma_prepare(vma))) 2681 - goto abort; 2682 - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) 2683 - goto abort; 2684 - 2685 - /* 2686 - * The memory barrier inside __SetPageUptodate makes sure that 2687 - * preceding stores to the page contents become visible before 2688 - * the set_pte_at() write. 2689 - */ 2690 - __SetPageUptodate(page); 2691 - 2692 - if (is_zone_device_page(page)) { 2693 - if (is_device_private_page(page)) { 2694 - swp_entry_t swp_entry; 2695 - 2696 - if (vma->vm_flags & VM_WRITE) 2697 - swp_entry = make_writable_device_private_entry( 2698 - page_to_pfn(page)); 2699 - else 2700 - swp_entry = make_readable_device_private_entry( 2701 - page_to_pfn(page)); 2702 - entry = swp_entry_to_pte(swp_entry); 2703 - } else { 2704 - /* 2705 - * For now we only support migrating to un-addressable 2706 - * device memory. 2707 - */ 2708 - pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); 2709 - goto abort; 2710 - } 2711 - } else { 2712 - entry = mk_pte(page, vma->vm_page_prot); 2713 - if (vma->vm_flags & VM_WRITE) 2714 - entry = pte_mkwrite(pte_mkdirty(entry)); 2715 - } 2716 - 2717 - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2718 - 2719 - if (check_stable_address_space(mm)) 2720 - goto unlock_abort; 2721 - 2722 - if (pte_present(*ptep)) { 2723 - unsigned long pfn = pte_pfn(*ptep); 2724 - 2725 - if (!is_zero_pfn(pfn)) 2726 - goto unlock_abort; 2727 - flush = true; 2728 - } else if (!pte_none(*ptep)) 2729 - goto unlock_abort; 2730 - 2731 - /* 2732 - * Check for userfaultfd but do not deliver the fault. Instead, 2733 - * just back off. 
2734 - */ 2735 - if (userfaultfd_missing(vma)) 2736 - goto unlock_abort; 2737 - 2738 - inc_mm_counter(mm, MM_ANONPAGES); 2739 - page_add_new_anon_rmap(page, vma, addr, false); 2740 - if (!is_zone_device_page(page)) 2741 - lru_cache_add_inactive_or_unevictable(page, vma); 2742 - get_page(page); 2743 - 2744 - if (flush) { 2745 - flush_cache_page(vma, addr, pte_pfn(*ptep)); 2746 - ptep_clear_flush_notify(vma, addr, ptep); 2747 - set_pte_at_notify(mm, addr, ptep, entry); 2748 - update_mmu_cache(vma, addr, ptep); 2749 - } else { 2750 - /* No need to invalidate - it was non-present before */ 2751 - set_pte_at(mm, addr, ptep, entry); 2752 - update_mmu_cache(vma, addr, ptep); 2753 - } 2754 - 2755 - pte_unmap_unlock(ptep, ptl); 2756 - *src = MIGRATE_PFN_MIGRATE; 2757 - return; 2758 - 2759 - unlock_abort: 2760 - pte_unmap_unlock(ptep, ptl); 2761 - abort: 2762 - *src &= ~MIGRATE_PFN_MIGRATE; 2763 - } 2764 - 2765 - /** 2766 - * migrate_vma_pages() - migrate meta-data from src page to dst page 2767 - * @migrate: migrate struct containing all migration information 2768 - * 2769 - * This migrates struct page meta-data from source struct page to destination 2770 - * struct page. This effectively finishes the migration from source page to the 2771 - * destination page. 
2772 - */ 2773 - void migrate_vma_pages(struct migrate_vma *migrate) 2774 - { 2775 - const unsigned long npages = migrate->npages; 2776 - const unsigned long start = migrate->start; 2777 - struct mmu_notifier_range range; 2778 - unsigned long addr, i; 2779 - bool notified = false; 2780 - 2781 - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 2782 - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2783 - struct page *page = migrate_pfn_to_page(migrate->src[i]); 2784 - struct address_space *mapping; 2785 - int r; 2786 - 2787 - if (!newpage) { 2788 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2789 - continue; 2790 - } 2791 - 2792 - if (!page) { 2793 - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 2794 - continue; 2795 - if (!notified) { 2796 - notified = true; 2797 - 2798 - mmu_notifier_range_init_owner(&range, 2799 - MMU_NOTIFY_MIGRATE, 0, migrate->vma, 2800 - migrate->vma->vm_mm, addr, migrate->end, 2801 - migrate->pgmap_owner); 2802 - mmu_notifier_invalidate_range_start(&range); 2803 - } 2804 - migrate_vma_insert_page(migrate, addr, newpage, 2805 - &migrate->src[i]); 2806 - continue; 2807 - } 2808 - 2809 - mapping = page_mapping(page); 2810 - 2811 - if (is_zone_device_page(newpage)) { 2812 - if (is_device_private_page(newpage)) { 2813 - /* 2814 - * For now only support private anonymous when 2815 - * migrating to un-addressable device memory. 2816 - */ 2817 - if (mapping) { 2818 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2819 - continue; 2820 - } 2821 - } else { 2822 - /* 2823 - * Other types of ZONE_DEVICE page are not 2824 - * supported. 
2825 - */ 2826 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2827 - continue; 2828 - } 2829 - } 2830 - 2831 - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 2832 - if (r != MIGRATEPAGE_SUCCESS) 2833 - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 2834 - } 2835 - 2836 - /* 2837 - * No need to double call mmu_notifier->invalidate_range() callback as 2838 - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 2839 - * did already call it. 2840 - */ 2841 - if (notified) 2842 - mmu_notifier_invalidate_range_only_end(&range); 2843 - } 2844 - EXPORT_SYMBOL(migrate_vma_pages); 2845 - 2846 - /** 2847 - * migrate_vma_finalize() - restore CPU page table entry 2848 - * @migrate: migrate struct containing all migration information 2849 - * 2850 - * This replaces the special migration pte entry with either a mapping to the 2851 - * new page if migration was successful for that page, or to the original page 2852 - * otherwise. 2853 - * 2854 - * This also unlocks the pages and puts them back on the lru, or drops the extra 2855 - * refcount, for device pages. 
2856 - */ 2857 - void migrate_vma_finalize(struct migrate_vma *migrate) 2858 - { 2859 - const unsigned long npages = migrate->npages; 2860 - unsigned long i; 2861 - 2862 - for (i = 0; i < npages; i++) { 2863 - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 2864 - struct page *page = migrate_pfn_to_page(migrate->src[i]); 2865 - 2866 - if (!page) { 2867 - if (newpage) { 2868 - unlock_page(newpage); 2869 - put_page(newpage); 2870 - } 2871 - continue; 2872 - } 2873 - 2874 - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 2875 - if (newpage) { 2876 - unlock_page(newpage); 2877 - put_page(newpage); 2878 - } 2879 - newpage = page; 2880 - } 2881 - 2882 - remove_migration_ptes(page, newpage, false); 2883 - unlock_page(page); 2884 - 2885 - if (is_zone_device_page(page)) 2886 - put_page(page); 2887 - else 2888 - putback_lru_page(page); 2889 - 2890 - if (newpage != page) { 2891 - unlock_page(newpage); 2892 - if (is_zone_device_page(newpage)) 2893 - put_page(newpage); 2894 - else 2895 - putback_lru_page(newpage); 2896 - } 2897 - } 2898 - } 2899 - EXPORT_SYMBOL(migrate_vma_finalize); 2900 - #endif /* CONFIG_DEVICE_PRIVATE */ 2901 2149 2902 2150 /* 2903 2151 * node_demotion[] example:

+773

mm/migrate_device.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Device Memory Migration functionality. 4 + * 5 + * Originally written by Jérôme Glisse. 6 + */ 7 + #include <linux/export.h> 8 + #include <linux/memremap.h> 9 + #include <linux/migrate.h> 10 + #include <linux/mm_inline.h> 11 + #include <linux/mmu_notifier.h> 12 + #include <linux/oom.h> 13 + #include <linux/pagewalk.h> 14 + #include <linux/rmap.h> 15 + #include <linux/swapops.h> 16 + #include <asm/tlbflush.h> 17 + #include "internal.h" 18 + 19 + static int migrate_vma_collect_skip(unsigned long start, 20 + unsigned long end, 21 + struct mm_walk *walk) 22 + { 23 + struct migrate_vma *migrate = walk->private; 24 + unsigned long addr; 25 + 26 + for (addr = start; addr < end; addr += PAGE_SIZE) { 27 + migrate->dst[migrate->npages] = 0; 28 + migrate->src[migrate->npages++] = 0; 29 + } 30 + 31 + return 0; 32 + } 33 + 34 + static int migrate_vma_collect_hole(unsigned long start, 35 + unsigned long end, 36 + __always_unused int depth, 37 + struct mm_walk *walk) 38 + { 39 + struct migrate_vma *migrate = walk->private; 40 + unsigned long addr; 41 + 42 + /* Only allow populating anonymous memory. 
*/ 43 + if (!vma_is_anonymous(walk->vma)) 44 + return migrate_vma_collect_skip(start, end, walk); 45 + 46 + for (addr = start; addr < end; addr += PAGE_SIZE) { 47 + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; 48 + migrate->dst[migrate->npages] = 0; 49 + migrate->npages++; 50 + migrate->cpages++; 51 + } 52 + 53 + return 0; 54 + } 55 + 56 + static int migrate_vma_collect_pmd(pmd_t *pmdp, 57 + unsigned long start, 58 + unsigned long end, 59 + struct mm_walk *walk) 60 + { 61 + struct migrate_vma *migrate = walk->private; 62 + struct vm_area_struct *vma = walk->vma; 63 + struct mm_struct *mm = vma->vm_mm; 64 + unsigned long addr = start, unmapped = 0; 65 + spinlock_t *ptl; 66 + pte_t *ptep; 67 + 68 + again: 69 + if (pmd_none(*pmdp)) 70 + return migrate_vma_collect_hole(start, end, -1, walk); 71 + 72 + if (pmd_trans_huge(*pmdp)) { 73 + struct page *page; 74 + 75 + ptl = pmd_lock(mm, pmdp); 76 + if (unlikely(!pmd_trans_huge(*pmdp))) { 77 + spin_unlock(ptl); 78 + goto again; 79 + } 80 + 81 + page = pmd_page(*pmdp); 82 + if (is_huge_zero_page(page)) { 83 + spin_unlock(ptl); 84 + split_huge_pmd(vma, pmdp, addr); 85 + if (pmd_trans_unstable(pmdp)) 86 + return migrate_vma_collect_skip(start, end, 87 + walk); 88 + } else { 89 + int ret; 90 + 91 + get_page(page); 92 + spin_unlock(ptl); 93 + if (unlikely(!trylock_page(page))) 94 + return migrate_vma_collect_skip(start, end, 95 + walk); 96 + ret = split_huge_page(page); 97 + unlock_page(page); 98 + put_page(page); 99 + if (ret) 100 + return migrate_vma_collect_skip(start, end, 101 + walk); 102 + if (pmd_none(*pmdp)) 103 + return migrate_vma_collect_hole(start, end, -1, 104 + walk); 105 + } 106 + } 107 + 108 + if (unlikely(pmd_bad(*pmdp))) 109 + return migrate_vma_collect_skip(start, end, walk); 110 + 111 + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 112 + arch_enter_lazy_mmu_mode(); 113 + 114 + for (; addr < end; addr += PAGE_SIZE, ptep++) { 115 + unsigned long mpfn = 0, pfn; 116 + struct page *page; 117 + 
swp_entry_t entry; 118 + pte_t pte; 119 + 120 + pte = *ptep; 121 + 122 + if (pte_none(pte)) { 123 + if (vma_is_anonymous(vma)) { 124 + mpfn = MIGRATE_PFN_MIGRATE; 125 + migrate->cpages++; 126 + } 127 + goto next; 128 + } 129 + 130 + if (!pte_present(pte)) { 131 + /* 132 + * Only care about unaddressable device page special 133 + * page table entry. Other special swap entries are not 134 + * migratable, and we ignore regular swapped page. 135 + */ 136 + entry = pte_to_swp_entry(pte); 137 + if (!is_device_private_entry(entry)) 138 + goto next; 139 + 140 + page = pfn_swap_entry_to_page(entry); 141 + if (!(migrate->flags & 142 + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || 143 + page->pgmap->owner != migrate->pgmap_owner) 144 + goto next; 145 + 146 + mpfn = migrate_pfn(page_to_pfn(page)) | 147 + MIGRATE_PFN_MIGRATE; 148 + if (is_writable_device_private_entry(entry)) 149 + mpfn |= MIGRATE_PFN_WRITE; 150 + } else { 151 + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) 152 + goto next; 153 + pfn = pte_pfn(pte); 154 + if (is_zero_pfn(pfn)) { 155 + mpfn = MIGRATE_PFN_MIGRATE; 156 + migrate->cpages++; 157 + goto next; 158 + } 159 + page = vm_normal_page(migrate->vma, addr, pte); 160 + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; 161 + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; 162 + } 163 + 164 + /* FIXME support THP */ 165 + if (!page || !page->mapping || PageTransCompound(page)) { 166 + mpfn = 0; 167 + goto next; 168 + } 169 + 170 + /* 171 + * By getting a reference on the page we pin it and that blocks 172 + * any kind of migration. Side effect is that it "freezes" the 173 + * pte. 174 + * 175 + * We drop this reference after isolating the page from the lru 176 + * for non device page (device page are not on the lru and thus 177 + * can't be dropped from it). 178 + */ 179 + get_page(page); 180 + 181 + /* 182 + * Optimize for the common case where page is only mapped once 183 + * in one process. 
If we can lock the page, then we can safely 184 + * set up a special migration page table entry now. 185 + */ 186 + if (trylock_page(page)) { 187 + pte_t swp_pte; 188 + 189 + migrate->cpages++; 190 + ptep_get_and_clear(mm, addr, ptep); 191 + 192 + /* Setup special migration page table entry */ 193 + if (mpfn & MIGRATE_PFN_WRITE) 194 + entry = make_writable_migration_entry( 195 + page_to_pfn(page)); 196 + else 197 + entry = make_readable_migration_entry( 198 + page_to_pfn(page)); 199 + swp_pte = swp_entry_to_pte(entry); 200 + if (pte_present(pte)) { 201 + if (pte_soft_dirty(pte)) 202 + swp_pte = pte_swp_mksoft_dirty(swp_pte); 203 + if (pte_uffd_wp(pte)) 204 + swp_pte = pte_swp_mkuffd_wp(swp_pte); 205 + } else { 206 + if (pte_swp_soft_dirty(pte)) 207 + swp_pte = pte_swp_mksoft_dirty(swp_pte); 208 + if (pte_swp_uffd_wp(pte)) 209 + swp_pte = pte_swp_mkuffd_wp(swp_pte); 210 + } 211 + set_pte_at(mm, addr, ptep, swp_pte); 212 + 213 + /* 214 + * This is like regular unmap: we remove the rmap and 215 + * drop page refcount. Page won't be freed, as we took 216 + * a reference just above. 
217 + */ 218 + page_remove_rmap(page, vma, false); 219 + put_page(page); 220 + 221 + if (pte_present(pte)) 222 + unmapped++; 223 + } else { 224 + put_page(page); 225 + mpfn = 0; 226 + } 227 + 228 + next: 229 + migrate->dst[migrate->npages] = 0; 230 + migrate->src[migrate->npages++] = mpfn; 231 + } 232 + arch_leave_lazy_mmu_mode(); 233 + pte_unmap_unlock(ptep - 1, ptl); 234 + 235 + /* Only flush the TLB if we actually modified any entries */ 236 + if (unmapped) 237 + flush_tlb_range(walk->vma, start, end); 238 + 239 + return 0; 240 + } 241 + 242 + static const struct mm_walk_ops migrate_vma_walk_ops = { 243 + .pmd_entry = migrate_vma_collect_pmd, 244 + .pte_hole = migrate_vma_collect_hole, 245 + }; 246 + 247 + /* 248 + * migrate_vma_collect() - collect pages over a range of virtual addresses 249 + * @migrate: migrate struct containing all migration information 250 + * 251 + * This will walk the CPU page table. For each virtual address backed by a 252 + * valid page, it updates the src array and takes a reference on the page, in 253 + * order to pin the page until we lock it and unmap it. 254 + */ 255 + static void migrate_vma_collect(struct migrate_vma *migrate) 256 + { 257 + struct mmu_notifier_range range; 258 + 259 + /* 260 + * Note that the pgmap_owner is passed to the mmu notifier callback so 261 + * that the registered device driver can skip invalidating device 262 + * private page mappings that won't be migrated. 
263 + */ 264 + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, 265 + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, 266 + migrate->pgmap_owner); 267 + mmu_notifier_invalidate_range_start(&range); 268 + 269 + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, 270 + &migrate_vma_walk_ops, migrate); 271 + 272 + mmu_notifier_invalidate_range_end(&range); 273 + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 274 + } 275 + 276 + /* 277 + * migrate_vma_check_page() - check if page is pinned or not 278 + * @page: struct page to check 279 + * 280 + * Pinned pages cannot be migrated. This is the same test as in 281 + * folio_migrate_mapping(), except that here we allow migration of a 282 + * ZONE_DEVICE page. 283 + */ 284 + static bool migrate_vma_check_page(struct page *page) 285 + { 286 + /* 287 + * One extra ref because caller holds an extra reference, either from 288 + * isolate_lru_page() for a regular page, or migrate_vma_collect() for 289 + * a device page. 290 + */ 291 + int extra = 1; 292 + 293 + /* 294 + * FIXME support THP (transparent huge page), it is bit more complex to 295 + * check them than regular pages, because they can be mapped with a pmd 296 + * or with a pte (split pte mapping). 
297 + */ 298 + if (PageCompound(page)) 299 + return false; 300 + 301 + /* Page from ZONE_DEVICE have one extra reference */ 302 + if (is_zone_device_page(page)) 303 + extra++; 304 + 305 + /* For file back page */ 306 + if (page_mapping(page)) 307 + extra += 1 + page_has_private(page); 308 + 309 + if ((page_count(page) - extra) > page_mapcount(page)) 310 + return false; 311 + 312 + return true; 313 + } 314 + 315 + /* 316 + * migrate_vma_unmap() - replace page mapping with special migration pte entry 317 + * @migrate: migrate struct containing all migration information 318 + * 319 + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a 320 + * special migration pte entry and check if it has been pinned. Pinned pages are 321 + * restored because we cannot migrate them. 322 + * 323 + * This is the last step before we call the device driver callback to allocate 324 + * destination memory and copy contents of original page over to new page. 325 + */ 326 + static void migrate_vma_unmap(struct migrate_vma *migrate) 327 + { 328 + const unsigned long npages = migrate->npages; 329 + unsigned long i, restore = 0; 330 + bool allow_drain = true; 331 + 332 + lru_add_drain(); 333 + 334 + for (i = 0; i < npages; i++) { 335 + struct page *page = migrate_pfn_to_page(migrate->src[i]); 336 + struct folio *folio; 337 + 338 + if (!page) 339 + continue; 340 + 341 + /* ZONE_DEVICE pages are not on LRU */ 342 + if (!is_zone_device_page(page)) { 343 + if (!PageLRU(page) && allow_drain) { 344 + /* Drain CPU's pagevec */ 345 + lru_add_drain_all(); 346 + allow_drain = false; 347 + } 348 + 349 + if (isolate_lru_page(page)) { 350 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 351 + migrate->cpages--; 352 + restore++; 353 + continue; 354 + } 355 + 356 + /* Drop the reference we took in collect */ 357 + put_page(page); 358 + } 359 + 360 + folio = page_folio(page); 361 + if (folio_mapped(folio)) 362 + try_to_migrate(folio, 0); 363 + 364 + if (page_mapped(page) || 
!migrate_vma_check_page(page)) { 365 + if (!is_zone_device_page(page)) { 366 + get_page(page); 367 + putback_lru_page(page); 368 + } 369 + 370 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 371 + migrate->cpages--; 372 + restore++; 373 + continue; 374 + } 375 + } 376 + 377 + for (i = 0; i < npages && restore; i++) { 378 + struct page *page = migrate_pfn_to_page(migrate->src[i]); 379 + struct folio *folio; 380 + 381 + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) 382 + continue; 383 + 384 + folio = page_folio(page); 385 + remove_migration_ptes(folio, folio, false); 386 + 387 + migrate->src[i] = 0; 388 + folio_unlock(folio); 389 + folio_put(folio); 390 + restore--; 391 + } 392 + } 393 + 394 + /** 395 + * migrate_vma_setup() - prepare to migrate a range of memory 396 + * @args: contains the vma, start, and pfns arrays for the migration 397 + * 398 + * Returns: negative errno on failures, 0 when 0 or more pages were migrated 399 + * without an error. 400 + * 401 + * Prepare to migrate a range of memory virtual address range by collecting all 402 + * the pages backing each virtual address in the range, saving them inside the 403 + * src array. Then lock those pages and unmap them. Once the pages are locked 404 + * and unmapped, check whether each page is pinned or not. Pages that aren't 405 + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the 406 + * corresponding src array entry. Then restores any pages that are pinned, by 407 + * remapping and unlocking those pages. 408 + * 409 + * The caller should then allocate destination memory and copy source memory to 410 + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE 411 + * flag set). Once these are allocated and copied, the caller must update each 412 + * corresponding entry in the dst array with the pfn value of the destination 413 + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via 414 + * lock_page(). 
415 + * 416 + * Note that the caller does not have to migrate all the pages that are marked 417 + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from 418 + * device memory to system memory. If the caller cannot migrate a device page 419 + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe 420 + * consequences for the userspace process, so it must be avoided if at all 421 + * possible. 422 + * 423 + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we 424 + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus 425 + * allowing the caller to allocate device memory for those unbacked virtual 426 + * addresses. For this the caller simply has to allocate device memory and 427 + * properly set the destination entry like for regular migration. Note that 428 + * this can still fail, and thus inside the device driver you must check if the 429 + * migration was successful for those entries after calling migrate_vma_pages(), 430 + * just like for regular migration. 431 + * 432 + * After that, the callers must call migrate_vma_pages() to go over each entry 433 + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag 434 + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, 435 + * then migrate_vma_pages() to migrate struct page information from the source 436 + * struct page to the destination struct page. If it fails to migrate the 437 + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the 438 + * src array. 439 + * 440 + * At this point all successfully migrated pages have an entry in the src 441 + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst 442 + * array entry with MIGRATE_PFN_VALID flag set. 443 + * 444 + * Once migrate_vma_pages() returns the caller may inspect which pages were 445 + * successfully migrated, and which were not. 
Successfully migrated pages will 446 + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 447 + * 448 + * It is safe to update device page table after migrate_vma_pages() because 449 + * both destination and source page are still locked, and the mmap_lock is held 450 + * in read mode (hence no one can unmap the range being migrated). 451 + * 452 + * Once the caller is done cleaning up things and updating its page table (if it 453 + * chose to do so, this is not an obligation) it finally calls 454 + * migrate_vma_finalize() to update the CPU page table to point to new pages 455 + * for successfully migrated pages or otherwise restore the CPU page table to 456 + * point to the original source pages. 457 + */ 458 + int migrate_vma_setup(struct migrate_vma *args) 459 + { 460 + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; 461 + 462 + args->start &= PAGE_MASK; 463 + args->end &= PAGE_MASK; 464 + if (!args->vma || is_vm_hugetlb_page(args->vma) || 465 + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) 466 + return -EINVAL; 467 + if (nr_pages <= 0) 468 + return -EINVAL; 469 + if (args->start < args->vma->vm_start || 470 + args->start >= args->vma->vm_end) 471 + return -EINVAL; 472 + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) 473 + return -EINVAL; 474 + if (!args->src || !args->dst) 475 + return -EINVAL; 476 + 477 + memset(args->src, 0, sizeof(*args->src) * nr_pages); 478 + args->cpages = 0; 479 + args->npages = 0; 480 + 481 + migrate_vma_collect(args); 482 + 483 + if (args->cpages) 484 + migrate_vma_unmap(args); 485 + 486 + /* 487 + * At this point pages are locked and unmapped, and thus they have 488 + * stable content and can safely be copied to destination memory that 489 + * is allocated by the drivers. 
490 + */ 491 + return 0; 492 + 493 + } 494 + EXPORT_SYMBOL(migrate_vma_setup); 495 + 496 + /* 497 + * This code closely matches the code in: 498 + * __handle_mm_fault() 499 + * handle_pte_fault() 500 + * do_anonymous_page() 501 + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE 502 + * private page. 503 + */ 504 + static void migrate_vma_insert_page(struct migrate_vma *migrate, 505 + unsigned long addr, 506 + struct page *page, 507 + unsigned long *src) 508 + { 509 + struct vm_area_struct *vma = migrate->vma; 510 + struct mm_struct *mm = vma->vm_mm; 511 + bool flush = false; 512 + spinlock_t *ptl; 513 + pte_t entry; 514 + pgd_t *pgdp; 515 + p4d_t *p4dp; 516 + pud_t *pudp; 517 + pmd_t *pmdp; 518 + pte_t *ptep; 519 + 520 + /* Only allow populating anonymous memory */ 521 + if (!vma_is_anonymous(vma)) 522 + goto abort; 523 + 524 + pgdp = pgd_offset(mm, addr); 525 + p4dp = p4d_alloc(mm, pgdp, addr); 526 + if (!p4dp) 527 + goto abort; 528 + pudp = pud_alloc(mm, p4dp, addr); 529 + if (!pudp) 530 + goto abort; 531 + pmdp = pmd_alloc(mm, pudp, addr); 532 + if (!pmdp) 533 + goto abort; 534 + 535 + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) 536 + goto abort; 537 + 538 + /* 539 + * Use pte_alloc() instead of pte_alloc_map(). We can't run 540 + * pte_offset_map() on pmds where a huge pmd might be created 541 + * from a different thread. 542 + * 543 + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when 544 + * parallel threads are excluded by other means. 545 + * 546 + * Here we only have mmap_read_lock(mm). 
547 + */ 548 + if (pte_alloc(mm, pmdp)) 549 + goto abort; 550 + 551 + /* See the comment in pte_alloc_one_map() */ 552 + if (unlikely(pmd_trans_unstable(pmdp))) 553 + goto abort; 554 + 555 + if (unlikely(anon_vma_prepare(vma))) 556 + goto abort; 557 + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) 558 + goto abort; 559 + 560 + /* 561 + * The memory barrier inside __SetPageUptodate makes sure that 562 + * preceding stores to the page contents become visible before 563 + * the set_pte_at() write. 564 + */ 565 + __SetPageUptodate(page); 566 + 567 + if (is_device_private_page(page)) { 568 + swp_entry_t swp_entry; 569 + 570 + if (vma->vm_flags & VM_WRITE) 571 + swp_entry = make_writable_device_private_entry( 572 + page_to_pfn(page)); 573 + else 574 + swp_entry = make_readable_device_private_entry( 575 + page_to_pfn(page)); 576 + entry = swp_entry_to_pte(swp_entry); 577 + } else { 578 + /* 579 + * For now we only support migrating to un-addressable device 580 + * memory. 581 + */ 582 + if (is_zone_device_page(page)) { 583 + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); 584 + goto abort; 585 + } 586 + entry = mk_pte(page, vma->vm_page_prot); 587 + if (vma->vm_flags & VM_WRITE) 588 + entry = pte_mkwrite(pte_mkdirty(entry)); 589 + } 590 + 591 + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 592 + 593 + if (check_stable_address_space(mm)) 594 + goto unlock_abort; 595 + 596 + if (pte_present(*ptep)) { 597 + unsigned long pfn = pte_pfn(*ptep); 598 + 599 + if (!is_zero_pfn(pfn)) 600 + goto unlock_abort; 601 + flush = true; 602 + } else if (!pte_none(*ptep)) 603 + goto unlock_abort; 604 + 605 + /* 606 + * Check for userfaultfd but do not deliver the fault. Instead, 607 + * just back off. 
608 + */ 609 + if (userfaultfd_missing(vma)) 610 + goto unlock_abort; 611 + 612 + inc_mm_counter(mm, MM_ANONPAGES); 613 + page_add_new_anon_rmap(page, vma, addr, false); 614 + if (!is_zone_device_page(page)) 615 + lru_cache_add_inactive_or_unevictable(page, vma); 616 + get_page(page); 617 + 618 + if (flush) { 619 + flush_cache_page(vma, addr, pte_pfn(*ptep)); 620 + ptep_clear_flush_notify(vma, addr, ptep); 621 + set_pte_at_notify(mm, addr, ptep, entry); 622 + update_mmu_cache(vma, addr, ptep); 623 + } else { 624 + /* No need to invalidate - it was non-present before */ 625 + set_pte_at(mm, addr, ptep, entry); 626 + update_mmu_cache(vma, addr, ptep); 627 + } 628 + 629 + pte_unmap_unlock(ptep, ptl); 630 + *src = MIGRATE_PFN_MIGRATE; 631 + return; 632 + 633 + unlock_abort: 634 + pte_unmap_unlock(ptep, ptl); 635 + abort: 636 + *src &= ~MIGRATE_PFN_MIGRATE; 637 + } 638 + 639 + /** 640 + * migrate_vma_pages() - migrate meta-data from src page to dst page 641 + * @migrate: migrate struct containing all migration information 642 + * 643 + * This migrates struct page meta-data from source struct page to destination 644 + * struct page. This effectively finishes the migration from source page to the 645 + * destination page. 
646 + */ 647 + void migrate_vma_pages(struct migrate_vma *migrate) 648 + { 649 + const unsigned long npages = migrate->npages; 650 + const unsigned long start = migrate->start; 651 + struct mmu_notifier_range range; 652 + unsigned long addr, i; 653 + bool notified = false; 654 + 655 + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { 656 + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 657 + struct page *page = migrate_pfn_to_page(migrate->src[i]); 658 + struct address_space *mapping; 659 + int r; 660 + 661 + if (!newpage) { 662 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 663 + continue; 664 + } 665 + 666 + if (!page) { 667 + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) 668 + continue; 669 + if (!notified) { 670 + notified = true; 671 + 672 + mmu_notifier_range_init_owner(&range, 673 + MMU_NOTIFY_MIGRATE, 0, migrate->vma, 674 + migrate->vma->vm_mm, addr, migrate->end, 675 + migrate->pgmap_owner); 676 + mmu_notifier_invalidate_range_start(&range); 677 + } 678 + migrate_vma_insert_page(migrate, addr, newpage, 679 + &migrate->src[i]); 680 + continue; 681 + } 682 + 683 + mapping = page_mapping(page); 684 + 685 + if (is_device_private_page(newpage)) { 686 + /* 687 + * For now only support private anonymous when migrating 688 + * to un-addressable device memory. 689 + */ 690 + if (mapping) { 691 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 692 + continue; 693 + } 694 + } else if (is_zone_device_page(newpage)) { 695 + /* 696 + * Other types of ZONE_DEVICE page are not supported. 697 + */ 698 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 699 + continue; 700 + } 701 + 702 + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); 703 + if (r != MIGRATEPAGE_SUCCESS) 704 + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; 705 + } 706 + 707 + /* 708 + * No need to double call mmu_notifier->invalidate_range() callback as 709 + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() 710 + * did already call it. 
711 + */ 712 + if (notified) 713 + mmu_notifier_invalidate_range_only_end(&range); 714 + } 715 + EXPORT_SYMBOL(migrate_vma_pages); 716 + 717 + /** 718 + * migrate_vma_finalize() - restore CPU page table entry 719 + * @migrate: migrate struct containing all migration information 720 + * 721 + * This replaces the special migration pte entry with either a mapping to the 722 + * new page if migration was successful for that page, or to the original page 723 + * otherwise. 724 + * 725 + * This also unlocks the pages and puts them back on the lru, or drops the extra 726 + * refcount, for device pages. 727 + */ 728 + void migrate_vma_finalize(struct migrate_vma *migrate) 729 + { 730 + const unsigned long npages = migrate->npages; 731 + unsigned long i; 732 + 733 + for (i = 0; i < npages; i++) { 734 + struct folio *dst, *src; 735 + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); 736 + struct page *page = migrate_pfn_to_page(migrate->src[i]); 737 + 738 + if (!page) { 739 + if (newpage) { 740 + unlock_page(newpage); 741 + put_page(newpage); 742 + } 743 + continue; 744 + } 745 + 746 + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { 747 + if (newpage) { 748 + unlock_page(newpage); 749 + put_page(newpage); 750 + } 751 + newpage = page; 752 + } 753 + 754 + src = page_folio(page); 755 + dst = page_folio(newpage); 756 + remove_migration_ptes(src, dst, false); 757 + folio_unlock(src); 758 + 759 + if (is_zone_device_page(page)) 760 + put_page(page); 761 + else 762 + putback_lru_page(page); 763 + 764 + if (newpage != page) { 765 + unlock_page(newpage); 766 + if (is_zone_device_page(newpage)) 767 + put_page(newpage); 768 + else 769 + putback_lru_page(newpage); 770 + } 771 + } 772 + } 773 + EXPORT_SYMBOL(migrate_vma_finalize);

+282 -388

mm/mlock.c

··· 14 14 #include <linux/swapops.h> 15 15 #include <linux/pagemap.h> 16 16 #include <linux/pagevec.h> 17 + #include <linux/pagewalk.h> 17 18 #include <linux/mempolicy.h> 18 19 #include <linux/syscalls.h> 19 20 #include <linux/sched.h> ··· 27 26 #include <linux/secretmem.h> 28 27 29 28 #include "internal.h" 29 + 30 + static DEFINE_PER_CPU(struct pagevec, mlock_pvec); 30 31 31 32 bool can_do_mlock(void) 32 33 { ··· 49 46 * be placed on the LRU "unevictable" list, rather than the [in]active lists. 50 47 * The unevictable list is an LRU sibling list to the [in]active lists. 51 48 * PageUnevictable is set to indicate the unevictable state. 52 - * 53 - * When lazy mlocking via vmscan, it is important to ensure that the 54 - * vma's VM_LOCKED status is not concurrently being modified, otherwise we 55 - * may have mlocked a page that is being munlocked. So lazy mlock must take 56 - * the mmap_lock for read, and verify that the vma really is locked 57 - * (see mm/rmap.c). 58 49 */ 59 50 60 - /* 61 - * LRU accounting for clear_page_mlock() 62 - */ 63 - void clear_page_mlock(struct page *page) 51 + static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) 64 52 { 65 - int nr_pages; 53 + /* There is nothing more we can do while it's off LRU */ 54 + if (!TestClearPageLRU(page)) 55 + return lruvec; 66 56 67 - if (!TestClearPageMlocked(page)) 68 - return; 57 + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); 69 58 70 - nr_pages = thp_nr_pages(page); 71 - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 72 - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); 73 - /* 74 - * The previous TestClearPageMlocked() corresponds to the smp_mb() 75 - * in __pagevec_lru_add_fn(). 76 - * 77 - * See __pagevec_lru_add_fn for more explanation. 78 - */ 79 - if (!isolate_lru_page(page)) { 80 - putback_lru_page(page); 81 - } else { 59 + if (unlikely(page_evictable(page))) { 82 60 /* 83 - * We lost the race. the page already moved to evictable list. 
61 + * This is a little surprising, but quite possible: 62 + * PageMlocked must have got cleared already by another CPU. 63 + * Could this page be on the Unevictable LRU? I'm not sure, 64 + * but move it now if so. 84 65 */ 85 - if (PageUnevictable(page)) 86 - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); 66 + if (PageUnevictable(page)) { 67 + del_page_from_lru_list(page, lruvec); 68 + ClearPageUnevictable(page); 69 + add_page_to_lru_list(page, lruvec); 70 + __count_vm_events(UNEVICTABLE_PGRESCUED, 71 + thp_nr_pages(page)); 72 + } 73 + goto out; 87 74 } 88 - } 89 75 90 - /* 91 - * Mark page as mlocked if not already. 92 - * If page on LRU, isolate and putback to move to unevictable list. 93 - */ 94 - void mlock_vma_page(struct page *page) 95 - { 96 - /* Serialize with page migration */ 97 - BUG_ON(!PageLocked(page)); 98 - 99 - VM_BUG_ON_PAGE(PageTail(page), page); 100 - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); 101 - 102 - if (!TestSetPageMlocked(page)) { 103 - int nr_pages = thp_nr_pages(page); 104 - 105 - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); 106 - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); 107 - if (!isolate_lru_page(page)) 108 - putback_lru_page(page); 76 + if (PageUnevictable(page)) { 77 + if (PageMlocked(page)) 78 + page->mlock_count++; 79 + goto out; 109 80 } 81 + 82 + del_page_from_lru_list(page, lruvec); 83 + ClearPageActive(page); 84 + SetPageUnevictable(page); 85 + page->mlock_count = !!PageMlocked(page); 86 + add_page_to_lru_list(page, lruvec); 87 + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); 88 + out: 89 + SetPageLRU(page); 90 + return lruvec; 110 91 } 111 92 112 - /* 113 - * Finish munlock after successful page isolation 114 - * 115 - * Page must be locked. This is a wrapper for page_mlock() 116 - * and putback_lru_page() with munlock accounting. 
117 - */ 118 - static void __munlock_isolated_page(struct page *page) 93 + static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) 119 94 { 120 - /* 121 - * Optimization: if the page was mapped just once, that's our mapping 122 - * and we don't need to check all the other vmas. 123 - */ 124 - if (page_mapcount(page) > 1) 125 - page_mlock(page); 95 + VM_BUG_ON_PAGE(PageLRU(page), page); 126 96 127 - /* Did try_to_unlock() succeed or punt? */ 128 - if (!PageMlocked(page)) 129 - count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); 97 + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); 130 98 131 - putback_lru_page(page); 99 + /* As above, this is a little surprising, but possible */ 100 + if (unlikely(page_evictable(page))) 101 + goto out; 102 + 103 + SetPageUnevictable(page); 104 + page->mlock_count = !!PageMlocked(page); 105 + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); 106 + out: 107 + add_page_to_lru_list(page, lruvec); 108 + SetPageLRU(page); 109 + return lruvec; 132 110 } 133 111 134 - /* 135 - * Accounting for page isolation fail during munlock 136 - * 137 - * Performs accounting when page isolation fails in munlock. There is nothing 138 - * else to do because it means some other task has already removed the page 139 - * from the LRU. putback_lru_page() will take care of removing the page from 140 - * the unevictable list, if necessary. vmscan [page_referenced()] will move 141 - * the page back to the unevictable list if some other vma has it mlocked. 
142 - */ 143 - static void __munlock_isolation_failed(struct page *page) 112 + static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) 144 113 { 145 114 int nr_pages = thp_nr_pages(page); 115 + bool isolated = false; 146 116 147 - if (PageUnevictable(page)) 148 - __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); 149 - else 150 - __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); 117 + if (!TestClearPageLRU(page)) 118 + goto munlock; 119 + 120 + isolated = true; 121 + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); 122 + 123 + if (PageUnevictable(page)) { 124 + /* Then mlock_count is maintained, but might undercount */ 125 + if (page->mlock_count) 126 + page->mlock_count--; 127 + if (page->mlock_count) 128 + goto out; 129 + } 130 + /* else assume that was the last mlock: reclaim will fix it if not */ 131 + 132 + munlock: 133 + if (TestClearPageMlocked(page)) { 134 + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 135 + if (isolated || !PageUnevictable(page)) 136 + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); 137 + else 138 + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); 139 + } 140 + 141 + /* page_evictable() has to be checked *after* clearing Mlocked */ 142 + if (isolated && PageUnevictable(page) && page_evictable(page)) { 143 + del_page_from_lru_list(page, lruvec); 144 + ClearPageUnevictable(page); 145 + add_page_to_lru_list(page, lruvec); 146 + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); 147 + } 148 + out: 149 + if (isolated) 150 + SetPageLRU(page); 151 + return lruvec; 152 + } 153 + 154 + /* 155 + * Flags held in the low bits of a struct page pointer on the mlock_pvec. 
156 + */ 157 + #define LRU_PAGE 0x1 158 + #define NEW_PAGE 0x2 159 + static inline struct page *mlock_lru(struct page *page) 160 + { 161 + return (struct page *)((unsigned long)page + LRU_PAGE); 162 + } 163 + 164 + static inline struct page *mlock_new(struct page *page) 165 + { 166 + return (struct page *)((unsigned long)page + NEW_PAGE); 167 + } 168 + 169 + /* 170 + * mlock_pagevec() is derived from pagevec_lru_move_fn(): 171 + * perhaps that can make use of such page pointer flags in future, 172 + * but for now just keep it for mlock. We could use three separate 173 + * pagevecs instead, but one feels better (munlocking a full pagevec 174 + * does not need to drain mlocking pagevecs first). 175 + */ 176 + static void mlock_pagevec(struct pagevec *pvec) 177 + { 178 + struct lruvec *lruvec = NULL; 179 + unsigned long mlock; 180 + struct page *page; 181 + int i; 182 + 183 + for (i = 0; i < pagevec_count(pvec); i++) { 184 + page = pvec->pages[i]; 185 + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); 186 + page = (struct page *)((unsigned long)page - mlock); 187 + pvec->pages[i] = page; 188 + 189 + if (mlock & LRU_PAGE) 190 + lruvec = __mlock_page(page, lruvec); 191 + else if (mlock & NEW_PAGE) 192 + lruvec = __mlock_new_page(page, lruvec); 193 + else 194 + lruvec = __munlock_page(page, lruvec); 195 + } 196 + 197 + if (lruvec) 198 + unlock_page_lruvec_irq(lruvec); 199 + release_pages(pvec->pages, pvec->nr); 200 + pagevec_reinit(pvec); 201 + } 202 + 203 + void mlock_page_drain(int cpu) 204 + { 205 + struct pagevec *pvec; 206 + 207 + pvec = &per_cpu(mlock_pvec, cpu); 208 + if (pagevec_count(pvec)) 209 + mlock_pagevec(pvec); 210 + } 211 + 212 + bool need_mlock_page_drain(int cpu) 213 + { 214 + return pagevec_count(&per_cpu(mlock_pvec, cpu)); 151 215 } 152 216 153 217 /** 154 - * munlock_vma_page - munlock a vma page 155 - * @page: page to be unlocked, either a normal page or THP page head 156 - * 157 - * returns the size of the page as a page mask (0 for normal 
page, 158 - * HPAGE_PMD_NR - 1 for THP head page) 159 - * 160 - * called from munlock()/munmap() path with page supposedly on the LRU. 161 - * When we munlock a page, because the vma where we found the page is being 162 - * munlock()ed or munmap()ed, we want to check whether other vmas hold the 163 - * page locked so that we can leave it on the unevictable lru list and not 164 - * bother vmscan with it. However, to walk the page's rmap list in 165 - * page_mlock() we must isolate the page from the LRU. If some other 166 - * task has removed the page from the LRU, we won't be able to do that. 167 - * So we clear the PageMlocked as we might not get another chance. If we 168 - * can't isolate the page, we leave it for putback_lru_page() and vmscan 169 - * [page_referenced()/try_to_unmap()] to deal with. 218 + * mlock_folio - mlock a folio already on (or temporarily off) LRU 219 + * @folio: folio to be mlocked. 170 220 */ 171 - unsigned int munlock_vma_page(struct page *page) 221 + void mlock_folio(struct folio *folio) 172 222 { 173 - int nr_pages; 223 + struct pagevec *pvec = &get_cpu_var(mlock_pvec); 174 224 175 - /* For page_mlock() and to serialize with page migration */ 176 - BUG_ON(!PageLocked(page)); 177 - VM_BUG_ON_PAGE(PageTail(page), page); 225 + if (!folio_test_set_mlocked(folio)) { 226 + int nr_pages = folio_nr_pages(folio); 178 227 179 - if (!TestClearPageMlocked(page)) { 180 - /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ 181 - return 0; 228 + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); 229 + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); 182 230 } 183 231 184 - nr_pages = thp_nr_pages(page); 185 - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 186 - 187 - if (!isolate_lru_page(page)) 188 - __munlock_isolated_page(page); 189 - else 190 - __munlock_isolation_failed(page); 191 - 192 - return nr_pages - 1; 232 + folio_get(folio); 233 + if (!pagevec_add(pvec, mlock_lru(&folio->page)) || 234 + folio_test_large(folio) || 
lru_cache_disabled()) 235 + mlock_pagevec(pvec); 236 + put_cpu_var(mlock_pvec); 193 237 } 194 238 195 - /* 196 - * convert get_user_pages() return value to posix mlock() error 239 + /** 240 + * mlock_new_page - mlock a newly allocated page not yet on LRU 241 + * @page: page to be mlocked, either a normal page or a THP head. 197 242 */ 198 - static int __mlock_posix_error_return(long retval) 243 + void mlock_new_page(struct page *page) 199 244 { 200 - if (retval == -EFAULT) 201 - retval = -ENOMEM; 202 - else if (retval == -ENOMEM) 203 - retval = -EAGAIN; 204 - return retval; 245 + struct pagevec *pvec = &get_cpu_var(mlock_pvec); 246 + int nr_pages = thp_nr_pages(page); 247 + 248 + SetPageMlocked(page); 249 + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); 250 + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); 251 + 252 + get_page(page); 253 + if (!pagevec_add(pvec, mlock_new(page)) || 254 + PageHead(page) || lru_cache_disabled()) 255 + mlock_pagevec(pvec); 256 + put_cpu_var(mlock_pvec); 205 257 } 206 258 207 - /* 208 - * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() 209 - * 210 - * The fast path is available only for evictable pages with single mapping. 211 - * Then we can bypass the per-cpu pvec and get better performance. 212 - * when mapcount > 1 we need page_mlock() which can fail. 213 - * when !page_evictable(), we need the full redo logic of putback_lru_page to 214 - * avoid leaving evictable page in unevictable list. 215 - * 216 - * In case of success, @page is added to @pvec and @pgrescued is incremented 217 - * in case that the page was previously unevictable. @page is also unlocked. 259 + /** 260 + * munlock_page - munlock a page 261 + * @page: page to be munlocked, either a normal page or a THP head. 
218 262 */ 219 - static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, 220 - int *pgrescued) 263 + void munlock_page(struct page *page) 221 264 { 222 - VM_BUG_ON_PAGE(PageLRU(page), page); 223 - VM_BUG_ON_PAGE(!PageLocked(page), page); 224 - 225 - if (page_mapcount(page) <= 1 && page_evictable(page)) { 226 - pagevec_add(pvec, page); 227 - if (TestClearPageUnevictable(page)) 228 - (*pgrescued)++; 229 - unlock_page(page); 230 - return true; 231 - } 232 - 233 - return false; 234 - } 235 - 236 - /* 237 - * Putback multiple evictable pages to the LRU 238 - * 239 - * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of 240 - * the pages might have meanwhile become unevictable but that is OK. 241 - */ 242 - static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) 243 - { 244 - count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); 245 - /* 246 - *__pagevec_lru_add() calls release_pages() so we don't call 247 - * put_page() explicitly 248 - */ 249 - __pagevec_lru_add(pvec); 250 - count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 251 - } 252 - 253 - /* 254 - * Munlock a batch of pages from the same zone 255 - * 256 - * The work is split to two main phases. First phase clears the Mlocked flag 257 - * and attempts to isolate the pages, all under a single zone lru lock. 258 - * The second phase finishes the munlock only for pages where isolation 259 - * succeeded. 260 - * 261 - * Note that the pagevec may be modified during the process. 
262 - */ 263 - static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) 264 - { 265 - int i; 266 - int nr = pagevec_count(pvec); 267 - int delta_munlocked = -nr; 268 - struct pagevec pvec_putback; 269 - struct lruvec *lruvec = NULL; 270 - int pgrescued = 0; 271 - 272 - pagevec_init(&pvec_putback); 273 - 274 - /* Phase 1: page isolation */ 275 - for (i = 0; i < nr; i++) { 276 - struct page *page = pvec->pages[i]; 277 - struct folio *folio = page_folio(page); 278 - 279 - if (TestClearPageMlocked(page)) { 280 - /* 281 - * We already have pin from follow_page_mask() 282 - * so we can spare the get_page() here. 283 - */ 284 - if (TestClearPageLRU(page)) { 285 - lruvec = folio_lruvec_relock_irq(folio, lruvec); 286 - del_page_from_lru_list(page, lruvec); 287 - continue; 288 - } else 289 - __munlock_isolation_failed(page); 290 - } else { 291 - delta_munlocked++; 292 - } 293 - 294 - /* 295 - * We won't be munlocking this page in the next phase 296 - * but we still need to release the follow_page_mask() 297 - * pin. We cannot do it under lru_lock however. If it's 298 - * the last pin, __page_cache_release() would deadlock. 299 - */ 300 - pagevec_add(&pvec_putback, pvec->pages[i]); 301 - pvec->pages[i] = NULL; 302 - } 303 - if (lruvec) { 304 - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 305 - unlock_page_lruvec_irq(lruvec); 306 - } else if (delta_munlocked) { 307 - mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); 308 - } 309 - 310 - /* Now we can release pins of pages that we are not munlocking */ 311 - pagevec_release(&pvec_putback); 312 - 313 - /* Phase 2: page munlock */ 314 - for (i = 0; i < nr; i++) { 315 - struct page *page = pvec->pages[i]; 316 - 317 - if (page) { 318 - lock_page(page); 319 - if (!__putback_lru_fast_prepare(page, &pvec_putback, 320 - &pgrescued)) { 321 - /* 322 - * Slow path. 
We don't want to lose the last 323 - * pin before unlock_page() 324 - */ 325 - get_page(page); /* for putback_lru_page() */ 326 - __munlock_isolated_page(page); 327 - unlock_page(page); 328 - put_page(page); /* from follow_page_mask() */ 329 - } 330 - } 331 - } 265 + struct pagevec *pvec = &get_cpu_var(mlock_pvec); 332 266 333 267 /* 334 - * Phase 3: page putback for pages that qualified for the fast path 335 - * This will also call put_page() to return pin from follow_page_mask() 268 + * TestClearPageMlocked(page) must be left to __munlock_page(), 269 + * which will check whether the page is multiply mlocked. 336 270 */ 337 - if (pagevec_count(&pvec_putback)) 338 - __putback_lru_fast(&pvec_putback, pgrescued); 271 + 272 + get_page(page); 273 + if (!pagevec_add(pvec, page) || 274 + PageHead(page) || lru_cache_disabled()) 275 + mlock_pagevec(pvec); 276 + put_cpu_var(mlock_pvec); 339 277 } 340 278 341 - /* 342 - * Fill up pagevec for __munlock_pagevec using pte walk 343 - * 344 - * The function expects that the struct page corresponding to @start address is 345 - * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. 346 - * 347 - * The rest of @pvec is filled by subsequent pages within the same pmd and same 348 - * zone, as long as the pte's are present and vm_normal_page() succeeds. These 349 - * pages also get pinned. 350 - * 351 - * Returns the address of the next page that should be scanned. This equals 352 - * @start + PAGE_SIZE when no page could be added by the pte walk. 
353 - */ 354 - static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, 355 - struct vm_area_struct *vma, struct zone *zone, 356 - unsigned long start, unsigned long end) 279 + static int mlock_pte_range(pmd_t *pmd, unsigned long addr, 280 + unsigned long end, struct mm_walk *walk) 281 + 357 282 { 358 - pte_t *pte; 283 + struct vm_area_struct *vma = walk->vma; 359 284 spinlock_t *ptl; 285 + pte_t *start_pte, *pte; 286 + struct page *page; 360 287 361 - /* 362 - * Initialize pte walk starting at the already pinned page where we 363 - * are sure that there is a pte, as it was pinned under the same 364 - * mmap_lock write op. 365 - */ 366 - pte = get_locked_pte(vma->vm_mm, start, &ptl); 367 - /* Make sure we do not cross the page table boundary */ 368 - end = pgd_addr_end(start, end); 369 - end = p4d_addr_end(start, end); 370 - end = pud_addr_end(start, end); 371 - end = pmd_addr_end(start, end); 372 - 373 - /* The page next to the pinned page is the first we will try to get */ 374 - start += PAGE_SIZE; 375 - while (start < end) { 376 - struct page *page = NULL; 377 - pte++; 378 - if (pte_present(*pte)) 379 - page = vm_normal_page(vma, start, *pte); 380 - /* 381 - * Break if page could not be obtained or the page's node+zone does not 382 - * match 383 - */ 384 - if (!page || page_zone(page) != zone) 385 - break; 386 - 387 - /* 388 - * Do not use pagevec for PTE-mapped THP, 389 - * munlock_vma_pages_range() will handle them. 
390 - */ 391 - if (PageTransCompound(page)) 392 - break; 393 - 394 - get_page(page); 395 - /* 396 - * Increase the address that will be returned *before* the 397 - * eventual break due to pvec becoming full by adding the page 398 - */ 399 - start += PAGE_SIZE; 400 - if (pagevec_add(pvec, page) == 0) 401 - break; 288 + ptl = pmd_trans_huge_lock(pmd, vma); 289 + if (ptl) { 290 + if (!pmd_present(*pmd)) 291 + goto out; 292 + if (is_huge_zero_pmd(*pmd)) 293 + goto out; 294 + page = pmd_page(*pmd); 295 + if (vma->vm_flags & VM_LOCKED) 296 + mlock_folio(page_folio(page)); 297 + else 298 + munlock_page(page); 299 + goto out; 402 300 } 403 - pte_unmap_unlock(pte, ptl); 404 - return start; 301 + 302 + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 303 + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { 304 + if (!pte_present(*pte)) 305 + continue; 306 + page = vm_normal_page(vma, addr, *pte); 307 + if (!page) 308 + continue; 309 + if (PageTransCompound(page)) 310 + continue; 311 + if (vma->vm_flags & VM_LOCKED) 312 + mlock_folio(page_folio(page)); 313 + else 314 + munlock_page(page); 315 + } 316 + pte_unmap(start_pte); 317 + out: 318 + spin_unlock(ptl); 319 + cond_resched(); 320 + return 0; 405 321 } 406 322 407 323 /* 408 - * munlock_vma_pages_range() - munlock all pages in the vma range.' 409 - * @vma - vma containing range to be munlock()ed. 324 + * mlock_vma_pages_range() - mlock any pages already in the range, 325 + * or munlock all pages in the range. 326 + * @vma - vma containing range to be mlock()ed or munlock()ed 410 327 * @start - start address in @vma of the range 411 - * @end - end of range in @vma. 328 + * @end - end of range in @vma 329 + * @newflags - the new set of flags for @vma. 412 330 * 413 - * For mremap(), munmap() and exit(). 414 - * 415 - * Called with @vma VM_LOCKED. 416 - * 417 - * Returns with VM_LOCKED cleared. Callers must be prepared to 418 - * deal with this. 
419 - * 420 - * We don't save and restore VM_LOCKED here because pages are 421 - * still on lru. In unmap path, pages might be scanned by reclaim 422 - * and re-mlocked by page_mlock/try_to_unmap before we unmap and 423 - * free them. This will result in freeing mlocked pages. 331 + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; 332 + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. 424 333 */ 425 - void munlock_vma_pages_range(struct vm_area_struct *vma, 426 - unsigned long start, unsigned long end) 334 + static void mlock_vma_pages_range(struct vm_area_struct *vma, 335 + unsigned long start, unsigned long end, vm_flags_t newflags) 427 336 { 428 - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; 337 + static const struct mm_walk_ops mlock_walk_ops = { 338 + .pmd_entry = mlock_pte_range, 339 + }; 429 340 430 - while (start < end) { 431 - struct page *page; 432 - unsigned int page_mask = 0; 433 - unsigned long page_increm; 434 - struct pagevec pvec; 435 - struct zone *zone; 341 + /* 342 + * There is a slight chance that concurrent page migration, 343 + * or page reclaim finding a page of this now-VM_LOCKED vma, 344 + * will call mlock_vma_page() and raise page's mlock_count: 345 + * double counting, leaving the page unevictable indefinitely. 346 + * Communicate this danger to mlock_vma_page() with VM_IO, 347 + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. 348 + * mmap_lock is held in write mode here, so this weird 349 + * combination should not be visible to other mmap_lock users; 350 + * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. 
351 + */ 352 + if (newflags & VM_LOCKED) 353 + newflags |= VM_IO; 354 + WRITE_ONCE(vma->vm_flags, newflags); 436 355 437 - pagevec_init(&pvec); 438 - /* 439 - * Although FOLL_DUMP is intended for get_dump_page(), 440 - * it just so happens that its special treatment of the 441 - * ZERO_PAGE (returning an error instead of doing get_page) 442 - * suits munlock very well (and if somehow an abnormal page 443 - * has sneaked into the range, we won't oops here: great). 444 - */ 445 - page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); 356 + lru_add_drain(); 357 + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); 358 + lru_add_drain(); 446 359 447 - if (page && !IS_ERR(page)) { 448 - if (PageTransTail(page)) { 449 - VM_BUG_ON_PAGE(PageMlocked(page), page); 450 - put_page(page); /* follow_page_mask() */ 451 - } else if (PageTransHuge(page)) { 452 - lock_page(page); 453 - /* 454 - * Any THP page found by follow_page_mask() may 455 - * have gotten split before reaching 456 - * munlock_vma_page(), so we need to compute 457 - * the page_mask here instead. 458 - */ 459 - page_mask = munlock_vma_page(page); 460 - unlock_page(page); 461 - put_page(page); /* follow_page_mask() */ 462 - } else { 463 - /* 464 - * Non-huge pages are handled in batches via 465 - * pagevec. The pin from follow_page_mask() 466 - * prevents them from collapsing by THP. 467 - */ 468 - pagevec_add(&pvec, page); 469 - zone = page_zone(page); 470 - 471 - /* 472 - * Try to fill the rest of pagevec using fast 473 - * pte walk. This will also update start to 474 - * the next page to process. Then munlock the 475 - * pagevec. 
476 - */ 477 - start = __munlock_pagevec_fill(&pvec, vma, 478 - zone, start, end); 479 - __munlock_pagevec(&pvec, zone); 480 - goto next; 481 - } 482 - } 483 - page_increm = 1 + page_mask; 484 - start += page_increm * PAGE_SIZE; 485 - next: 486 - cond_resched(); 360 + if (newflags & VM_IO) { 361 + newflags &= ~VM_IO; 362 + WRITE_ONCE(vma->vm_flags, newflags); 487 363 } 488 364 } 489 365 ··· 382 500 pgoff_t pgoff; 383 501 int nr_pages; 384 502 int ret = 0; 385 - int lock = !!(newflags & VM_LOCKED); 386 - vm_flags_t old_flags = vma->vm_flags; 503 + vm_flags_t oldflags = vma->vm_flags; 387 504 388 - if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 505 + if (newflags == oldflags || (oldflags & VM_SPECIAL) || 389 506 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || 390 507 vma_is_dax(vma) || vma_is_secretmem(vma)) 391 508 /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ ··· 416 535 * Keep track of amount of locked VM. 417 536 */ 418 537 nr_pages = (end - start) >> PAGE_SHIFT; 419 - if (!lock) 538 + if (!(newflags & VM_LOCKED)) 420 539 nr_pages = -nr_pages; 421 - else if (old_flags & VM_LOCKED) 540 + else if (oldflags & VM_LOCKED) 422 541 nr_pages = 0; 423 542 mm->locked_vm += nr_pages; 424 543 ··· 428 547 * set VM_LOCKED, populate_vma_page_range will bring it back. 
429 548 */ 430 549 431 - if (lock) 550 + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { 551 + /* No work to do, and mlocking twice would be wrong */ 432 552 vma->vm_flags = newflags; 433 - else 434 - munlock_vma_pages_range(vma, start, end); 435 - 553 + } else { 554 + mlock_vma_pages_range(vma, start, end, newflags); 555 + } 436 556 out: 437 557 *prev = vma; 438 558 return ret; ··· 525 643 } 526 644 527 645 return count >> PAGE_SHIFT; 646 + } 647 + 648 + /* 649 + * convert get_user_pages() return value to posix mlock() error 650 + */ 651 + static int __mlock_posix_error_return(long retval) 652 + { 653 + if (retval == -EFAULT) 654 + retval = -ENOMEM; 655 + else if (retval == -ENOMEM) 656 + retval = -EAGAIN; 657 + return retval; 528 658 } 529 659 530 660 static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)

+2 -30

mm/mmap.c

··· 2672 2672 vma->vm_prev = NULL; 2673 2673 do { 2674 2674 vma_rb_erase(vma, &mm->mm_rb); 2675 + if (vma->vm_flags & VM_LOCKED) 2676 + mm->locked_vm -= vma_pages(vma); 2675 2677 mm->map_count--; 2676 2678 tail_vma = vma; 2677 2679 vma = vma->vm_next; ··· 2778 2776 return __split_vma(mm, vma, addr, new_below); 2779 2777 } 2780 2778 2781 - static inline void 2782 - unlock_range(struct vm_area_struct *start, unsigned long limit) 2783 - { 2784 - struct mm_struct *mm = start->vm_mm; 2785 - struct vm_area_struct *tmp = start; 2786 - 2787 - while (tmp && tmp->vm_start < limit) { 2788 - if (tmp->vm_flags & VM_LOCKED) { 2789 - mm->locked_vm -= vma_pages(tmp); 2790 - munlock_vma_pages_all(tmp); 2791 - } 2792 - 2793 - tmp = tmp->vm_next; 2794 - } 2795 - } 2796 - 2797 2779 /* Munmap is split into 2 main parts -- this part which finds 2798 2780 * what needs doing, and the areas themselves, which do the 2799 2781 * work. This now handles partial unmappings. ··· 2857 2871 if (error) 2858 2872 return error; 2859 2873 } 2860 - 2861 - /* 2862 - * unlock any mlock()ed ranges before detaching vmas 2863 - */ 2864 - if (mm->locked_vm) 2865 - unlock_range(vma, end); 2866 2874 2867 2875 /* Detach vmas from rbtree */ 2868 2876 if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) ··· 3125 3145 * Nothing can be holding mm->mmap_lock here and the above call 3126 3146 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in 3127 3147 * __oom_reap_task_mm() will not block. 3128 - * 3129 - * This needs to be done before calling unlock_range(), 3130 - * which clears VM_LOCKED, otherwise the oom reaper cannot 3131 - * reliably test it. 3132 3148 */ 3133 3149 (void)__oom_reap_task_mm(mm); 3134 - 3135 3150 set_bit(MMF_OOM_SKIP, &mm->flags); 3136 3151 } 3137 3152 3138 3153 mmap_write_lock(mm); 3139 - if (mm->locked_vm) 3140 - unlock_range(mm->mmap, ULONG_MAX); 3141 - 3142 3154 arch_exit_mmap(mm); 3143 3155 3144 3156 vma = mm->mmap;

+7

mm/mmzone.c

··· 81 81 82 82 for_each_lru(lru) 83 83 INIT_LIST_HEAD(&lruvec->lists[lru]); 84 + /* 85 + * The "Unevictable LRU" is imaginary: though its size is maintained, 86 + * it is never scanned, and unevictable pages are not threaded on it 87 + * (so that their lru fields can be reused to hold mlock_count). 88 + * Poison its list head, so that any operations on it would crash. 89 + */ 90 + list_del(&lruvec->lists[LRU_UNEVICTABLE]); 84 91 } 85 92 86 93 #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)

+1 -1

mm/oom_kill.c

··· 523 523 set_bit(MMF_UNSTABLE, &mm->flags); 524 524 525 525 for (vma = mm->mmap ; vma; vma = vma->vm_next) { 526 - if (!can_madv_lru_vma(vma)) 526 + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) 527 527 continue; 528 528 529 529 /*

+1 -2

mm/page_alloc.c

··· 734 734 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 735 735 set_compound_order(page, order); 736 736 atomic_set(compound_mapcount_ptr(page), -1); 737 - if (hpage_pincount_available(page)) 738 - atomic_set(compound_pincount_ptr(page), 0); 737 + atomic_set(compound_pincount_ptr(page), 0); 739 738 } 740 739 741 740 static void prep_compound_tail(struct page *head, int tail_idx)

+14 -16

mm/page_idle.c

··· 13 13 #include <linux/page_ext.h> 14 14 #include <linux/page_idle.h> 15 15 16 + #include "internal.h" 17 + 16 18 #define BITMAP_CHUNK_SIZE sizeof(u64) 17 19 #define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) 18 20 ··· 46 44 return page; 47 45 } 48 46 49 - static bool page_idle_clear_pte_refs_one(struct page *page, 47 + static bool page_idle_clear_pte_refs_one(struct folio *folio, 50 48 struct vm_area_struct *vma, 51 49 unsigned long addr, void *arg) 52 50 { 53 - struct page_vma_mapped_walk pvmw = { 54 - .page = page, 55 - .vma = vma, 56 - .address = addr, 57 - }; 51 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); 58 52 bool referenced = false; 59 53 60 54 while (page_vma_mapped_walk(&pvmw)) { ··· 72 74 } 73 75 74 76 if (referenced) { 75 - clear_page_idle(page); 77 + folio_clear_idle(folio); 76 78 /* 77 79 * We cleared the referenced bit in a mapping to this page. To 78 80 * avoid interference with page reclaim, mark it young so that 79 - * page_referenced() will return > 0. 81 + * folio_referenced() will return > 0. 80 82 */ 81 - set_page_young(page); 83 + folio_set_young(folio); 82 84 } 83 85 return true; 84 86 } 85 87 86 88 static void page_idle_clear_pte_refs(struct page *page) 87 89 { 90 + struct folio *folio = page_folio(page); 88 91 /* 89 92 * Since rwc.arg is unused, rwc is effectively immutable, so we 90 93 * can make it static const to save some cycles and stack. 
91 94 */ 92 95 static const struct rmap_walk_control rwc = { 93 96 .rmap_one = page_idle_clear_pte_refs_one, 94 - .anon_lock = page_lock_anon_vma_read, 97 + .anon_lock = folio_lock_anon_vma_read, 95 98 }; 96 99 bool need_lock; 97 100 98 - if (!page_mapped(page) || 99 - !page_rmapping(page)) 101 + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) 100 102 return; 101 103 102 - need_lock = !PageAnon(page) || PageKsm(page); 103 - if (need_lock && !trylock_page(page)) 104 + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); 105 + if (need_lock && !folio_trylock(folio)) 104 106 return; 105 107 106 - rmap_walk(page, (struct rmap_walk_control *)&rwc); 108 + rmap_walk(folio, &rwc); 107 109 108 110 if (need_lock) 109 - unlock_page(page); 111 + folio_unlock(folio); 110 112 } 111 113 112 114 static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,

+26 -32

mm/page_vma_mapped.c

··· 53 53 return true; 54 54 } 55 55 56 - static inline bool pfn_is_match(struct page *page, unsigned long pfn) 57 - { 58 - unsigned long page_pfn = page_to_pfn(page); 59 - 60 - /* normal page and hugetlbfs page */ 61 - if (!PageTransCompound(page) || PageHuge(page)) 62 - return page_pfn == pfn; 63 - 64 - /* THP can be referenced by any subpage */ 65 - return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page); 66 - } 67 - 68 56 /** 69 57 * check_pte - check if @pvmw->page is mapped at the @pvmw->pte 70 58 * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking ··· 104 116 pfn = pte_pfn(*pvmw->pte); 105 117 } 106 118 107 - return pfn_is_match(pvmw->page, pfn); 119 + return (pfn - pvmw->pfn) < pvmw->nr_pages; 120 + } 121 + 122 + /* Returns true if the two ranges overlap. Careful to not overflow. */ 123 + static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) 124 + { 125 + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) 126 + return false; 127 + if (pfn > pvmw->pfn + pvmw->nr_pages - 1) 128 + return false; 129 + return true; 108 130 } 109 131 110 132 static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) ··· 125 127 } 126 128 127 129 /** 128 - * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at 130 + * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at 129 131 * @pvmw->address 130 132 * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags 131 133 * must be set. pmd, pte and ptl must be NULL. 
··· 150 152 */ 151 153 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) 152 154 { 153 - struct mm_struct *mm = pvmw->vma->vm_mm; 154 - struct page *page = pvmw->page; 155 + struct vm_area_struct *vma = pvmw->vma; 156 + struct mm_struct *mm = vma->vm_mm; 155 157 unsigned long end; 156 158 pgd_t *pgd; 157 159 p4d_t *p4d; ··· 162 164 if (pvmw->pmd && !pvmw->pte) 163 165 return not_found(pvmw); 164 166 165 - if (unlikely(PageHuge(page))) { 167 + if (unlikely(is_vm_hugetlb_page(vma))) { 168 + unsigned long size = pvmw->nr_pages * PAGE_SIZE; 166 169 /* The only possible mapping was handled on last iteration */ 167 170 if (pvmw->pte) 168 171 return not_found(pvmw); 169 172 170 173 /* when pud is not present, pte will be NULL */ 171 - pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); 174 + pvmw->pte = huge_pte_offset(mm, pvmw->address, size); 172 175 if (!pvmw->pte) 173 176 return false; 174 177 175 - pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); 178 + pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm, 179 + pvmw->pte); 176 180 spin_lock(pvmw->ptl); 177 181 if (!check_pte(pvmw)) 178 182 return not_found(pvmw); 179 183 return true; 180 184 } 181 185 182 - /* 183 - * Seek to next pte only makes sense for THP. 184 - * But more important than that optimization, is to filter out 185 - * any PageKsm page: whose page->index misleads vma_address() 186 - * and vma_address_end() to disaster. 187 - */ 188 - end = PageTransCompound(page) ? 
189 - vma_address_end(page, pvmw->vma) : 190 - pvmw->address + PAGE_SIZE; 186 + end = vma_address_end(pvmw); 191 187 if (pvmw->pte) 192 188 goto next_pte; 193 189 restart: ··· 216 224 if (likely(pmd_trans_huge(pmde))) { 217 225 if (pvmw->flags & PVMW_MIGRATION) 218 226 return not_found(pvmw); 219 - if (pmd_page(pmde) != page) 227 + if (!check_pmd(pmd_pfn(pmde), pvmw)) 220 228 return not_found(pvmw); 221 229 return true; 222 230 } ··· 228 236 return not_found(pvmw); 229 237 entry = pmd_to_swp_entry(pmde); 230 238 if (!is_migration_entry(entry) || 231 - pfn_swap_entry_to_page(entry) != page) 239 + !check_pmd(swp_offset(entry), pvmw)) 232 240 return not_found(pvmw); 233 241 return true; 234 242 } ··· 242 250 * cleared *pmd but not decremented compound_mapcount(). 243 251 */ 244 252 if ((pvmw->flags & PVMW_SYNC) && 245 - PageTransCompound(page)) { 253 + transparent_hugepage_active(vma) && 254 + (pvmw->nr_pages >= HPAGE_PMD_NR)) { 246 255 spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); 247 256 248 257 spin_unlock(ptl); ··· 300 307 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 301 308 { 302 309 struct page_vma_mapped_walk pvmw = { 303 - .page = page, 310 + .pfn = page_to_pfn(page), 311 + .nr_pages = 1, 304 312 .vma = vma, 305 313 .flags = PVMW_SYNC, 306 314 };

+100 -8

mm/readahead.c

··· 262 262 263 263 blk_finish_plug(&plug); 264 264 265 - BUG_ON(!list_empty(pages)); 265 + BUG_ON(pages && !list_empty(pages)); 266 266 BUG_ON(readahead_count(rac)); 267 267 268 268 out: ··· 361 361 * behaviour which would occur if page allocations are causing VM writeback. 362 362 * We really don't want to intermingle reads and writes like that. 363 363 */ 364 - void do_page_cache_ra(struct readahead_control *ractl, 364 + static void do_page_cache_ra(struct readahead_control *ractl, 365 365 unsigned long nr_to_read, unsigned long lookahead_size) 366 366 { 367 367 struct inode *inode = ractl->mapping->host; ··· 546 546 } 547 547 548 548 /* 549 + * There are some parts of the kernel which assume that PMD entries 550 + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, 551 + * limit the maximum allocation order to PMD size. I'm not aware of any 552 + * assumptions about maximum order if THP are disabled, but 8 seems like 553 + * a good order (that's 1MB if you're using 4kB pages) 554 + */ 555 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 556 + #define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER 557 + #else 558 + #define MAX_PAGECACHE_ORDER 8 559 + #endif 560 + 561 + static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, 562 + pgoff_t mark, unsigned int order, gfp_t gfp) 563 + { 564 + int err; 565 + struct folio *folio = filemap_alloc_folio(gfp, order); 566 + 567 + if (!folio) 568 + return -ENOMEM; 569 + if (mark - index < (1UL << order)) 570 + folio_set_readahead(folio); 571 + err = filemap_add_folio(ractl->mapping, folio, index, gfp); 572 + if (err) 573 + folio_put(folio); 574 + else 575 + ractl->_nr_pages += 1UL << order; 576 + return err; 577 + } 578 + 579 + void page_cache_ra_order(struct readahead_control *ractl, 580 + struct file_ra_state *ra, unsigned int new_order) 581 + { 582 + struct address_space *mapping = ractl->mapping; 583 + pgoff_t index = readahead_index(ractl); 584 + pgoff_t limit = (i_size_read(mapping->host) - 1) >> 
PAGE_SHIFT; 585 + pgoff_t mark = index + ra->size - ra->async_size; 586 + int err = 0; 587 + gfp_t gfp = readahead_gfp_mask(mapping); 588 + 589 + if (!mapping_large_folio_support(mapping) || ra->size < 4) 590 + goto fallback; 591 + 592 + limit = min(limit, index + ra->size - 1); 593 + 594 + if (new_order < MAX_PAGECACHE_ORDER) { 595 + new_order += 2; 596 + if (new_order > MAX_PAGECACHE_ORDER) 597 + new_order = MAX_PAGECACHE_ORDER; 598 + while ((1 << new_order) > ra->size) 599 + new_order--; 600 + } 601 + 602 + while (index <= limit) { 603 + unsigned int order = new_order; 604 + 605 + /* Align with smaller pages if needed */ 606 + if (index & ((1UL << order) - 1)) { 607 + order = __ffs(index); 608 + if (order == 1) 609 + order = 0; 610 + } 611 + /* Don't allocate pages past EOF */ 612 + while (index + (1UL << order) - 1 > limit) { 613 + if (--order == 1) 614 + order = 0; 615 + } 616 + err = ra_alloc_folio(ractl, index, mark, order, gfp); 617 + if (err) 618 + break; 619 + index += 1UL << order; 620 + } 621 + 622 + if (index > limit) { 623 + ra->size += index - limit - 1; 624 + ra->async_size += index - limit - 1; 625 + } 626 + 627 + read_pages(ractl, NULL, false); 628 + 629 + /* 630 + * If there were already pages in the page cache, then we may have 631 + * left some gaps. Let the regular readahead code take care of this 632 + * situation. 633 + */ 634 + if (!err) 635 + return; 636 + fallback: 637 + do_page_cache_ra(ractl, ra->size, ra->async_size); 638 + } 639 + 640 + /* 549 641 * A minimal readahead algorithm for trivial sequential/random reads. 550 642 */ 551 643 static void ondemand_readahead(struct readahead_control *ractl, 552 - bool hit_readahead_marker, unsigned long req_size) 644 + struct folio *folio, unsigned long req_size) 553 645 { 554 646 struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); 555 647 struct file_ra_state *ra = ractl->ra; ··· 676 584 } 677 585 678 586 /* 679 - * Hit a marked page without valid readahead state. 
587 + * Hit a marked folio without valid readahead state. 680 588 * E.g. interleaved reads. 681 589 * Query the pagecache for async_size, which normally equals to 682 590 * readahead size. Ramp it up and use it as the new readahead size. 683 591 */ 684 - if (hit_readahead_marker) { 592 + if (folio) { 685 593 pgoff_t start; 686 594 687 595 rcu_read_lock(); ··· 754 662 } 755 663 756 664 ractl->_index = ra->start; 757 - do_page_cache_ra(ractl, ra->size, ra->async_size); 665 + page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0); 758 666 } 759 667 760 668 void page_cache_sync_ra(struct readahead_control *ractl, ··· 782 690 } 783 691 784 692 /* do read-ahead */ 785 - ondemand_readahead(ractl, false, req_count); 693 + ondemand_readahead(ractl, NULL, req_count); 786 694 } 787 695 EXPORT_SYMBOL_GPL(page_cache_sync_ra); 788 696 ··· 805 713 return; 806 714 807 715 /* do read-ahead */ 808 - ondemand_readahead(ractl, true, req_count); 716 + ondemand_readahead(ractl, folio, req_count); 809 717 } 810 718 EXPORT_SYMBOL_GPL(page_cache_async_ra); 811 719

+239 -339

mm/rmap.c

··· 107 107 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 108 108 109 109 /* 110 - * Synchronize against page_lock_anon_vma_read() such that 110 + * Synchronize against folio_lock_anon_vma_read() such that 111 111 * we can safely hold the lock without the anon_vma getting 112 112 * freed. 113 113 * 114 114 * Relies on the full mb implied by the atomic_dec_and_test() from 115 115 * put_anon_vma() against the acquire barrier implied by 116 - * down_read_trylock() from page_lock_anon_vma_read(). This orders: 116 + * down_read_trylock() from folio_lock_anon_vma_read(). This orders: 117 117 * 118 - * page_lock_anon_vma_read() VS put_anon_vma() 118 + * folio_lock_anon_vma_read() VS put_anon_vma() 119 119 * down_read_trylock() atomic_dec_and_test() 120 120 * LOCK MB 121 121 * atomic_read() rwsem_is_locked() ··· 168 168 * allocate a new one. 169 169 * 170 170 * Anon-vma allocations are very subtle, because we may have 171 - * optimistically looked up an anon_vma in page_lock_anon_vma_read() 171 + * optimistically looked up an anon_vma in folio_lock_anon_vma_read() 172 172 * and that may actually touch the rwsem even in the newly 173 173 * allocated vma (it depends on RCU to make sure that the 174 174 * anon_vma isn't actually destroyed). ··· 526 526 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 527 527 * reference like with page_get_anon_vma() and then block on the mutex. 
528 528 */ 529 - struct anon_vma *page_lock_anon_vma_read(struct page *page) 529 + struct anon_vma *folio_lock_anon_vma_read(struct folio *folio) 530 530 { 531 531 struct anon_vma *anon_vma = NULL; 532 532 struct anon_vma *root_anon_vma; 533 533 unsigned long anon_mapping; 534 534 535 535 rcu_read_lock(); 536 - anon_mapping = (unsigned long)READ_ONCE(page->mapping); 536 + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); 537 537 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 538 538 goto out; 539 - if (!page_mapped(page)) 539 + if (!folio_mapped(folio)) 540 540 goto out; 541 541 542 542 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 543 543 root_anon_vma = READ_ONCE(anon_vma->root); 544 544 if (down_read_trylock(&root_anon_vma->rwsem)) { 545 545 /* 546 - * If the page is still mapped, then this anon_vma is still 546 + * If the folio is still mapped, then this anon_vma is still 547 547 * its anon_vma, and holding the mutex ensures that it will 548 548 * not go away, see anon_vma_free(). 549 549 */ 550 - if (!page_mapped(page)) { 550 + if (!folio_mapped(folio)) { 551 551 up_read(&root_anon_vma->rwsem); 552 552 anon_vma = NULL; 553 553 } ··· 560 560 goto out; 561 561 } 562 562 563 - if (!page_mapped(page)) { 563 + if (!folio_mapped(folio)) { 564 564 rcu_read_unlock(); 565 565 put_anon_vma(anon_vma); 566 566 return NULL; ··· 737 737 */ 738 738 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 739 739 { 740 - if (PageAnon(page)) { 741 - struct anon_vma *page__anon_vma = page_anon_vma(page); 740 + struct folio *folio = page_folio(page); 741 + if (folio_test_anon(folio)) { 742 + struct anon_vma *page__anon_vma = folio_anon_vma(folio); 742 743 /* 743 744 * Note: swapoff's unuse_vma() is more efficient with this 744 745 * check, and needs it to match anon_vma when KSM is active. 
··· 749 748 return -EFAULT; 750 749 } else if (!vma->vm_file) { 751 750 return -EFAULT; 752 - } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { 751 + } else if (vma->vm_file->f_mapping != folio->mapping) { 753 752 return -EFAULT; 754 753 } 755 754 ··· 790 789 return pmd; 791 790 } 792 791 793 - struct page_referenced_arg { 792 + struct folio_referenced_arg { 794 793 int mapcount; 795 794 int referenced; 796 795 unsigned long vm_flags; 797 796 struct mem_cgroup *memcg; 798 797 }; 799 798 /* 800 - * arg: page_referenced_arg will be passed 799 + * arg: folio_referenced_arg will be passed 801 800 */ 802 - static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, 803 - unsigned long address, void *arg) 801 + static bool folio_referenced_one(struct folio *folio, 802 + struct vm_area_struct *vma, unsigned long address, void *arg) 804 803 { 805 - struct page_referenced_arg *pra = arg; 806 - struct page_vma_mapped_walk pvmw = { 807 - .page = page, 808 - .vma = vma, 809 - .address = address, 810 - }; 804 + struct folio_referenced_arg *pra = arg; 805 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 811 806 int referenced = 0; 812 807 813 808 while (page_vma_mapped_walk(&pvmw)) { 814 809 address = pvmw.address; 815 810 816 - if (vma->vm_flags & VM_LOCKED) { 811 + if ((vma->vm_flags & VM_LOCKED) && 812 + (!folio_test_large(folio) || !pvmw.pte)) { 813 + /* Restore the mlock which got missed */ 814 + mlock_vma_folio(folio, vma, !pvmw.pte); 817 815 page_vma_mapped_walk_done(&pvmw); 818 816 pra->vm_flags |= VM_LOCKED; 819 817 return false; /* To break the loop */ ··· 824 824 /* 825 825 * Don't treat a reference through 826 826 * a sequentially read mapping as such. 827 - * If the page has been used in another mapping, 827 + * If the folio has been used in another mapping, 828 828 * we will catch it; if this other mapping is 829 829 * already gone, the unmap path will have set 830 - * PG_referenced or activated the page. 
830 + * the referenced flag or activated the folio. 831 831 */ 832 832 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 833 833 referenced++; ··· 837 837 pvmw.pmd)) 838 838 referenced++; 839 839 } else { 840 - /* unexpected pmd-mapped page? */ 840 + /* unexpected pmd-mapped folio? */ 841 841 WARN_ON_ONCE(1); 842 842 } 843 843 ··· 845 845 } 846 846 847 847 if (referenced) 848 - clear_page_idle(page); 849 - if (test_and_clear_page_young(page)) 848 + folio_clear_idle(folio); 849 + if (folio_test_clear_young(folio)) 850 850 referenced++; 851 851 852 852 if (referenced) { 853 853 pra->referenced++; 854 - pra->vm_flags |= vma->vm_flags; 854 + pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; 855 855 } 856 856 857 857 if (!pra->mapcount) ··· 860 860 return true; 861 861 } 862 862 863 - static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) 863 + static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) 864 864 { 865 - struct page_referenced_arg *pra = arg; 865 + struct folio_referenced_arg *pra = arg; 866 866 struct mem_cgroup *memcg = pra->memcg; 867 867 868 868 if (!mm_match_cgroup(vma->vm_mm, memcg)) ··· 872 872 } 873 873 874 874 /** 875 - * page_referenced - test if the page was referenced 876 - * @page: the page to test 877 - * @is_locked: caller holds lock on the page 875 + * folio_referenced() - Test if the folio was referenced. 876 + * @folio: The folio to test. 877 + * @is_locked: Caller holds lock on the folio. 878 878 * @memcg: target memory cgroup 879 - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 879 + * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. 880 880 * 881 - * Quick test_and_clear_referenced for all mappings to a page, 882 - * returns the number of ptes which referenced the page. 881 + * Quick test_and_clear_referenced for all mappings of a folio, 882 + * 883 + * Return: The number of mappings which referenced the folio. 
883 884 */ 884 - int page_referenced(struct page *page, 885 - int is_locked, 886 - struct mem_cgroup *memcg, 887 - unsigned long *vm_flags) 885 + int folio_referenced(struct folio *folio, int is_locked, 886 + struct mem_cgroup *memcg, unsigned long *vm_flags) 888 887 { 889 888 int we_locked = 0; 890 - struct page_referenced_arg pra = { 891 - .mapcount = total_mapcount(page), 889 + struct folio_referenced_arg pra = { 890 + .mapcount = folio_mapcount(folio), 892 891 .memcg = memcg, 893 892 }; 894 893 struct rmap_walk_control rwc = { 895 - .rmap_one = page_referenced_one, 894 + .rmap_one = folio_referenced_one, 896 895 .arg = (void *)&pra, 897 - .anon_lock = page_lock_anon_vma_read, 896 + .anon_lock = folio_lock_anon_vma_read, 898 897 }; 899 898 900 899 *vm_flags = 0; 901 900 if (!pra.mapcount) 902 901 return 0; 903 902 904 - if (!page_rmapping(page)) 903 + if (!folio_raw_mapping(folio)) 905 904 return 0; 906 905 907 - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 908 - we_locked = trylock_page(page); 906 + if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { 907 + we_locked = folio_trylock(folio); 909 908 if (!we_locked) 910 909 return 1; 911 910 } ··· 915 916 * cgroups 916 917 */ 917 918 if (memcg) { 918 - rwc.invalid_vma = invalid_page_referenced_vma; 919 + rwc.invalid_vma = invalid_folio_referenced_vma; 919 920 } 920 921 921 - rmap_walk(page, &rwc); 922 + rmap_walk(folio, &rwc); 922 923 *vm_flags = pra.vm_flags; 923 924 924 925 if (we_locked) 925 - unlock_page(page); 926 + folio_unlock(folio); 926 927 927 928 return pra.referenced; 928 929 } 929 930 930 - static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, 931 + static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, 931 932 unsigned long address, void *arg) 932 933 { 933 - struct page_vma_mapped_walk pvmw = { 934 - .page = page, 935 - .vma = vma, 936 - .address = address, 937 - .flags = PVMW_SYNC, 938 - }; 934 + DEFINE_FOLIO_VMA_WALK(pvmw, 
folio, vma, address, PVMW_SYNC); 939 935 struct mmu_notifier_range range; 940 936 int *cleaned = arg; 941 937 942 938 /* 943 939 * We have to assume the worse case ie pmd for invalidation. Note that 944 - * the page can not be free from this function. 940 + * the folio can not be freed from this function. 945 941 */ 946 942 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 947 943 0, vma, vma->vm_mm, address, 948 - vma_address_end(page, vma)); 944 + vma_address_end(&pvmw)); 949 945 mmu_notifier_invalidate_range_start(&range); 950 946 951 947 while (page_vma_mapped_walk(&pvmw)) { ··· 968 974 if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) 969 975 continue; 970 976 971 - flush_cache_page(vma, address, page_to_pfn(page)); 977 + flush_cache_page(vma, address, folio_pfn(folio)); 972 978 entry = pmdp_invalidate(vma, address, pmd); 973 979 entry = pmd_wrprotect(entry); 974 980 entry = pmd_mkclean(entry); 975 981 set_pmd_at(vma->vm_mm, address, pmd, entry); 976 982 ret = 1; 977 983 #else 978 - /* unexpected pmd-mapped page? */ 984 + /* unexpected pmd-mapped folio? */ 979 985 WARN_ON_ONCE(1); 980 986 #endif 981 987 } ··· 1023 1029 if (!mapping) 1024 1030 return 0; 1025 1031 1026 - rmap_walk(&folio->page, &rwc); 1032 + rmap_walk(folio, &rwc); 1027 1033 1028 1034 return cleaned; 1029 1035 } ··· 1051 1057 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1052 1058 /* 1053 1059 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written 1054 - * simultaneously, so a concurrent reader (eg page_referenced()'s 1055 - * PageAnon()) will not see one without the other. 1060 + * simultaneously, so a concurrent reader (eg folio_referenced()'s 1061 + * folio_test_anon()) will not see one without the other. 
1056 1062 */ 1057 1063 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); 1058 1064 } ··· 1102 1108 static void __page_check_anon_rmap(struct page *page, 1103 1109 struct vm_area_struct *vma, unsigned long address) 1104 1110 { 1111 + struct folio *folio = page_folio(page); 1105 1112 /* 1106 1113 * The page's anon-rmap details (mapping and index) are guaranteed to 1107 1114 * be set up correctly at this point. ··· 1114 1119 * are initially only visible via the pagetables, and the pte is locked 1115 1120 * over the call to page_add_new_anon_rmap. 1116 1121 */ 1117 - VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); 1122 + VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, 1123 + folio); 1118 1124 VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), 1119 1125 page); 1120 1126 } ··· 1177 1181 __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); 1178 1182 } 1179 1183 1180 - if (unlikely(PageKsm(page))) { 1184 + if (unlikely(PageKsm(page))) 1181 1185 unlock_page_memcg(page); 1182 - return; 1183 - } 1184 1186 1185 1187 /* address might be in next vma when migration races vma_adjust */ 1186 - if (first) 1188 + else if (first) 1187 1189 __page_set_anon_rmap(page, vma, address, 1188 1190 flags & RMAP_EXCLUSIVE); 1189 1191 else 1190 1192 __page_check_anon_rmap(page, vma, address); 1193 + 1194 + mlock_vma_page(page, vma, compound); 1191 1195 } 1192 1196 1193 1197 /** ··· 1212 1216 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1213 1217 /* increment count (starts at -1) */ 1214 1218 atomic_set(compound_mapcount_ptr(page), 0); 1215 - if (hpage_pincount_available(page)) 1216 - atomic_set(compound_pincount_ptr(page), 0); 1219 + atomic_set(compound_pincount_ptr(page), 0); 1217 1220 1218 1221 __mod_lruvec_page_state(page, NR_ANON_THPS, nr); 1219 1222 } else { ··· 1227 1232 1228 1233 /** 1229 1234 * page_add_file_rmap - add pte mapping to a file page 1230 - * @page: the page to add the mapping to 1231 - * @compound: 
charge the page as compound or small page 1235 + * @page: the page to add the mapping to 1236 + * @vma: the vm area in which the mapping is added 1237 + * @compound: charge the page as compound or small page 1232 1238 * 1233 1239 * The caller needs to hold the pte lock. 1234 1240 */ 1235 - void page_add_file_rmap(struct page *page, bool compound) 1241 + void page_add_file_rmap(struct page *page, 1242 + struct vm_area_struct *vma, bool compound) 1236 1243 { 1237 1244 int i, nr = 1; 1238 1245 ··· 1268 1271 nr_pages); 1269 1272 } else { 1270 1273 if (PageTransCompound(page) && page_mapping(page)) { 1271 - struct page *head = compound_head(page); 1272 - 1273 1274 VM_WARN_ON_ONCE(!PageLocked(page)); 1274 - 1275 - SetPageDoubleMap(head); 1276 - if (PageMlocked(page)) 1277 - clear_page_mlock(head); 1275 + SetPageDoubleMap(compound_head(page)); 1278 1276 } 1279 1277 if (!atomic_inc_and_test(&page->_mapcount)) 1280 1278 goto out; ··· 1277 1285 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); 1278 1286 out: 1279 1287 unlock_page_memcg(page); 1288 + 1289 + mlock_vma_page(page, vma, compound); 1280 1290 } 1281 1291 1282 1292 static void page_remove_file_rmap(struct page *page, bool compound) ··· 1321 1327 * pte lock(a spinlock) is held, which implies preemption disabled. 
1322 1328 */ 1323 1329 __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); 1324 - 1325 - if (unlikely(PageMlocked(page))) 1326 - clear_page_mlock(page); 1327 1330 } 1328 1331 1329 1332 static void page_remove_anon_compound_rmap(struct page *page) ··· 1360 1369 nr = thp_nr_pages(page); 1361 1370 } 1362 1371 1363 - if (unlikely(PageMlocked(page))) 1364 - clear_page_mlock(page); 1365 - 1366 1372 if (nr) 1367 1373 __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); 1368 1374 } ··· 1367 1379 /** 1368 1380 * page_remove_rmap - take down pte mapping from a page 1369 1381 * @page: page to remove mapping from 1382 + * @vma: the vm area from which the mapping is removed 1370 1383 * @compound: uncharge the page as compound or small page 1371 1384 * 1372 1385 * The caller needs to hold the pte lock. 1373 1386 */ 1374 - void page_remove_rmap(struct page *page, bool compound) 1387 + void page_remove_rmap(struct page *page, 1388 + struct vm_area_struct *vma, bool compound) 1375 1389 { 1376 1390 lock_page_memcg(page); 1377 1391 ··· 1398 1408 */ 1399 1409 __dec_lruvec_page_state(page, NR_ANON_MAPPED); 1400 1410 1401 - if (unlikely(PageMlocked(page))) 1402 - clear_page_mlock(page); 1403 - 1404 1411 if (PageTransCompound(page)) 1405 1412 deferred_split_huge_page(compound_head(page)); 1406 1413 ··· 1412 1425 */ 1413 1426 out: 1414 1427 unlock_page_memcg(page); 1428 + 1429 + munlock_vma_page(page, vma, compound); 1415 1430 } 1416 1431 1417 1432 /* 1418 1433 * @arg: enum ttu_flags will be passed to this argument 1419 1434 */ 1420 - static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1435 + static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, 1421 1436 unsigned long address, void *arg) 1422 1437 { 1423 1438 struct mm_struct *mm = vma->vm_mm; 1424 - struct page_vma_mapped_walk pvmw = { 1425 - .page = page, 1426 - .vma = vma, 1427 - .address = address, 1428 - }; 1439 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 1429 1440 pte_t 
pteval; 1430 1441 struct page *subpage; 1431 1442 bool ret = true; ··· 1440 1455 pvmw.flags = PVMW_SYNC; 1441 1456 1442 1457 if (flags & TTU_SPLIT_HUGE_PMD) 1443 - split_huge_pmd_address(vma, address, false, page); 1458 + split_huge_pmd_address(vma, address, false, folio); 1444 1459 1445 1460 /* 1446 1461 * For THP, we have to assume the worse case ie pmd for invalidation. 1447 1462 * For hugetlb, it could be much worse if we need to do pud 1448 1463 * invalidation in the case of pmd sharing. 1449 1464 * 1450 - * Note that the page can not be free in this function as call of 1451 - * try_to_unmap() must hold a reference on the page. 1465 + * Note that the folio can not be freed in this function as call of 1466 + * try_to_unmap() must hold a reference on the folio. 1452 1467 */ 1453 - range.end = PageKsm(page) ? 1454 - address + PAGE_SIZE : vma_address_end(page, vma); 1468 + range.end = vma_address_end(&pvmw); 1455 1469 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1456 1470 address, range.end); 1457 - if (PageHuge(page)) { 1471 + if (folio_test_hugetlb(folio)) { 1458 1472 /* 1459 1473 * If sharing is possible, start and end will be adjusted 1460 1474 * accordingly. ··· 1464 1480 mmu_notifier_invalidate_range_start(&range); 1465 1481 1466 1482 while (page_vma_mapped_walk(&pvmw)) { 1483 + /* Unexpected PMD-mapped THP? */ 1484 + VM_BUG_ON_FOLIO(!pvmw.pte, folio); 1485 + 1467 1486 /* 1468 - * If the page is mlock()d, we cannot swap it out. 1487 + * If the folio is in an mlock()d vma, we must not swap it out. 1469 1488 */ 1470 1489 if (!(flags & TTU_IGNORE_MLOCK) && 1471 1490 (vma->vm_flags & VM_LOCKED)) { 1472 - /* 1473 - * PTE-mapped THP are never marked as mlocked: so do 1474 - * not set it on a DoubleMap THP, nor on an Anon THP 1475 - * (which may still be PTE-mapped after DoubleMap was 1476 - * cleared). But stop unmapping even in those cases. 
1477 - */ 1478 - if (!PageTransCompound(page) || (PageHead(page) && 1479 - !PageDoubleMap(page) && !PageAnon(page))) 1480 - mlock_vma_page(page); 1491 + /* Restore the mlock which got missed */ 1492 + mlock_vma_folio(folio, vma, false); 1481 1493 page_vma_mapped_walk_done(&pvmw); 1482 1494 ret = false; 1483 1495 break; 1484 1496 } 1485 1497 1486 - /* Unexpected PMD-mapped THP? */ 1487 - VM_BUG_ON_PAGE(!pvmw.pte, page); 1488 - 1489 - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 1498 + subpage = folio_page(folio, 1499 + pte_pfn(*pvmw.pte) - folio_pfn(folio)); 1490 1500 address = pvmw.address; 1491 1501 1492 - if (PageHuge(page) && !PageAnon(page)) { 1502 + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { 1493 1503 /* 1494 1504 * To call huge_pmd_unshare, i_mmap_rwsem must be 1495 1505 * held in write mode. Caller needs to explicitly ··· 1522 1544 if (should_defer_flush(mm, flags)) { 1523 1545 /* 1524 1546 * We clear the PTE but do not flush so potentially 1525 - * a remote CPU could still be writing to the page. 1547 + * a remote CPU could still be writing to the folio. 1526 1548 * If the entry was previously clean then the 1527 1549 * architecture must guarantee that a clear->dirty 1528 1550 * transition on a cached TLB entry is written through ··· 1535 1557 pteval = ptep_clear_flush(vma, address, pvmw.pte); 1536 1558 } 1537 1559 1538 - /* Move the dirty bit to the page. Now the pte is gone. */ 1560 + /* Set the dirty flag on the folio now the pte is gone. 
*/ 1539 1561 if (pte_dirty(pteval)) 1540 - set_page_dirty(page); 1562 + folio_mark_dirty(folio); 1541 1563 1542 1564 /* Update high watermark before we lower rss */ 1543 1565 update_hiwater_rss(mm); 1544 1566 1545 1567 if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) { 1546 1568 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1547 - if (PageHuge(page)) { 1548 - hugetlb_count_sub(compound_nr(page), mm); 1569 + if (folio_test_hugetlb(folio)) { 1570 + hugetlb_count_sub(folio_nr_pages(folio), mm); 1549 1571 set_huge_swap_pte_at(mm, address, 1550 1572 pvmw.pte, pteval, 1551 1573 vma_mmu_pagesize(vma)); 1552 1574 } else { 1553 - dec_mm_counter(mm, mm_counter(page)); 1575 + dec_mm_counter(mm, mm_counter(&folio->page)); 1554 1576 set_pte_at(mm, address, pvmw.pte, pteval); 1555 1577 } 1556 1578 ··· 1565 1587 * migration) will not expect userfaults on already 1566 1588 * copied pages. 1567 1589 */ 1568 - dec_mm_counter(mm, mm_counter(page)); 1590 + dec_mm_counter(mm, mm_counter(&folio->page)); 1569 1591 /* We have to invalidate as we cleared the pte */ 1570 1592 mmu_notifier_invalidate_range(mm, address, 1571 1593 address + PAGE_SIZE); 1572 - } else if (PageAnon(page)) { 1594 + } else if (folio_test_anon(folio)) { 1573 1595 swp_entry_t entry = { .val = page_private(subpage) }; 1574 1596 pte_t swp_pte; 1575 1597 /* 1576 1598 * Store the swap location in the pte. 1577 1599 * See handle_pte_fault() ... 
1578 1600 */ 1579 - if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { 1601 + if (unlikely(folio_test_swapbacked(folio) != 1602 + folio_test_swapcache(folio))) { 1580 1603 WARN_ON_ONCE(1); 1581 1604 ret = false; 1582 1605 /* We have to invalidate as we cleared the pte */ ··· 1588 1609 } 1589 1610 1590 1611 /* MADV_FREE page check */ 1591 - if (!PageSwapBacked(page)) { 1592 - if (!PageDirty(page)) { 1612 + if (!folio_test_swapbacked(folio)) { 1613 + if (!folio_test_dirty(folio)) { 1593 1614 /* Invalidate as we cleared the pte */ 1594 1615 mmu_notifier_invalidate_range(mm, 1595 1616 address, address + PAGE_SIZE); ··· 1598 1619 } 1599 1620 1600 1621 /* 1601 - * If the page was redirtied, it cannot be 1622 + * If the folio was redirtied, it cannot be 1602 1623 * discarded. Remap the page to page table. 1603 1624 */ 1604 1625 set_pte_at(mm, address, pvmw.pte, pteval); 1605 - SetPageSwapBacked(page); 1626 + folio_set_swapbacked(folio); 1606 1627 ret = false; 1607 1628 page_vma_mapped_walk_done(&pvmw); 1608 1629 break; ··· 1639 1660 address + PAGE_SIZE); 1640 1661 } else { 1641 1662 /* 1642 - * This is a locked file-backed page, thus it cannot 1643 - * be removed from the page cache and replaced by a new 1644 - * page before mmu_notifier_invalidate_range_end, so no 1645 - * concurrent thread might update its page table to 1646 - * point at new page while a device still is using this 1647 - * page. 1663 + * This is a locked file-backed folio, 1664 + * so it cannot be removed from the page 1665 + * cache and replaced by a new folio before 1666 + * mmu_notifier_invalidate_range_end, so no 1667 + * concurrent thread might update its page table 1668 + * to point at a new folio while a device is 1669 + * still using this folio. 
1648 1670 * 1649 1671 * See Documentation/vm/mmu_notifier.rst 1650 1672 */ 1651 - dec_mm_counter(mm, mm_counter_file(page)); 1673 + dec_mm_counter(mm, mm_counter_file(&folio->page)); 1652 1674 } 1653 1675 discard: 1654 1676 /* ··· 1659 1679 * 1660 1680 * See Documentation/vm/mmu_notifier.rst 1661 1681 */ 1662 - page_remove_rmap(subpage, PageHuge(page)); 1663 - put_page(page); 1682 + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); 1683 + if (vma->vm_flags & VM_LOCKED) 1684 + mlock_page_drain(smp_processor_id()); 1685 + folio_put(folio); 1664 1686 } 1665 1687 1666 1688 mmu_notifier_invalidate_range_end(&range); ··· 1675 1693 return vma_is_temporary_stack(vma); 1676 1694 } 1677 1695 1678 - static int page_not_mapped(struct page *page) 1696 + static int page_not_mapped(struct folio *folio) 1679 1697 { 1680 - return !page_mapped(page); 1698 + return !folio_mapped(folio); 1681 1699 } 1682 1700 1683 1701 /** 1684 - * try_to_unmap - try to remove all page table mappings to a page 1685 - * @page: the page to get unmapped 1702 + * try_to_unmap - Try to remove all page table mappings to a folio. 1703 + * @folio: The folio to unmap. 1686 1704 * @flags: action and flags 1687 1705 * 1688 1706 * Tries to remove all the page table entries which are mapping this 1689 - * page, used in the pageout path. Caller must hold the page lock. 1707 + * folio. It is the caller's responsibility to check if the folio is 1708 + * still mapped if needed (use TTU_SYNC to prevent accounting races). 1690 1709 * 1691 - * It is the caller's responsibility to check if the page is still 1692 - * mapped when needed (use TTU_SYNC to prevent accounting races). 1710 + * Context: Caller must hold the folio lock. 
1693 1711 */ 1694 - void try_to_unmap(struct page *page, enum ttu_flags flags) 1712 + void try_to_unmap(struct folio *folio, enum ttu_flags flags) 1695 1713 { 1696 1714 struct rmap_walk_control rwc = { 1697 1715 .rmap_one = try_to_unmap_one, 1698 1716 .arg = (void *)flags, 1699 1717 .done = page_not_mapped, 1700 - .anon_lock = page_lock_anon_vma_read, 1718 + .anon_lock = folio_lock_anon_vma_read, 1701 1719 }; 1702 1720 1703 1721 if (flags & TTU_RMAP_LOCKED) 1704 - rmap_walk_locked(page, &rwc); 1722 + rmap_walk_locked(folio, &rwc); 1705 1723 else 1706 - rmap_walk(page, &rwc); 1724 + rmap_walk(folio, &rwc); 1707 1725 } 1708 1726 1709 1727 /* ··· 1712 1730 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs 1713 1731 * containing migration entries. 1714 1732 */ 1715 - static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, 1733 + static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, 1716 1734 unsigned long address, void *arg) 1717 1735 { 1718 1736 struct mm_struct *mm = vma->vm_mm; 1719 - struct page_vma_mapped_walk pvmw = { 1720 - .page = page, 1721 - .vma = vma, 1722 - .address = address, 1723 - }; 1737 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 1724 1738 pte_t pteval; 1725 1739 struct page *subpage; 1726 1740 bool ret = true; ··· 1737 1759 * TTU_SPLIT_HUGE_PMD and it wants to freeze. 1738 1760 */ 1739 1761 if (flags & TTU_SPLIT_HUGE_PMD) 1740 - split_huge_pmd_address(vma, address, true, page); 1762 + split_huge_pmd_address(vma, address, true, folio); 1741 1763 1742 1764 /* 1743 1765 * For THP, we have to assume the worse case ie pmd for invalidation. ··· 1747 1769 * Note that the page can not be free in this function as call of 1748 1770 * try_to_unmap() must hold a reference on the page. 1749 1771 */ 1750 - range.end = PageKsm(page) ? 
1751 - address + PAGE_SIZE : vma_address_end(page, vma); 1772 + range.end = vma_address_end(&pvmw); 1752 1773 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1753 1774 address, range.end); 1754 - if (PageHuge(page)) { 1775 + if (folio_test_hugetlb(folio)) { 1755 1776 /* 1756 1777 * If sharing is possible, start and end will be adjusted 1757 1778 * accordingly. ··· 1764 1787 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1765 1788 /* PMD-mapped THP migration entry */ 1766 1789 if (!pvmw.pte) { 1767 - VM_BUG_ON_PAGE(PageHuge(page) || 1768 - !PageTransCompound(page), page); 1790 + subpage = folio_page(folio, 1791 + pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); 1792 + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || 1793 + !folio_test_pmd_mappable(folio), folio); 1769 1794 1770 - set_pmd_migration_entry(&pvmw, page); 1795 + set_pmd_migration_entry(&pvmw, subpage); 1771 1796 continue; 1772 1797 } 1773 1798 #endif 1774 1799 1775 1800 /* Unexpected PMD-mapped THP? */ 1776 - VM_BUG_ON_PAGE(!pvmw.pte, page); 1801 + VM_BUG_ON_FOLIO(!pvmw.pte, folio); 1777 1802 1778 - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 1803 + subpage = folio_page(folio, 1804 + pte_pfn(*pvmw.pte) - folio_pfn(folio)); 1779 1805 address = pvmw.address; 1780 1806 1781 - if (PageHuge(page) && !PageAnon(page)) { 1807 + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { 1782 1808 /* 1783 1809 * To call huge_pmd_unshare, i_mmap_rwsem must be 1784 1810 * held in write mode. Caller needs to explicitly ··· 1819 1839 flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); 1820 1840 pteval = ptep_clear_flush(vma, address, pvmw.pte); 1821 1841 1822 - /* Move the dirty bit to the page. Now the pte is gone. */ 1842 + /* Set the dirty flag on the folio now the pte is gone. 
*/ 1823 1843 if (pte_dirty(pteval)) 1824 - set_page_dirty(page); 1844 + folio_mark_dirty(folio); 1825 1845 1826 1846 /* Update high watermark before we lower rss */ 1827 1847 update_hiwater_rss(mm); 1828 1848 1829 - if (is_zone_device_page(page)) { 1830 - unsigned long pfn = page_to_pfn(page); 1849 + if (folio_is_zone_device(folio)) { 1850 + unsigned long pfn = folio_pfn(folio); 1831 1851 swp_entry_t entry; 1832 1852 pte_t swp_pte; 1833 1853 ··· 1863 1883 * changed when hugepage migrations to device private 1864 1884 * memory are supported. 1865 1885 */ 1866 - subpage = page; 1886 + subpage = &folio->page; 1867 1887 } else if (PageHWPoison(subpage)) { 1868 1888 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1869 - if (PageHuge(page)) { 1870 - hugetlb_count_sub(compound_nr(page), mm); 1889 + if (folio_test_hugetlb(folio)) { 1890 + hugetlb_count_sub(folio_nr_pages(folio), mm); 1871 1891 set_huge_swap_pte_at(mm, address, 1872 1892 pvmw.pte, pteval, 1873 1893 vma_mmu_pagesize(vma)); 1874 1894 } else { 1875 - dec_mm_counter(mm, mm_counter(page)); 1895 + dec_mm_counter(mm, mm_counter(&folio->page)); 1876 1896 set_pte_at(mm, address, pvmw.pte, pteval); 1877 1897 } 1878 1898 ··· 1887 1907 * migration) will not expect userfaults on already 1888 1908 * copied pages. 
1889 1909 */ 1890 - dec_mm_counter(mm, mm_counter(page)); 1910 + dec_mm_counter(mm, mm_counter(&folio->page)); 1891 1911 /* We have to invalidate as we cleared the pte */ 1892 1912 mmu_notifier_invalidate_range(mm, address, 1893 1913 address + PAGE_SIZE); ··· 1933 1953 * 1934 1954 * See Documentation/vm/mmu_notifier.rst 1935 1955 */ 1936 - page_remove_rmap(subpage, PageHuge(page)); 1937 - put_page(page); 1956 + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); 1957 + if (vma->vm_flags & VM_LOCKED) 1958 + mlock_page_drain(smp_processor_id()); 1959 + folio_put(folio); 1938 1960 } 1939 1961 1940 1962 mmu_notifier_invalidate_range_end(&range); ··· 1946 1964 1947 1965 /** 1948 1966 * try_to_migrate - try to replace all page table mappings with swap entries 1949 - * @page: the page to replace page table entries for 1967 + * @folio: the folio to replace page table entries for 1950 1968 * @flags: action and flags 1951 1969 * 1952 - * Tries to remove all the page table entries which are mapping this page and 1953 - * replace them with special swap entries. Caller must hold the page lock. 1970 + * Tries to remove all the page table entries which are mapping this folio and 1971 + * replace them with special swap entries. Caller must hold the folio lock. 
1954 1972 */ 1955 - void try_to_migrate(struct page *page, enum ttu_flags flags) 1973 + void try_to_migrate(struct folio *folio, enum ttu_flags flags) 1956 1974 { 1957 1975 struct rmap_walk_control rwc = { 1958 1976 .rmap_one = try_to_migrate_one, 1959 1977 .arg = (void *)flags, 1960 1978 .done = page_not_mapped, 1961 - .anon_lock = page_lock_anon_vma_read, 1979 + .anon_lock = folio_lock_anon_vma_read, 1962 1980 }; 1963 1981 1964 1982 /* ··· 1969 1987 TTU_SYNC))) 1970 1988 return; 1971 1989 1972 - if (is_zone_device_page(page) && !is_device_private_page(page)) 1990 + if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) 1973 1991 return; 1974 1992 1975 1993 /* ··· 1980 1998 * locking requirements of exec(), migration skips 1981 1999 * temporary VMAs until after exec() completes. 1982 2000 */ 1983 - if (!PageKsm(page) && PageAnon(page)) 2001 + if (!folio_test_ksm(folio) && folio_test_anon(folio)) 1984 2002 rwc.invalid_vma = invalid_migration_vma; 1985 2003 1986 2004 if (flags & TTU_RMAP_LOCKED) 1987 - rmap_walk_locked(page, &rwc); 2005 + rmap_walk_locked(folio, &rwc); 1988 2006 else 1989 - rmap_walk(page, &rwc); 1990 - } 1991 - 1992 - /* 1993 - * Walks the vma's mapping a page and mlocks the page if any locked vma's are 1994 - * found. Once one is found the page is locked and the scan can be terminated. 1995 - */ 1996 - static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, 1997 - unsigned long address, void *unused) 1998 - { 1999 - struct page_vma_mapped_walk pvmw = { 2000 - .page = page, 2001 - .vma = vma, 2002 - .address = address, 2003 - }; 2004 - 2005 - /* An un-locked vma doesn't have any pages to lock, continue the scan */ 2006 - if (!(vma->vm_flags & VM_LOCKED)) 2007 - return true; 2008 - 2009 - while (page_vma_mapped_walk(&pvmw)) { 2010 - /* 2011 - * Need to recheck under the ptl to serialise with 2012 - * __munlock_pagevec_fill() after VM_LOCKED is cleared in 2013 - * munlock_vma_pages_range(). 
2014 - */ 2015 - if (vma->vm_flags & VM_LOCKED) { 2016 - /* 2017 - * PTE-mapped THP are never marked as mlocked; but 2018 - * this function is never called on a DoubleMap THP, 2019 - * nor on an Anon THP (which may still be PTE-mapped 2020 - * after DoubleMap was cleared). 2021 - */ 2022 - mlock_vma_page(page); 2023 - /* 2024 - * No need to scan further once the page is marked 2025 - * as mlocked. 2026 - */ 2027 - page_vma_mapped_walk_done(&pvmw); 2028 - return false; 2029 - } 2030 - } 2031 - 2032 - return true; 2033 - } 2034 - 2035 - /** 2036 - * page_mlock - try to mlock a page 2037 - * @page: the page to be mlocked 2038 - * 2039 - * Called from munlock code. Checks all of the VMAs mapping the page and mlocks 2040 - * the page if any are found. The page will be returned with PG_mlocked cleared 2041 - * if it is not mapped by any locked vmas. 2042 - */ 2043 - void page_mlock(struct page *page) 2044 - { 2045 - struct rmap_walk_control rwc = { 2046 - .rmap_one = page_mlock_one, 2047 - .done = page_not_mapped, 2048 - .anon_lock = page_lock_anon_vma_read, 2049 - 2050 - }; 2051 - 2052 - VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); 2053 - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); 2054 - 2055 - /* Anon THP are only marked as mlocked when singly mapped */ 2056 - if (PageTransCompound(page) && PageAnon(page)) 2057 - return; 2058 - 2059 - rmap_walk(page, &rwc); 2007 + rmap_walk(folio, &rwc); 2060 2008 } 2061 2009 2062 2010 #ifdef CONFIG_DEVICE_PRIVATE ··· 1997 2085 bool valid; 1998 2086 }; 1999 2087 2000 - static bool page_make_device_exclusive_one(struct page *page, 2088 + static bool page_make_device_exclusive_one(struct folio *folio, 2001 2089 struct vm_area_struct *vma, unsigned long address, void *priv) 2002 2090 { 2003 2091 struct mm_struct *mm = vma->vm_mm; 2004 - struct page_vma_mapped_walk pvmw = { 2005 - .page = page, 2006 - .vma = vma, 2007 - .address = address, 2008 - }; 2092 + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 
0); 2009 2093 struct make_exclusive_args *args = priv; 2010 2094 pte_t pteval; 2011 2095 struct page *subpage; ··· 2012 2104 2013 2105 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, 2014 2106 vma->vm_mm, address, min(vma->vm_end, 2015 - address + page_size(page)), args->owner); 2107 + address + folio_size(folio)), 2108 + args->owner); 2016 2109 mmu_notifier_invalidate_range_start(&range); 2017 2110 2018 2111 while (page_vma_mapped_walk(&pvmw)) { 2019 2112 /* Unexpected PMD-mapped THP? */ 2020 - VM_BUG_ON_PAGE(!pvmw.pte, page); 2113 + VM_BUG_ON_FOLIO(!pvmw.pte, folio); 2021 2114 2022 2115 if (!pte_present(*pvmw.pte)) { 2023 2116 ret = false; ··· 2026 2117 break; 2027 2118 } 2028 2119 2029 - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 2120 + subpage = folio_page(folio, 2121 + pte_pfn(*pvmw.pte) - folio_pfn(folio)); 2030 2122 address = pvmw.address; 2031 2123 2032 2124 /* Nuke the page table entry. */ 2033 2125 flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); 2034 2126 pteval = ptep_clear_flush(vma, address, pvmw.pte); 2035 2127 2036 - /* Move the dirty bit to the page. Now the pte is gone. */ 2128 + /* Set the dirty flag on the folio now the pte is gone. */ 2037 2129 if (pte_dirty(pteval)) 2038 - set_page_dirty(page); 2130 + folio_mark_dirty(folio); 2039 2131 2040 2132 /* 2041 2133 * Check that our target page is still mapped at the expected ··· 2069 2159 * There is a reference on the page for the swap entry which has 2070 2160 * been removed, so shouldn't take another. 
2071 2161 */ 2072 - page_remove_rmap(subpage, false); 2162 + page_remove_rmap(subpage, vma, false); 2073 2163 } 2074 2164 2075 2165 mmu_notifier_invalidate_range_end(&range); ··· 2078 2168 } 2079 2169 2080 2170 /** 2081 - * page_make_device_exclusive - mark the page exclusively owned by a device 2082 - * @page: the page to replace page table entries for 2083 - * @mm: the mm_struct where the page is expected to be mapped 2084 - * @address: address where the page is expected to be mapped 2171 + * folio_make_device_exclusive - Mark the folio exclusively owned by a device. 2172 + * @folio: The folio to replace page table entries for. 2173 + * @mm: The mm_struct where the folio is expected to be mapped. 2174 + * @address: Address where the folio is expected to be mapped. 2085 2175 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks 2086 2176 * 2087 - * Tries to remove all the page table entries which are mapping this page and 2088 - * replace them with special device exclusive swap entries to grant a device 2089 - * exclusive access to the page. Caller must hold the page lock. 2177 + * Tries to remove all the page table entries which are mapping this 2178 + * folio and replace them with special device exclusive swap entries to 2179 + * grant a device exclusive access to the folio. 2090 2180 * 2091 - * Returns false if the page is still mapped, or if it could not be unmapped 2181 + * Context: Caller must hold the folio lock. 2182 + * Return: false if the page is still mapped, or if it could not be unmapped 2092 2183 * from the expected address. Otherwise returns true (success). 
2093 2184 */ 2094 - static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm, 2095 - unsigned long address, void *owner) 2185 + static bool folio_make_device_exclusive(struct folio *folio, 2186 + struct mm_struct *mm, unsigned long address, void *owner) 2096 2187 { 2097 2188 struct make_exclusive_args args = { 2098 2189 .mm = mm, ··· 2104 2193 struct rmap_walk_control rwc = { 2105 2194 .rmap_one = page_make_device_exclusive_one, 2106 2195 .done = page_not_mapped, 2107 - .anon_lock = page_lock_anon_vma_read, 2196 + .anon_lock = folio_lock_anon_vma_read, 2108 2197 .arg = &args, 2109 2198 }; 2110 2199 2111 2200 /* 2112 - * Restrict to anonymous pages for now to avoid potential writeback 2113 - * issues. Also tail pages shouldn't be passed to rmap_walk so skip 2114 - * those. 2201 + * Restrict to anonymous folios for now to avoid potential writeback 2202 + * issues. 2115 2203 */ 2116 - if (!PageAnon(page) || PageTail(page)) 2204 + if (!folio_test_anon(folio)) 2117 2205 return false; 2118 2206 2119 - rmap_walk(page, &rwc); 2207 + rmap_walk(folio, &rwc); 2120 2208 2121 - return args.valid && !page_mapcount(page); 2209 + return args.valid && !folio_mapcount(folio); 2122 2210 } 2123 2211 2124 2212 /** ··· 2155 2245 return npages; 2156 2246 2157 2247 for (i = 0; i < npages; i++, start += PAGE_SIZE) { 2158 - if (!trylock_page(pages[i])) { 2159 - put_page(pages[i]); 2248 + struct folio *folio = page_folio(pages[i]); 2249 + if (PageTail(pages[i]) || !folio_trylock(folio)) { 2250 + folio_put(folio); 2160 2251 pages[i] = NULL; 2161 2252 continue; 2162 2253 } 2163 2254 2164 - if (!page_make_device_exclusive(pages[i], mm, start, owner)) { 2165 - unlock_page(pages[i]); 2166 - put_page(pages[i]); 2255 + if (!folio_make_device_exclusive(folio, mm, start, owner)) { 2256 + folio_unlock(folio); 2257 + folio_put(folio); 2167 2258 pages[i] = NULL; 2168 2259 } 2169 2260 } ··· 2183 2272 anon_vma_free(root); 2184 2273 } 2185 2274 2186 - static struct anon_vma 
*rmap_walk_anon_lock(struct page *page, 2187 - struct rmap_walk_control *rwc) 2275 + static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, 2276 + const struct rmap_walk_control *rwc) 2188 2277 { 2189 2278 struct anon_vma *anon_vma; 2190 2279 2191 2280 if (rwc->anon_lock) 2192 - return rwc->anon_lock(page); 2281 + return rwc->anon_lock(folio); 2193 2282 2194 2283 /* 2195 - * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 2284 + * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() 2196 2285 * because that depends on page_mapped(); but not all its usages 2197 2286 * are holding mmap_lock. Users without mmap_lock are required to 2198 2287 * take a reference count to prevent the anon_vma disappearing 2199 2288 */ 2200 - anon_vma = page_anon_vma(page); 2289 + anon_vma = folio_anon_vma(folio); 2201 2290 if (!anon_vma) 2202 2291 return NULL; 2203 2292 ··· 2213 2302 * 2214 2303 * Find all the mappings of a page using the mapping pointer and the vma chains 2215 2304 * contained in the anon_vma struct it points to. 2216 - * 2217 - * When called from page_mlock(), the mmap_lock of the mm containing the vma 2218 - * where the page was found will be held for write. So, we won't recheck 2219 - * vm_flags for that VMA. That should be OK, because that vma shouldn't be 2220 - * LOCKED. 2221 2305 */ 2222 - static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, 2223 - bool locked) 2306 + static void rmap_walk_anon(struct folio *folio, 2307 + const struct rmap_walk_control *rwc, bool locked) 2224 2308 { 2225 2309 struct anon_vma *anon_vma; 2226 2310 pgoff_t pgoff_start, pgoff_end; 2227 2311 struct anon_vma_chain *avc; 2228 2312 2229 2313 if (locked) { 2230 - anon_vma = page_anon_vma(page); 2314 + anon_vma = folio_anon_vma(folio); 2231 2315 /* anon_vma disappear under us? 
*/ 2232 - VM_BUG_ON_PAGE(!anon_vma, page); 2316 + VM_BUG_ON_FOLIO(!anon_vma, folio); 2233 2317 } else { 2234 - anon_vma = rmap_walk_anon_lock(page, rwc); 2318 + anon_vma = rmap_walk_anon_lock(folio, rwc); 2235 2319 } 2236 2320 if (!anon_vma) 2237 2321 return; 2238 2322 2239 - pgoff_start = page_to_pgoff(page); 2240 - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; 2323 + pgoff_start = folio_pgoff(folio); 2324 + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; 2241 2325 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, 2242 2326 pgoff_start, pgoff_end) { 2243 2327 struct vm_area_struct *vma = avc->vma; 2244 - unsigned long address = vma_address(page, vma); 2328 + unsigned long address = vma_address(&folio->page, vma); 2245 2329 2246 2330 VM_BUG_ON_VMA(address == -EFAULT, vma); 2247 2331 cond_resched(); ··· 2244 2338 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 2245 2339 continue; 2246 2340 2247 - if (!rwc->rmap_one(page, vma, address, rwc->arg)) 2341 + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) 2248 2342 break; 2249 - if (rwc->done && rwc->done(page)) 2343 + if (rwc->done && rwc->done(folio)) 2250 2344 break; 2251 2345 } 2252 2346 ··· 2261 2355 * 2262 2356 * Find all the mappings of a page using the mapping pointer and the vma chains 2263 2357 * contained in the address_space struct it points to. 2264 - * 2265 - * When called from page_mlock(), the mmap_lock of the mm containing the vma 2266 - * where the page was found will be held for write. So, we won't recheck 2267 - * vm_flags for that VMA. That should be OK, because that vma shouldn't be 2268 - * LOCKED. 
2269 2358 */ 2270 - static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, 2271 - bool locked) 2359 + static void rmap_walk_file(struct folio *folio, 2360 + const struct rmap_walk_control *rwc, bool locked) 2272 2361 { 2273 - struct address_space *mapping = page_mapping(page); 2362 + struct address_space *mapping = folio_mapping(folio); 2274 2363 pgoff_t pgoff_start, pgoff_end; 2275 2364 struct vm_area_struct *vma; 2276 2365 ··· 2275 2374 * structure at mapping cannot be freed and reused yet, 2276 2375 * so we can safely take mapping->i_mmap_rwsem. 2277 2376 */ 2278 - VM_BUG_ON_PAGE(!PageLocked(page), page); 2377 + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 2279 2378 2280 2379 if (!mapping) 2281 2380 return; 2282 2381 2283 - pgoff_start = page_to_pgoff(page); 2284 - pgoff_end = pgoff_start + thp_nr_pages(page) - 1; 2382 + pgoff_start = folio_pgoff(folio); 2383 + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; 2285 2384 if (!locked) 2286 2385 i_mmap_lock_read(mapping); 2287 2386 vma_interval_tree_foreach(vma, &mapping->i_mmap, 2288 2387 pgoff_start, pgoff_end) { 2289 - unsigned long address = vma_address(page, vma); 2388 + unsigned long address = vma_address(&folio->page, vma); 2290 2389 2291 2390 VM_BUG_ON_VMA(address == -EFAULT, vma); 2292 2391 cond_resched(); ··· 2294 2393 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 2295 2394 continue; 2296 2395 2297 - if (!rwc->rmap_one(page, vma, address, rwc->arg)) 2396 + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) 2298 2397 goto done; 2299 - if (rwc->done && rwc->done(page)) 2398 + if (rwc->done && rwc->done(folio)) 2300 2399 goto done; 2301 2400 } 2302 2401 ··· 2305 2404 i_mmap_unlock_read(mapping); 2306 2405 } 2307 2406 2308 - void rmap_walk(struct page *page, struct rmap_walk_control *rwc) 2407 + void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc) 2309 2408 { 2310 - if (unlikely(PageKsm(page))) 2311 - rmap_walk_ksm(page, rwc); 2312 - else if 
(PageAnon(page)) 2313 - rmap_walk_anon(page, rwc, false); 2409 + if (unlikely(folio_test_ksm(folio))) 2410 + rmap_walk_ksm(folio, rwc); 2411 + else if (folio_test_anon(folio)) 2412 + rmap_walk_anon(folio, rwc, false); 2314 2413 else 2315 - rmap_walk_file(page, rwc, false); 2414 + rmap_walk_file(folio, rwc, false); 2316 2415 } 2317 2416 2318 2417 /* Like rmap_walk, but caller holds relevant rmap lock */ 2319 - void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) 2418 + void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc) 2320 2419 { 2321 2420 /* no ksm support for now */ 2322 - VM_BUG_ON_PAGE(PageKsm(page), page); 2323 - if (PageAnon(page)) 2324 - rmap_walk_anon(page, rwc, true); 2421 + VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); 2422 + if (folio_test_anon(folio)) 2423 + rmap_walk_anon(folio, rwc, true); 2325 2424 else 2326 - rmap_walk_file(page, rwc, true); 2425 + rmap_walk_file(folio, rwc, true); 2327 2426 } 2328 2427 2329 2428 #ifdef CONFIG_HUGETLB_PAGE ··· 2351 2450 { 2352 2451 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 2353 2452 atomic_set(compound_mapcount_ptr(page), 0); 2354 - if (hpage_pincount_available(page)) 2355 - atomic_set(compound_pincount_ptr(page), 0); 2453 + atomic_set(compound_pincount_ptr(page), 0); 2356 2454 2357 2455 __page_set_anon_rmap(page, vma, address, 1); 2358 2456 }

+70 -107

mm/swap.c

··· 74 74 }; 75 75 76 76 /* 77 - * This path almost never happens for VM activity - pages are normally 78 - * freed via pagevecs. But it gets used by networking. 77 + * This path almost never happens for VM activity - pages are normally freed 78 + * via pagevecs. But it gets used by networking - and for compound pages. 79 79 */ 80 80 static void __page_cache_release(struct page *page) 81 81 { ··· 88 88 del_page_from_lru_list(page, lruvec); 89 89 __clear_page_lru_flags(page); 90 90 unlock_page_lruvec_irqrestore(lruvec, flags); 91 + } 92 + /* See comment on PageMlocked in release_pages() */ 93 + if (unlikely(PageMlocked(page))) { 94 + int nr_pages = thp_nr_pages(page); 95 + 96 + __ClearPageMlocked(page); 97 + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 98 + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); 91 99 } 92 100 __ClearPageWaiters(page); 93 101 } ··· 122 114 123 115 void __put_page(struct page *page) 124 116 { 125 - if (is_zone_device_page(page)) { 126 - put_dev_pagemap(page->pgmap); 127 - 128 - /* 129 - * The page belongs to the device that created pgmap. Do 130 - * not return it to page allocator. 
131 - */ 132 - return; 133 - } 134 - 135 - if (unlikely(PageCompound(page))) 117 + if (unlikely(is_zone_device_page(page))) 118 + free_zone_device_page(page); 119 + else if (unlikely(PageCompound(page))) 136 120 __put_compound_page(page); 137 121 else 138 122 __put_single_page(page); ··· 482 482 void lru_cache_add_inactive_or_unevictable(struct page *page, 483 483 struct vm_area_struct *vma) 484 484 { 485 - bool unevictable; 486 - 487 485 VM_BUG_ON_PAGE(PageLRU(page), page); 488 486 489 - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; 490 - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { 491 - int nr_pages = thp_nr_pages(page); 492 - /* 493 - * We use the irq-unsafe __mod_zone_page_state because this 494 - * counter is not modified from interrupt context, and the pte 495 - * lock is held(spinlock), which implies preemption disabled. 496 - */ 497 - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); 498 - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); 499 - } 500 - lru_cache_add(page); 487 + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) 488 + mlock_new_page(page); 489 + else 490 + lru_cache_add(page); 501 491 } 502 492 503 493 /* ··· 626 636 pagevec_lru_move_fn(pvec, lru_lazyfree_fn); 627 637 628 638 activate_page_drain(cpu); 639 + mlock_page_drain(cpu); 629 640 } 630 641 631 642 /** 632 - * deactivate_file_page - forcefully deactivate a file page 633 - * @page: page to deactivate 643 + * deactivate_file_folio() - Forcefully deactivate a file folio. 644 + * @folio: Folio to deactivate. 634 645 * 635 - * This function hints the VM that @page is a good reclaim candidate, 636 - * for example if its invalidation fails due to the page being dirty 646 + * This function hints to the VM that @folio is a good reclaim candidate, 647 + * for example if its invalidation fails due to the folio being dirty 637 648 * or under writeback. 649 + * 650 + * Context: Caller holds a reference on the page. 
638 651 */ 639 - void deactivate_file_page(struct page *page) 652 + void deactivate_file_folio(struct folio *folio) 640 653 { 654 + struct pagevec *pvec; 655 + 641 656 /* 642 - * In a workload with many unevictable page such as mprotect, 643 - * unevictable page deactivation for accelerating reclaim is pointless. 657 + * In a workload with many unevictable pages such as mprotect, 658 + * unevictable folio deactivation for accelerating reclaim is pointless. 644 659 */ 645 - if (PageUnevictable(page)) 660 + if (folio_test_unevictable(folio)) 646 661 return; 647 662 648 - if (likely(get_page_unless_zero(page))) { 649 - struct pagevec *pvec; 663 + folio_get(folio); 664 + local_lock(&lru_pvecs.lock); 665 + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); 650 666 651 - local_lock(&lru_pvecs.lock); 652 - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); 653 - 654 - if (pagevec_add_and_need_flush(pvec, page)) 655 - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); 656 - local_unlock(&lru_pvecs.lock); 657 - } 667 + if (pagevec_add_and_need_flush(pvec, &folio->page)) 668 + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); 669 + local_unlock(&lru_pvecs.lock); 658 670 } 659 671 660 672 /* ··· 829 837 pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || 830 838 pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || 831 839 need_activate_page_drain(cpu) || 840 + need_mlock_page_drain(cpu) || 832 841 has_bh_in_lru(cpu, NULL)) { 833 842 INIT_WORK(work, lru_add_drain_per_cpu); 834 843 queue_work_on(cpu, mm_percpu_wq, work); ··· 928 935 unlock_page_lruvec_irqrestore(lruvec, flags); 929 936 lruvec = NULL; 930 937 } 931 - /* 932 - * ZONE_DEVICE pages that return 'false' from 933 - * page_is_devmap_managed() do not require special 934 - * processing, and instead, expect a call to 935 - * put_page_testzero(). 
936 - */ 937 - if (page_is_devmap_managed(page)) { 938 - put_devmap_managed_page(page); 938 + if (put_devmap_managed_page(page)) 939 939 continue; 940 - } 941 940 if (put_page_testzero(page)) 942 - put_dev_pagemap(page->pgmap); 941 + free_zone_device_page(page); 943 942 continue; 944 943 } 945 944 ··· 957 972 958 973 del_page_from_lru_list(page, lruvec); 959 974 __clear_page_lru_flags(page); 975 + } 976 + 977 + /* 978 + * In rare cases, when truncation or holepunching raced with 979 + * munlock after VM_LOCKED was cleared, Mlocked may still be 980 + * found set here. This does not indicate a problem, unless 981 + * "unevictable_pgs_cleared" appears worryingly large. 982 + */ 983 + if (unlikely(PageMlocked(page))) { 984 + __ClearPageMlocked(page); 985 + dec_zone_page_state(page, NR_MLOCK); 986 + count_vm_event(UNEVICTABLE_PGCLEARED); 960 987 } 961 988 962 989 __ClearPageWaiters(page); ··· 1011 1014 1012 1015 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 1013 1016 1014 - /* 1015 - * A folio becomes evictable in two ways: 1016 - * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. 1017 - * 2) Before acquiring LRU lock to put the folio on the correct LRU 1018 - * and then 1019 - * a) do PageLRU check with lock [check_move_unevictable_pages] 1020 - * b) do PageLRU check before lock [clear_page_mlock] 1021 - * 1022 - * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need 1023 - * following strict ordering: 1024 - * 1025 - * #0: __pagevec_lru_add_fn #1: clear_page_mlock 1026 - * 1027 - * folio_set_lru() folio_test_clear_mlocked() 1028 - * smp_mb() // explicit ordering // above provides strict 1029 - * // ordering 1030 - * folio_test_mlocked() folio_test_lru() 1031 - * 1032 - * 1033 - * if '#1' does not observe setting of PG_lru by '#0' and 1034 - * fails isolation, the explicit barrier will make sure that 1035 - * folio_evictable check will put the folio on the correct 1036 - * LRU. 
Without smp_mb(), folio_set_lru() can be reordered 1037 - * after folio_test_mlocked() check and can make '#1' fail the 1038 - * isolation of the folio whose mlocked bit is cleared (#0 is 1039 - * also looking at the same folio) and the evictable folio will 1040 - * be stranded on an unevictable LRU. 1041 - */ 1042 1017 folio_set_lru(folio); 1043 - smp_mb__after_atomic(); 1044 - 1018 + /* 1019 + * Is an smp_mb__after_atomic() still required here, before 1020 + * folio_evictable() tests PageMlocked, to rule out the possibility 1021 + * of stranding an evictable folio on an unevictable LRU? I think 1022 + * not, because __munlock_page() only clears PageMlocked while the LRU 1023 + * lock is held. 1024 + * 1025 + * (That is not true of __page_cache_release(), and not necessarily 1026 + * true of release_pages(): but those only clear PageMlocked after 1027 + * put_page_testzero() has excluded any other users of the page.) 1028 + */ 1045 1029 if (folio_evictable(folio)) { 1046 1030 if (was_unevictable) 1047 1031 __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); 1048 1032 } else { 1049 1033 folio_clear_active(folio); 1050 1034 folio_set_unevictable(folio); 1035 + /* 1036 + * folio->mlock_count = !!folio_test_mlocked(folio)? 1037 + * But that leaves __mlock_page() in doubt whether another 1038 + * actor has already counted the mlock or not. Err on the 1039 + * safe side, underestimate, let page reclaim fix it, rather 1040 + * than leaving a page on the unevictable LRU indefinitely. 
1041 + */ 1042 + folio->mlock_count = 0; 1051 1043 if (!was_unevictable) 1052 1044 __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); 1053 1045 } ··· 1144 1158 * _really_ don't want to cluster much more 1145 1159 */ 1146 1160 } 1147 - 1148 - #ifdef CONFIG_DEV_PAGEMAP_OPS 1149 - void put_devmap_managed_page(struct page *page) 1150 - { 1151 - int count; 1152 - 1153 - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) 1154 - return; 1155 - 1156 - count = page_ref_dec_return(page); 1157 - 1158 - /* 1159 - * devmap page refcounts are 1-based, rather than 0-based: if 1160 - * refcount is 1, then the page is free and the refcount is 1161 - * stable because nobody holds a reference on the page. 1162 - */ 1163 - if (count == 1) 1164 - free_devmap_managed_page(page); 1165 - else if (!count) 1166 - __put_page(page); 1167 - } 1168 - EXPORT_SYMBOL(put_devmap_managed_page); 1169 - #endif

+54 -63

mm/truncate.c

··· 193 193 folio_clear_mappedtodisk(folio); 194 194 } 195 195 196 - /* 197 - * This is for invalidate_mapping_pages(). That function can be called at 198 - * any time, and is not supposed to throw away dirty pages. But pages can 199 - * be marked dirty at any time too, so use remove_mapping which safely 200 - * discards clean, unused pages. 201 - * 202 - * Returns non-zero if the page was successfully invalidated. 203 - */ 204 - static int 205 - invalidate_complete_page(struct address_space *mapping, struct page *page) 206 - { 207 - 208 - if (page->mapping != mapping) 209 - return 0; 210 - 211 - if (page_has_private(page) && !try_to_release_page(page, 0)) 212 - return 0; 213 - 214 - return remove_mapping(mapping, page); 215 - } 216 - 217 196 int truncate_inode_folio(struct address_space *mapping, struct folio *folio) 218 197 { 219 198 if (folio->mapping != mapping) ··· 273 294 } 274 295 EXPORT_SYMBOL(generic_error_remove_page); 275 296 276 - /* 277 - * Safely invalidate one page from its pagecache mapping. 278 - * It only drops clean, unused pages. The page must be locked. 279 - * 280 - * Returns 1 if the page is successfully invalidated, otherwise 0. 281 - */ 282 - int invalidate_inode_page(struct page *page) 297 + static long mapping_evict_folio(struct address_space *mapping, 298 + struct folio *folio) 283 299 { 284 - struct address_space *mapping = page_mapping(page); 300 + if (folio_test_dirty(folio) || folio_test_writeback(folio)) 301 + return 0; 302 + /* The refcount will be elevated if any page in the folio is mapped */ 303 + if (folio_ref_count(folio) > 304 + folio_nr_pages(folio) + folio_has_private(folio) + 1) 305 + return 0; 306 + if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) 307 + return 0; 308 + 309 + return remove_mapping(mapping, folio); 310 + } 311 + 312 + /** 313 + * invalidate_inode_page() - Remove an unused page from the pagecache. 314 + * @page: The page to remove. 
315 + * 316 + * Safely invalidate one page from its pagecache mapping. 317 + * It only drops clean, unused pages. 318 + * 319 + * Context: Page must be locked. 320 + * Return: The number of pages successfully removed. 321 + */ 322 + long invalidate_inode_page(struct page *page) 323 + { 324 + struct folio *folio = page_folio(page); 325 + struct address_space *mapping = folio_mapping(folio); 326 + 327 + /* The page may have been truncated before it was locked */ 285 328 if (!mapping) 286 329 return 0; 287 - if (PageDirty(page) || PageWriteback(page)) 288 - return 0; 289 - if (page_mapped(page)) 290 - return 0; 291 - return invalidate_complete_page(mapping, page); 330 + return mapping_evict_folio(mapping, folio); 292 331 } 293 332 294 333 /** ··· 494 497 } 495 498 EXPORT_SYMBOL(truncate_inode_pages_final); 496 499 497 - static unsigned long __invalidate_mapping_pages(struct address_space *mapping, 500 + /** 501 + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode 502 + * @mapping: the address_space which holds the pages to invalidate 503 + * @start: the offset 'from' which to invalidate 504 + * @end: the offset 'to' which to invalidate (inclusive) 505 + * @nr_pagevec: invalidate failed page number for caller 506 + * 507 + * This helper is similar to invalidate_mapping_pages(), except that it accounts 508 + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which 509 + * will be used by the caller. 
510 + */ 511 + unsigned long invalidate_mapping_pagevec(struct address_space *mapping, 498 512 pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) 499 513 { 500 514 pgoff_t indices[PAGEVEC_SIZE]; ··· 518 510 folio_batch_init(&fbatch); 519 511 while (find_lock_entries(mapping, index, end, &fbatch, indices)) { 520 512 for (i = 0; i < folio_batch_count(&fbatch); i++) { 521 - struct page *page = &fbatch.folios[i]->page; 513 + struct folio *folio = fbatch.folios[i]; 522 514 523 - /* We rely upon deletion not changing page->index */ 515 + /* We rely upon deletion not changing folio->index */ 524 516 index = indices[i]; 525 517 526 - if (xa_is_value(page)) { 518 + if (xa_is_value(folio)) { 527 519 count += invalidate_exceptional_entry(mapping, 528 520 index, 529 - page); 521 + folio); 530 522 continue; 531 523 } 532 - index += thp_nr_pages(page) - 1; 524 + index += folio_nr_pages(folio) - 1; 533 525 534 - ret = invalidate_inode_page(page); 535 - unlock_page(page); 526 + ret = mapping_evict_folio(mapping, folio); 527 + folio_unlock(folio); 536 528 /* 537 - * Invalidation is a hint that the page is no longer 529 + * Invalidation is a hint that the folio is no longer 538 530 * of interest and try to speed up its reclaim. 
539 531 */ 540 532 if (!ret) { 541 - deactivate_file_page(page); 533 + deactivate_file_folio(folio); 542 534 /* It is likely on the pagevec of a remote CPU */ 543 535 if (nr_pagevec) 544 536 (*nr_pagevec)++; ··· 570 562 unsigned long invalidate_mapping_pages(struct address_space *mapping, 571 563 pgoff_t start, pgoff_t end) 572 564 { 573 - return __invalidate_mapping_pages(mapping, start, end, NULL); 565 + return invalidate_mapping_pagevec(mapping, start, end, NULL); 574 566 } 575 567 EXPORT_SYMBOL(invalidate_mapping_pages); 576 568 577 - /** 578 - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode 579 - * @mapping: the address_space which holds the pages to invalidate 580 - * @start: the offset 'from' which to invalidate 581 - * @end: the offset 'to' which to invalidate (inclusive) 582 - * @nr_pagevec: invalidate failed page number for caller 583 - * 584 - * This helper is similar to invalidate_mapping_pages(), except that it accounts 585 - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which 586 - * will be used by the caller. 587 - */ 588 - void invalidate_mapping_pagevec(struct address_space *mapping, 589 - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) 590 - { 591 - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); 592 - } 593 - 594 569 /* 595 - * This is like invalidate_complete_page(), except it ignores the page's 570 + * This is like invalidate_inode_page(), except it ignores the page's 596 571 * refcount. We do this because invalidate_inode_pages2() needs stronger 597 572 * invalidation guarantees, and cannot afford to leave pages behind because 598 573 * shrink_page_list() has a temp ref on them, or because they're transiently

+8 -6

mm/userfaultfd.c

··· 95 95 if (!pte_none(*dst_pte)) 96 96 goto out_unlock; 97 97 98 - if (page_in_cache) 99 - page_add_file_rmap(page, false); 100 - else 98 + if (page_in_cache) { 99 + /* Usually, cache pages are already added to LRU */ 100 + if (newly_allocated) 101 + lru_cache_add(page); 102 + page_add_file_rmap(page, dst_vma, false); 103 + } else { 101 104 page_add_new_anon_rmap(page, dst_vma, dst_addr, false); 105 + lru_cache_add_inactive_or_unevictable(page, dst_vma); 106 + } 102 107 103 108 /* 104 109 * Must happen after rmap, as mm_counter() checks mapping (via 105 110 * PageAnon()), which is set by __page_set_anon_rmap(). 106 111 */ 107 112 inc_mm_counter(dst_mm, mm_counter(page)); 108 - 109 - if (newly_allocated) 110 - lru_cache_add_inactive_or_unevictable(page, dst_vma); 111 113 112 114 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); 113 115

+34 -2

mm/util.c

··· 681 681 } 682 682 EXPORT_SYMBOL(folio_mapped); 683 683 684 - struct anon_vma *page_anon_vma(struct page *page) 684 + struct anon_vma *folio_anon_vma(struct folio *folio) 685 685 { 686 - struct folio *folio = page_folio(page); 687 686 unsigned long mapping = (unsigned long)folio->mapping; 688 687 689 688 if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) ··· 740 741 return ret; 741 742 } 742 743 EXPORT_SYMBOL_GPL(__page_mapcount); 744 + 745 + /** 746 + * folio_mapcount() - Calculate the number of mappings of this folio. 747 + * @folio: The folio. 748 + * 749 + * A large folio tracks both how many times the entire folio is mapped, 750 + * and how many times each individual page in the folio is mapped. 751 + * This function calculates the total number of times the folio is 752 + * mapped. 753 + * 754 + * Return: The number of times this folio is mapped. 755 + */ 756 + int folio_mapcount(struct folio *folio) 757 + { 758 + int i, compound, nr, ret; 759 + 760 + if (likely(!folio_test_large(folio))) 761 + return atomic_read(&folio->_mapcount) + 1; 762 + 763 + compound = folio_entire_mapcount(folio); 764 + nr = folio_nr_pages(folio); 765 + if (folio_test_hugetlb(folio)) 766 + return compound; 767 + ret = compound; 768 + for (i = 0; i < nr; i++) 769 + ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; 770 + /* File pages has compound_mapcount included in _mapcount */ 771 + if (!folio_test_anon(folio)) 772 + return ret - compound * nr; 773 + if (folio_test_double_map(folio)) 774 + ret -= nr; 775 + return ret; 776 + } 743 777 744 778 /** 745 779 * folio_copy - Copy the contents of one folio to another.

+156 -149

mm/vmscan.c

··· 979 979 drop_slab_node(nid); 980 980 } 981 981 982 - static inline int is_page_cache_freeable(struct page *page) 982 + static inline int is_page_cache_freeable(struct folio *folio) 983 983 { 984 984 /* 985 985 * A freeable page cache page is referenced only by the caller 986 986 * that isolated the page, the page cache and optional buffer 987 987 * heads at page->private. 988 988 */ 989 - int page_cache_pins = thp_nr_pages(page); 990 - return page_count(page) - page_has_private(page) == 1 + page_cache_pins; 989 + return folio_ref_count(folio) - folio_test_private(folio) == 990 + 1 + folio_nr_pages(folio); 991 991 } 992 992 993 993 /* 994 - * We detected a synchronous write error writing a page out. Probably 994 + * We detected a synchronous write error writing a folio out. Probably 995 995 * -ENOSPC. We need to propagate that into the address_space for a subsequent 996 996 * fsync(), msync() or close(). 997 997 * 998 998 * The tricky part is that after writepage we cannot touch the mapping: nothing 999 - * prevents it from being freed up. But we have a ref on the page and once 1000 - * that page is locked, the mapping is pinned. 999 + * prevents it from being freed up. But we have a ref on the folio and once 1000 + * that folio is locked, the mapping is pinned. 1001 1001 * 1002 - * We're allowed to run sleeping lock_page() here because we know the caller has 1002 + * We're allowed to run sleeping folio_lock() here because we know the caller has 1003 1003 * __GFP_FS. 
1004 1004 */ 1005 1005 static void handle_write_error(struct address_space *mapping, 1006 - struct page *page, int error) 1006 + struct folio *folio, int error) 1007 1007 { 1008 - lock_page(page); 1009 - if (page_mapping(page) == mapping) 1008 + folio_lock(folio); 1009 + if (folio_mapping(folio) == mapping) 1010 1010 mapping_set_error(mapping, error); 1011 - unlock_page(page); 1011 + folio_unlock(folio); 1012 1012 } 1013 1013 1014 1014 static bool skip_throttle_noprogress(pg_data_t *pgdat) ··· 1155 1155 * pageout is called by shrink_page_list() for each dirty page. 1156 1156 * Calls ->writepage(). 1157 1157 */ 1158 - static pageout_t pageout(struct page *page, struct address_space *mapping) 1158 + static pageout_t pageout(struct folio *folio, struct address_space *mapping) 1159 1159 { 1160 1160 /* 1161 - * If the page is dirty, only perform writeback if that write 1161 + * If the folio is dirty, only perform writeback if that write 1162 1162 * will be non-blocking. To prevent this allocation from being 1163 1163 * stalled by pagecache activity. But note that there may be 1164 1164 * stalls if we need to run get_block(). We could test 1165 1165 * PagePrivate for that. 1166 1166 * 1167 1167 * If this process is currently in __generic_file_write_iter() against 1168 - * this page's queue, we can perform writeback even if that 1168 + * this folio's queue, we can perform writeback even if that 1169 1169 * will block. 1170 1170 * 1171 - * If the page is swapcache, write it back even if that would 1171 + * If the folio is swapcache, write it back even if that would 1172 1172 * block, for some throttling. This happens by accident, because 1173 1173 * swap_backing_dev_info is bust: it doesn't reflect the 1174 1174 * congestion state of the swapdevs. Easy to fix, if needed. 
1175 1175 */ 1176 - if (!is_page_cache_freeable(page)) 1176 + if (!is_page_cache_freeable(folio)) 1177 1177 return PAGE_KEEP; 1178 1178 if (!mapping) { 1179 1179 /* 1180 - * Some data journaling orphaned pages can have 1181 - * page->mapping == NULL while being dirty with clean buffers. 1180 + * Some data journaling orphaned folios can have 1181 + * folio->mapping == NULL while being dirty with clean buffers. 1182 1182 */ 1183 - if (page_has_private(page)) { 1184 - if (try_to_free_buffers(page)) { 1185 - ClearPageDirty(page); 1186 - pr_info("%s: orphaned page\n", __func__); 1183 + if (folio_test_private(folio)) { 1184 + if (try_to_free_buffers(&folio->page)) { 1185 + folio_clear_dirty(folio); 1186 + pr_info("%s: orphaned folio\n", __func__); 1187 1187 return PAGE_CLEAN; 1188 1188 } 1189 1189 } ··· 1192 1192 if (mapping->a_ops->writepage == NULL) 1193 1193 return PAGE_ACTIVATE; 1194 1194 1195 - if (clear_page_dirty_for_io(page)) { 1195 + if (folio_clear_dirty_for_io(folio)) { 1196 1196 int res; 1197 1197 struct writeback_control wbc = { 1198 1198 .sync_mode = WB_SYNC_NONE, ··· 1202 1202 .for_reclaim = 1, 1203 1203 }; 1204 1204 1205 - SetPageReclaim(page); 1206 - res = mapping->a_ops->writepage(page, &wbc); 1205 + folio_set_reclaim(folio); 1206 + res = mapping->a_ops->writepage(&folio->page, &wbc); 1207 1207 if (res < 0) 1208 - handle_write_error(mapping, page, res); 1208 + handle_write_error(mapping, folio, res); 1209 1209 if (res == AOP_WRITEPAGE_ACTIVATE) { 1210 - ClearPageReclaim(page); 1210 + folio_clear_reclaim(folio); 1211 1211 return PAGE_ACTIVATE; 1212 1212 } 1213 1213 1214 - if (!PageWriteback(page)) { 1214 + if (!folio_test_writeback(folio)) { 1215 1215 /* synchronous write or broken a_ops? 
*/ 1216 - ClearPageReclaim(page); 1216 + folio_clear_reclaim(folio); 1217 1217 } 1218 - trace_mm_vmscan_writepage(page); 1219 - inc_node_page_state(page, NR_VMSCAN_WRITE); 1218 + trace_mm_vmscan_write_folio(folio); 1219 + node_stat_add_folio(folio, NR_VMSCAN_WRITE); 1220 1220 return PAGE_SUCCESS; 1221 1221 } 1222 1222 ··· 1227 1227 * Same as remove_mapping, but if the page is removed from the mapping, it 1228 1228 * gets returned with a refcount of 0. 1229 1229 */ 1230 - static int __remove_mapping(struct address_space *mapping, struct page *page, 1230 + static int __remove_mapping(struct address_space *mapping, struct folio *folio, 1231 1231 bool reclaimed, struct mem_cgroup *target_memcg) 1232 1232 { 1233 1233 int refcount; 1234 1234 void *shadow = NULL; 1235 1235 1236 - BUG_ON(!PageLocked(page)); 1237 - BUG_ON(mapping != page_mapping(page)); 1236 + BUG_ON(!folio_test_locked(folio)); 1237 + BUG_ON(mapping != folio_mapping(folio)); 1238 1238 1239 - if (!PageSwapCache(page)) 1239 + if (!folio_test_swapcache(folio)) 1240 1240 spin_lock(&mapping->host->i_lock); 1241 1241 xa_lock_irq(&mapping->i_pages); 1242 1242 /* ··· 1264 1264 * Note that if SetPageDirty is always performed via set_page_dirty, 1265 1265 * and thus under the i_pages lock, then this ordering is not required. 
1266 1266 */ 1267 - refcount = 1 + compound_nr(page); 1268 - if (!page_ref_freeze(page, refcount)) 1267 + refcount = 1 + folio_nr_pages(folio); 1268 + if (!folio_ref_freeze(folio, refcount)) 1269 1269 goto cannot_free; 1270 1270 /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ 1271 - if (unlikely(PageDirty(page))) { 1272 - page_ref_unfreeze(page, refcount); 1271 + if (unlikely(folio_test_dirty(folio))) { 1272 + folio_ref_unfreeze(folio, refcount); 1273 1273 goto cannot_free; 1274 1274 } 1275 1275 1276 - if (PageSwapCache(page)) { 1277 - swp_entry_t swap = { .val = page_private(page) }; 1278 - mem_cgroup_swapout(page, swap); 1276 + if (folio_test_swapcache(folio)) { 1277 + swp_entry_t swap = folio_swap_entry(folio); 1278 + mem_cgroup_swapout(folio, swap); 1279 1279 if (reclaimed && !mapping_exiting(mapping)) 1280 - shadow = workingset_eviction(page, target_memcg); 1281 - __delete_from_swap_cache(page, swap, shadow); 1280 + shadow = workingset_eviction(folio, target_memcg); 1281 + __delete_from_swap_cache(&folio->page, swap, shadow); 1282 1282 xa_unlock_irq(&mapping->i_pages); 1283 - put_swap_page(page, swap); 1283 + put_swap_page(&folio->page, swap); 1284 1284 } else { 1285 1285 void (*freepage)(struct page *); 1286 1286 ··· 1301 1301 * exceptional entries and shadow exceptional entries in the 1302 1302 * same address_space. 
1303 1303 */ 1304 - if (reclaimed && page_is_file_lru(page) && 1304 + if (reclaimed && folio_is_file_lru(folio) && 1305 1305 !mapping_exiting(mapping) && !dax_mapping(mapping)) 1306 - shadow = workingset_eviction(page, target_memcg); 1307 - __delete_from_page_cache(page, shadow); 1306 + shadow = workingset_eviction(folio, target_memcg); 1307 + __filemap_remove_folio(folio, shadow); 1308 1308 xa_unlock_irq(&mapping->i_pages); 1309 1309 if (mapping_shrinkable(mapping)) 1310 1310 inode_add_lru(mapping->host); 1311 1311 spin_unlock(&mapping->host->i_lock); 1312 1312 1313 1313 if (freepage != NULL) 1314 - freepage(page); 1314 + freepage(&folio->page); 1315 1315 } 1316 1316 1317 1317 return 1; 1318 1318 1319 1319 cannot_free: 1320 1320 xa_unlock_irq(&mapping->i_pages); 1321 - if (!PageSwapCache(page)) 1321 + if (!folio_test_swapcache(folio)) 1322 1322 spin_unlock(&mapping->host->i_lock); 1323 1323 return 0; 1324 1324 } 1325 1325 1326 - /* 1327 - * Attempt to detach a locked page from its ->mapping. If it is dirty or if 1328 - * someone else has a ref on the page, abort and return 0. If it was 1329 - * successfully detached, return 1. Assumes the caller has a single ref on 1330 - * this page. 1326 + /** 1327 + * remove_mapping() - Attempt to remove a folio from its mapping. 1328 + * @mapping: The address space. 1329 + * @folio: The folio to remove. 1330 + * 1331 + * If the folio is dirty, under writeback or if someone else has a ref 1332 + * on it, removal will fail. 1333 + * Return: The number of pages removed from the mapping. 0 if the folio 1334 + * could not be removed. 1335 + * Context: The caller should have a single refcount on the folio and 1336 + * hold its lock. 
1331 1337 */ 1332 - int remove_mapping(struct address_space *mapping, struct page *page) 1338 + long remove_mapping(struct address_space *mapping, struct folio *folio) 1333 1339 { 1334 - if (__remove_mapping(mapping, page, false, NULL)) { 1340 + if (__remove_mapping(mapping, folio, false, NULL)) { 1335 1341 /* 1336 - * Unfreezing the refcount with 1 rather than 2 effectively 1342 + * Unfreezing the refcount with 1 effectively 1337 1343 * drops the pagecache ref for us without requiring another 1338 1344 * atomic operation. 1339 1345 */ 1340 - page_ref_unfreeze(page, 1); 1341 - return 1; 1346 + folio_ref_unfreeze(folio, 1); 1347 + return folio_nr_pages(folio); 1342 1348 } 1343 1349 return 0; 1344 1350 } 1345 1351 1346 1352 /** 1347 - * putback_lru_page - put previously isolated page onto appropriate LRU list 1348 - * @page: page to be put back to appropriate lru list 1353 + * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. 1354 + * @folio: Folio to be returned to an LRU list. 1349 1355 * 1350 - * Add previously isolated @page to appropriate LRU list. 1351 - * Page may still be unevictable for other reasons. 1356 + * Add previously isolated @folio to appropriate LRU list. 1357 + * The folio may still be unevictable for other reasons. 1352 1358 * 1353 - * lru_lock must not be held, interrupts must be enabled. 1359 + * Context: lru_lock must not be held, interrupts must be enabled. 
1354 1360 */ 1355 - void putback_lru_page(struct page *page) 1361 + void folio_putback_lru(struct folio *folio) 1356 1362 { 1357 - lru_cache_add(page); 1358 - put_page(page); /* drop ref from isolate */ 1363 + folio_add_lru(folio); 1364 + folio_put(folio); /* drop ref from isolate */ 1359 1365 } 1360 1366 1361 1367 enum page_references { ··· 1371 1365 PAGEREF_ACTIVATE, 1372 1366 }; 1373 1367 1374 - static enum page_references page_check_references(struct page *page, 1368 + static enum page_references folio_check_references(struct folio *folio, 1375 1369 struct scan_control *sc) 1376 1370 { 1377 - int referenced_ptes, referenced_page; 1371 + int referenced_ptes, referenced_folio; 1378 1372 unsigned long vm_flags; 1379 1373 1380 - referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, 1381 - &vm_flags); 1382 - referenced_page = TestClearPageReferenced(page); 1374 + referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, 1375 + &vm_flags); 1376 + referenced_folio = folio_test_clear_referenced(folio); 1383 1377 1384 1378 /* 1385 - * Mlock lost the isolation race with us. Let try_to_unmap() 1386 - * move the page to the unevictable list. 1379 + * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. 1380 + * Let the folio, now marked Mlocked, be moved to the unevictable list. 1387 1381 */ 1388 1382 if (vm_flags & VM_LOCKED) 1389 - return PAGEREF_RECLAIM; 1383 + return PAGEREF_ACTIVATE; 1390 1384 1391 1385 if (referenced_ptes) { 1392 1386 /* 1393 - * All mapped pages start out with page table 1387 + * All mapped folios start out with page table 1394 1388 * references from the instantiating fault, so we need 1395 - * to look twice if a mapped file/anon page is used more 1389 + * to look twice if a mapped file/anon folio is used more 1396 1390 * than once. 1397 1391 * 1398 1392 * Mark it and spare it for another trip around the 1399 1393 * inactive list. Another page table reference will 1400 1394 * lead to its activation. 
1401 1395 * 1402 - * Note: the mark is set for activated pages as well 1403 - * so that recently deactivated but used pages are 1396 + * Note: the mark is set for activated folios as well 1397 + * so that recently deactivated but used folios are 1404 1398 * quickly recovered. 1405 1399 */ 1406 - SetPageReferenced(page); 1400 + folio_set_referenced(folio); 1407 1401 1408 - if (referenced_page || referenced_ptes > 1) 1402 + if (referenced_folio || referenced_ptes > 1) 1409 1403 return PAGEREF_ACTIVATE; 1410 1404 1411 1405 /* 1412 - * Activate file-backed executable pages after first usage. 1406 + * Activate file-backed executable folios after first usage. 1413 1407 */ 1414 - if ((vm_flags & VM_EXEC) && !PageSwapBacked(page)) 1408 + if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio)) 1415 1409 return PAGEREF_ACTIVATE; 1416 1410 1417 1411 return PAGEREF_KEEP; 1418 1412 } 1419 1413 1420 - /* Reclaim if clean, defer dirty pages to writeback */ 1421 - if (referenced_page && !PageSwapBacked(page)) 1414 + /* Reclaim if clean, defer dirty folios to writeback */ 1415 + if (referenced_folio && !folio_test_swapbacked(folio)) 1422 1416 return PAGEREF_RECLAIM_CLEAN; 1423 1417 1424 1418 return PAGEREF_RECLAIM; 1425 1419 } 1426 1420 1427 1421 /* Check if a page is dirty or under writeback */ 1428 - static void page_check_dirty_writeback(struct page *page, 1422 + static void folio_check_dirty_writeback(struct folio *folio, 1429 1423 bool *dirty, bool *writeback) 1430 1424 { 1431 1425 struct address_space *mapping; ··· 1434 1428 * Anonymous pages are not handled by flushers and must be written 1435 1429 * from reclaim context. 
Do not stall reclaim based on them 1436 1430 */ 1437 - if (!page_is_file_lru(page) || 1438 - (PageAnon(page) && !PageSwapBacked(page))) { 1431 + if (!folio_is_file_lru(folio) || 1432 + (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { 1439 1433 *dirty = false; 1440 1434 *writeback = false; 1441 1435 return; 1442 1436 } 1443 1437 1444 - /* By default assume that the page flags are accurate */ 1445 - *dirty = PageDirty(page); 1446 - *writeback = PageWriteback(page); 1438 + /* By default assume that the folio flags are accurate */ 1439 + *dirty = folio_test_dirty(folio); 1440 + *writeback = folio_test_writeback(folio); 1447 1441 1448 1442 /* Verify dirty/writeback state if the filesystem supports it */ 1449 - if (!page_has_private(page)) 1443 + if (!folio_test_private(folio)) 1450 1444 return; 1451 1445 1452 - mapping = page_mapping(page); 1446 + mapping = folio_mapping(folio); 1453 1447 if (mapping && mapping->a_ops->is_dirty_writeback) 1454 - mapping->a_ops->is_dirty_writeback(page, dirty, writeback); 1448 + mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback); 1455 1449 } 1456 1450 1457 1451 static struct page *alloc_demote_page(struct page *page, unsigned long node) ··· 1525 1519 while (!list_empty(page_list)) { 1526 1520 struct address_space *mapping; 1527 1521 struct page *page; 1522 + struct folio *folio; 1528 1523 enum page_references references = PAGEREF_RECLAIM; 1529 1524 bool dirty, writeback, may_enter_fs; 1530 1525 unsigned int nr_pages; 1531 1526 1532 1527 cond_resched(); 1533 1528 1534 - page = lru_to_page(page_list); 1535 - list_del(&page->lru); 1529 + folio = lru_to_folio(page_list); 1530 + list_del(&folio->lru); 1531 + page = &folio->page; 1536 1532 1537 1533 if (!trylock_page(page)) 1538 1534 goto keep; ··· 1560 1552 * reclaim_congested. kswapd will stall and start writing 1561 1553 * pages if the tail of the LRU is all dirty unqueued pages. 
1562 1554 */ 1563 - page_check_dirty_writeback(page, &dirty, &writeback); 1555 + folio_check_dirty_writeback(folio, &dirty, &writeback); 1564 1556 if (dirty || writeback) 1565 - stat->nr_dirty++; 1557 + stat->nr_dirty += nr_pages; 1566 1558 1567 1559 if (dirty && !writeback) 1568 - stat->nr_unqueued_dirty++; 1560 + stat->nr_unqueued_dirty += nr_pages; 1569 1561 1570 1562 /* 1571 1563 * Treat this page as congested if the underlying BDI is or if ··· 1575 1567 */ 1576 1568 mapping = page_mapping(page); 1577 1569 if (writeback && PageReclaim(page)) 1578 - stat->nr_congested++; 1570 + stat->nr_congested += nr_pages; 1579 1571 1580 1572 /* 1581 1573 * If a page at the tail of the LRU is under writeback, there ··· 1624 1616 if (current_is_kswapd() && 1625 1617 PageReclaim(page) && 1626 1618 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1627 - stat->nr_immediate++; 1619 + stat->nr_immediate += nr_pages; 1628 1620 goto activate_locked; 1629 1621 1630 1622 /* Case 2 above */ ··· 1642 1634 * and it's also appropriate in global reclaim. 1643 1635 */ 1644 1636 SetPageReclaim(page); 1645 - stat->nr_writeback++; 1637 + stat->nr_writeback += nr_pages; 1646 1638 goto activate_locked; 1647 1639 1648 1640 /* Case 3 above */ ··· 1656 1648 } 1657 1649 1658 1650 if (!ignore_references) 1659 - references = page_check_references(page, sc); 1651 + references = folio_check_references(folio, sc); 1660 1652 1661 1653 switch (references) { 1662 1654 case PAGEREF_ACTIVATE: ··· 1689 1681 if (!PageSwapCache(page)) { 1690 1682 if (!(sc->gfp_mask & __GFP_IO)) 1691 1683 goto keep_locked; 1692 - if (page_maybe_dma_pinned(page)) 1684 + if (folio_maybe_dma_pinned(folio)) 1693 1685 goto keep_locked; 1694 1686 if (PageTransHuge(page)) { 1695 1687 /* cannot split THP, skip it */ 1696 - if (!can_split_huge_page(page, NULL)) 1688 + if (!can_split_folio(folio, NULL)) 1697 1689 goto activate_locked; 1698 1690 /* 1699 1691 * Split pages without a PMD map right 1700 1692 * away. 
Chances are some or all of the 1701 1693 * tail pages can be freed without IO. 1702 1694 */ 1703 - if (!compound_mapcount(page) && 1704 - split_huge_page_to_list(page, 1705 - page_list)) 1695 + if (!folio_entire_mapcount(folio) && 1696 + split_folio_to_list(folio, 1697 + page_list)) 1706 1698 goto activate_locked; 1707 1699 } 1708 1700 if (!add_to_swap(page)) { 1709 1701 if (!PageTransHuge(page)) 1710 1702 goto activate_locked_split; 1711 1703 /* Fallback to swap normal pages */ 1712 - if (split_huge_page_to_list(page, 1713 - page_list)) 1704 + if (split_folio_to_list(folio, 1705 + page_list)) 1714 1706 goto activate_locked; 1715 1707 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1716 1708 count_vm_event(THP_SWPOUT_FALLBACK); ··· 1724 1716 /* Adding to swap updated mapping */ 1725 1717 mapping = page_mapping(page); 1726 1718 } 1727 - } else if (unlikely(PageTransHuge(page))) { 1728 - /* Split file THP */ 1729 - if (split_huge_page_to_list(page, page_list)) 1719 + } else if (PageSwapBacked(page) && PageTransHuge(page)) { 1720 + /* Split shmem THP */ 1721 + if (split_folio_to_list(folio, page_list)) 1730 1722 goto keep_locked; 1731 1723 } 1732 1724 ··· 1750 1742 enum ttu_flags flags = TTU_BATCH_FLUSH; 1751 1743 bool was_swapbacked = PageSwapBacked(page); 1752 1744 1753 - if (unlikely(PageTransHuge(page))) 1745 + if (PageTransHuge(page) && 1746 + thp_order(page) >= HPAGE_PMD_ORDER) 1754 1747 flags |= TTU_SPLIT_HUGE_PMD; 1755 1748 1756 - try_to_unmap(page, flags); 1749 + try_to_unmap(folio, flags); 1757 1750 if (page_mapped(page)) { 1758 1751 stat->nr_unmap_fail += nr_pages; 1759 1752 if (!was_swapbacked && PageSwapBacked(page)) ··· 1802 1793 * starts and then write it out here. 
1803 1794 */ 1804 1795 try_to_unmap_flush_dirty(); 1805 - switch (pageout(page, mapping)) { 1796 + switch (pageout(folio, mapping)) { 1806 1797 case PAGE_KEEP: 1807 1798 goto keep_locked; 1808 1799 case PAGE_ACTIVATE: 1809 1800 goto activate_locked; 1810 1801 case PAGE_SUCCESS: 1811 - stat->nr_pageout += thp_nr_pages(page); 1802 + stat->nr_pageout += nr_pages; 1812 1803 1813 1804 if (PageWriteback(page)) 1814 1805 goto keep; ··· 1886 1877 */ 1887 1878 count_vm_event(PGLAZYFREED); 1888 1879 count_memcg_page_event(page, PGLAZYFREED); 1889 - } else if (!mapping || !__remove_mapping(mapping, page, true, 1880 + } else if (!mapping || !__remove_mapping(mapping, folio, true, 1890 1881 sc->target_mem_cgroup)) 1891 1882 goto keep_locked; 1892 1883 ··· 2141 2132 } 2142 2133 2143 2134 /** 2144 - * isolate_lru_page - tries to isolate a page from its LRU list 2145 - * @page: page to isolate from its LRU list 2135 + * folio_isolate_lru() - Try to isolate a folio from its LRU list. 2136 + * @folio: Folio to isolate from its LRU list. 2146 2137 * 2147 - * Isolates a @page from an LRU list, clears PageLRU and adjusts the 2148 - * vmstat statistic corresponding to whatever LRU list the page was on. 2138 + * Isolate a @folio from an LRU list and adjust the vmstat statistic 2139 + * corresponding to whatever LRU list the folio was on. 2149 2140 * 2150 - * Returns 0 if the page was removed from an LRU list. 2151 - * Returns -EBUSY if the page was not on an LRU list. 2152 - * 2153 - * The returned page will have PageLRU() cleared. If it was found on 2154 - * the active list, it will have PageActive set. If it was found on 2155 - * the unevictable list, it will have the PageUnevictable bit set. That flag 2141 + * The folio will have its LRU flag cleared. If it was found on the 2142 + * active list, it will have the Active flag set. If it was found on the 2143 + * unevictable list, it will have the Unevictable flag set. 
These flags 2156 2144 * may need to be cleared by the caller before letting the page go. 2157 2145 * 2158 - * The vmstat statistic corresponding to the list on which the page was 2159 - * found will be decremented. 2160 - * 2161 - * Restrictions: 2146 + * Context: 2162 2147 * 2163 2148 * (1) Must be called with an elevated refcount on the page. This is a 2164 - * fundamental difference from isolate_lru_pages (which is called 2149 + * fundamental difference from isolate_lru_pages() (which is called 2165 2150 * without a stable reference). 2166 - * (2) the lru_lock must not be held. 2167 - * (3) interrupts must be enabled. 2151 + * (2) The lru_lock must not be held. 2152 + * (3) Interrupts must be enabled. 2153 + * 2154 + * Return: 0 if the folio was removed from an LRU list. 2155 + * -EBUSY if the folio was not on an LRU list. 2168 2156 */ 2169 - int isolate_lru_page(struct page *page) 2157 + int folio_isolate_lru(struct folio *folio) 2170 2158 { 2171 - struct folio *folio = page_folio(page); 2172 2159 int ret = -EBUSY; 2173 2160 2174 - VM_BUG_ON_PAGE(!page_count(page), page); 2175 - WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); 2161 + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); 2176 2162 2177 - if (TestClearPageLRU(page)) { 2163 + if (folio_test_clear_lru(folio)) { 2178 2164 struct lruvec *lruvec; 2179 2165 2180 - get_page(page); 2166 + folio_get(folio); 2181 2167 lruvec = folio_lruvec_lock_irq(folio); 2182 - del_page_from_lru_list(page, lruvec); 2168 + lruvec_del_folio(lruvec, folio); 2183 2169 unlock_page_lruvec_irq(lruvec); 2184 2170 ret = 0; 2185 2171 } ··· 2410 2406 * 2411 2407 * If the pages are mostly unmapped, the processing is fast and it is 2412 2408 * appropriate to hold lru_lock across the whole operation. But if 2413 - * the pages are mapped, the processing is slow (page_referenced()), so 2409 + * the pages are mapped, the processing is slow (folio_referenced()), so 2414 2410 * we should drop lru_lock around each page. 
It's impossible to balance 2415 2411 * this, so instead we remove the pages from the LRU while processing them. 2416 2412 * It is safe to rely on PG_active against the non-LRU pages in here because ··· 2430 2426 LIST_HEAD(l_hold); /* The pages which were snipped off */ 2431 2427 LIST_HEAD(l_active); 2432 2428 LIST_HEAD(l_inactive); 2433 - struct page *page; 2434 2429 unsigned nr_deactivate, nr_activate; 2435 2430 unsigned nr_rotated = 0; 2436 2431 int file = is_file_lru(lru); ··· 2451 2448 spin_unlock_irq(&lruvec->lru_lock); 2452 2449 2453 2450 while (!list_empty(&l_hold)) { 2451 + struct folio *folio; 2452 + struct page *page; 2453 + 2454 2454 cond_resched(); 2455 - page = lru_to_page(&l_hold); 2456 - list_del(&page->lru); 2455 + folio = lru_to_folio(&l_hold); 2456 + list_del(&folio->lru); 2457 + page = &folio->page; 2457 2458 2458 2459 if (unlikely(!page_evictable(page))) { 2459 2460 putback_lru_page(page); ··· 2472 2465 } 2473 2466 } 2474 2467 2475 - if (page_referenced(page, 0, sc->target_mem_cgroup, 2476 - &vm_flags)) { 2468 + if (folio_referenced(folio, 0, sc->target_mem_cgroup, 2469 + &vm_flags)) { 2477 2470 /* 2478 2471 * Identify referenced, file-backed active pages and 2479 2472 * give them one more trip around the active list. So

+13 -12

mm/workingset.c

··· 245 245 } 246 246 247 247 /** 248 - * workingset_eviction - note the eviction of a page from memory 248 + * workingset_eviction - note the eviction of a folio from memory 249 249 * @target_memcg: the cgroup that is causing the reclaim 250 - * @page: the page being evicted 250 + * @folio: the folio being evicted 251 251 * 252 - * Return: a shadow entry to be stored in @page->mapping->i_pages in place 253 - * of the evicted @page so that a later refault can be detected. 252 + * Return: a shadow entry to be stored in @folio->mapping->i_pages in place 253 + * of the evicted @folio so that a later refault can be detected. 254 254 */ 255 - void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) 255 + void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) 256 256 { 257 - struct pglist_data *pgdat = page_pgdat(page); 257 + struct pglist_data *pgdat = folio_pgdat(folio); 258 258 unsigned long eviction; 259 259 struct lruvec *lruvec; 260 260 int memcgid; 261 261 262 - /* Page is fully exclusive and pins page's memory cgroup pointer */ 263 - VM_BUG_ON_PAGE(PageLRU(page), page); 264 - VM_BUG_ON_PAGE(page_count(page), page); 265 - VM_BUG_ON_PAGE(!PageLocked(page), page); 262 + /* Folio is fully exclusive and pins folio's memory cgroup pointer */ 263 + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 264 + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 265 + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 266 266 267 267 lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 268 268 /* XXX: target_memcg can be NULL, go through lruvec */ 269 269 memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); 270 270 eviction = atomic_long_read(&lruvec->nonresident_age); 271 - workingset_age_nonresident(lruvec, thp_nr_pages(page)); 272 - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); 271 + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); 272 + return pack_shadow(memcgid, pgdat, eviction, 273 + 
folio_test_workingset(folio)); 273 274 } 274 275 275 276 /**

+24 -11

tools/testing/selftests/vm/transhuge-stress.c

··· 26 26 #define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) 27 27 28 28 int pagemap_fd; 29 + int backing_fd = -1; 30 + int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; 31 + #define PROT_RW (PROT_READ | PROT_WRITE) 29 32 30 33 int64_t allocate_transhuge(void *ptr) 31 34 { 32 35 uint64_t ent[2]; 33 36 34 37 /* drop pmd */ 35 - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, 36 - MAP_FIXED | MAP_ANONYMOUS | 37 - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) 38 + if (mmap(ptr, HPAGE_SIZE, PROT_RW, MAP_FIXED | mmap_flags, 39 + backing_fd, 0) != ptr) 38 40 errx(2, "mmap transhuge"); 39 41 40 42 if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) ··· 62 60 size_t ram, len; 63 61 void *ptr, *p; 64 62 struct timespec a, b; 63 + int i = 0; 64 + char *name = NULL; 65 65 double s; 66 66 uint8_t *map; 67 67 size_t map_len; ··· 73 69 ram = SIZE_MAX / 4; 74 70 else 75 71 ram *= sysconf(_SC_PAGESIZE); 72 + len = ram; 76 73 77 - if (argc == 1) 78 - len = ram; 79 - else if (!strcmp(argv[1], "-h")) 80 - errx(1, "usage: %s [size in MiB]", argv[0]); 81 - else 82 - len = atoll(argv[1]) << 20; 74 + while (++i < argc) { 75 + if (!strcmp(argv[i], "-h")) 76 + errx(1, "usage: %s [size in MiB]", argv[0]); 77 + else if (!strcmp(argv[i], "-f")) 78 + name = argv[++i]; 79 + else 80 + len = atoll(argv[i]) << 20; 81 + } 82 + 83 + if (name) { 84 + backing_fd = open(name, O_RDWR); 85 + if (backing_fd == -1) 86 + errx(2, "open %s", name); 87 + mmap_flags = MAP_SHARED; 88 + } 83 89 84 90 warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" 85 91 " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, ··· 100 86 err(2, "open pagemap"); 101 87 102 88 len -= len % HPAGE_SIZE; 103 - ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, 104 - MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); 89 + ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); 105 90 if (ptr == MAP_FAILED) 106 91 err(2, "initial mmap"); 107 92 ptr += HPAGE_SIZE - (uintptr_t)ptr % 
HPAGE_SIZE;

Configure Feed

Configure Feed