mm/hugetlb: remove fake head pages

HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most
vmemmap pages for huge pages and remapping the freed range to a single
page containing the struct page metadata.

With the new mask-based compound_info encoding (for power-of-2 struct page
sizes), all tail pages of the same order are now identical regardless of
which compound page they belong to. This means the tail pages can be
truly shared without fake heads.

Allocate a single page of initialized tail struct pages per zone per order
and record it in the vmemmap_tails[] array in struct zone. All huge pages of
that order in the zone share this tail page, mapped read-only into their
vmemmap. The head page remains unique per huge page.

Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a
compile-time constant, as it is used to specify the size of the
vmemmap_tails[] array. For some reason, the compiler is not able to
evaluate get_order() at compile time, but ilog2() works.

Avoid using PUD_ORDER to define MAX_FOLIO_ORDER, as it adds a dependency on
<linux/pgtable.h>, which creates a hard-to-break include loop.

This eliminates fake heads while maintaining the same memory savings, and
simplifies compound_head() by removing fake head detection.

Link: https://lkml.kernel.org/r/20260227194302.274384-13-kas@kernel.org
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kiryl Shutsemau and committed by

Andrew Morton 3 months ago 622026e8 76351f2f

+146 -15

5 changed files

expand all

include

linux

mm.h

mmzone.h

hugetlb_vmemmap.c

internal.h

sparse-vmemmap.c

+2 -1

include/linux/mm.h

··· 4479 4479 int node, struct vmem_altmap *altmap); 4480 4480 int vmemmap_populate(unsigned long start, unsigned long end, int node, 4481 4481 struct vmem_altmap *altmap); 4482 - int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, 4482 + int vmemmap_populate_hvo(unsigned long start, unsigned long end, 4483 + unsigned int order, struct zone *zone, 4483 4484 unsigned long headsize); 4484 4485 void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, 4485 4486 unsigned long headsize);

+17 -2

include/linux/mmzone.h

··· 81 81 * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect 82 82 * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. 83 83 */ 84 - #define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) 84 + #ifdef CONFIG_64BIT 85 + #define MAX_FOLIO_ORDER (ilog2(SZ_16G) - PAGE_SHIFT) 86 + #else 87 + #define MAX_FOLIO_ORDER (ilog2(SZ_1G) - PAGE_SHIFT) 88 + #endif 85 89 #else 86 90 /* 87 91 * Without hugetlb, gigantic folios that are bigger than a single PUD are 88 92 * currently impossible. 89 93 */ 90 - #define MAX_FOLIO_ORDER PUD_ORDER 94 + #define MAX_FOLIO_ORDER (PUD_SHIFT - PAGE_SHIFT) 91 95 #endif 92 96 93 97 #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) ··· 106 102 (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \ 107 103 is_power_of_2(sizeof(struct page)) ? \ 108 104 MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0) 105 + 106 + /* 107 + * vmemmap optimization (like HVO) is only possible for page orders that fill 108 + * two or more pages with struct pages. 109 + */ 110 + #define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page))) 111 + #define __NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1) 112 + #define NR_VMEMMAP_TAILS (__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0) 109 113 110 114 enum migratetype { 111 115 MIGRATE_UNMOVABLE, ··· 1125 1113 /* Zone statistics */ 1126 1114 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 1127 1115 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; 1116 + #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP 1117 + struct page *vmemmap_tails[NR_VMEMMAP_TAILS]; 1118 + #endif 1128 1119 } ____cacheline_internodealigned_in_smp; 1129 1120 1130 1121 enum pgdat_flags {

+70 -3

mm/hugetlb_vmemmap.c

··· 19 19 20 20 #include <asm/tlbflush.h> 21 21 #include "hugetlb_vmemmap.h" 22 + #include "internal.h" 22 23 23 24 /** 24 25 * struct vmemmap_remap_walk - walk vmemmap page table ··· 506 505 return true; 507 506 } 508 507 508 + static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) 509 + { 510 + const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER; 511 + struct page *tail, *p; 512 + int node = zone_to_nid(zone); 513 + 514 + tail = READ_ONCE(zone->vmemmap_tails[idx]); 515 + if (likely(tail)) 516 + return tail; 517 + 518 + tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 519 + if (!tail) 520 + return NULL; 521 + 522 + p = page_to_virt(tail); 523 + for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) 524 + init_compound_tail(p + i, NULL, order, zone); 525 + 526 + if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) { 527 + __free_page(tail); 528 + tail = READ_ONCE(zone->vmemmap_tails[idx]); 529 + } 530 + 531 + return tail; 532 + } 533 + 509 534 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, 510 535 struct folio *folio, 511 536 struct list_head *vmemmap_pages, ··· 546 519 547 520 if (!vmemmap_should_optimize_folio(h, folio)) 548 521 return ret; 522 + 523 + nid = folio_nid(folio); 524 + vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio)); 525 + if (!vmemmap_tail) 526 + return -ENOMEM; 549 527 550 528 static_branch_inc(&hugetlb_optimize_vmemmap_key); 551 529 ··· 569 537 */ 570 538 folio_set_hugetlb_vmemmap_optimized(folio); 571 539 572 - nid = folio_nid(folio); 573 540 vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0); 574 541 if (!vmemmap_head) { 575 542 ret = -ENOMEM; ··· 579 548 list_add(&vmemmap_head->lru, vmemmap_pages); 580 549 memmap_pages_add(1); 581 550 582 - vmemmap_tail = vmemmap_head; 583 551 vmemmap_start = (unsigned long)&folio->page; 584 552 vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); 585 553 ··· 806 776 } 807 777 } 808 778 779 + static struct zone *pfn_to_zone(unsigned nid, 
unsigned long pfn) 780 + { 781 + struct zone *zone; 782 + enum zone_type zone_type; 783 + 784 + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 785 + zone = &NODE_DATA(nid)->node_zones[zone_type]; 786 + if (zone_spans_pfn(zone, pfn)) 787 + return zone; 788 + } 789 + 790 + return NULL; 791 + } 792 + 809 793 void __init hugetlb_vmemmap_init_late(int nid) 810 794 { 811 795 struct huge_bootmem_page *m, *tm; 812 796 unsigned long phys, nr_pages, start, end; 813 797 unsigned long pfn, nr_mmap; 798 + struct zone *zone = NULL; 814 799 struct hstate *h; 815 800 void *map; 816 801 ··· 859 814 continue; 860 815 } 861 816 862 - if (vmemmap_populate_hvo(start, end, nid, 817 + if (!zone || !zone_spans_pfn(zone, pfn)) 818 + zone = pfn_to_zone(nid, pfn); 819 + if (WARN_ON_ONCE(!zone)) 820 + continue; 821 + 822 + if (vmemmap_populate_hvo(start, end, huge_page_order(h), zone, 863 823 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) { 864 824 /* Fallback if HVO population fails */ 865 825 vmemmap_populate(start, end, nid, NULL); ··· 892 842 static int __init hugetlb_vmemmap_init(void) 893 843 { 894 844 const struct hstate *h; 845 + struct zone *zone; 895 846 896 847 /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ 897 848 BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); 849 + 850 + for_each_zone(zone) { 851 + for (int i = 0; i < NR_VMEMMAP_TAILS; i++) { 852 + struct page *tail, *p; 853 + unsigned int order; 854 + 855 + tail = zone->vmemmap_tails[i]; 856 + if (!tail) 857 + continue; 858 + 859 + order = i + VMEMMAP_TAIL_MIN_ORDER; 860 + p = page_to_virt(tail); 861 + for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++) 862 + init_compound_tail(p + j, NULL, order, zone); 863 + } 864 + } 898 865 899 866 for_each_hstate(h) { 900 867 if (hugetlb_vmemmap_optimizable(h)) {

mm/internal.h

··· 905 905 set_page_private(tail, 0); 906 906 } 907 907 908 + static inline void init_compound_tail(struct page *tail, 909 + const struct page *head, unsigned int order, struct zone *zone) 910 + { 911 + atomic_set(&tail->_mapcount, -1); 912 + set_page_node(tail, zone_to_nid(zone)); 913 + set_page_zone(tail, zone_idx(zone)); 914 + prep_compound_tail(tail, head, order); 915 + } 916 + 908 917 void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); 909 918 extern bool free_pages_prepare(struct page *page, unsigned int order); 910 919

+48 -9

mm/sparse-vmemmap.c

··· 325 325 } 326 326 } 327 327 328 - /* 329 - * Populate vmemmap pages HVO-style. The first page contains the head 330 - * page and needed tail pages, the other ones are mirrors of the first 331 - * page. 332 - */ 333 - int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, 334 - int node, unsigned long headsize) 328 + #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP 329 + static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone) 335 330 { 336 - pte_t *pte; 331 + struct page *p, *tail; 332 + unsigned int idx; 333 + int node = zone_to_nid(zone); 334 + 335 + if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER)) 336 + return NULL; 337 + if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER)) 338 + return NULL; 339 + 340 + idx = order - VMEMMAP_TAIL_MIN_ORDER; 341 + tail = zone->vmemmap_tails[idx]; 342 + if (tail) 343 + return tail; 344 + 345 + /* 346 + * Only allocate the page, but do not initialize it. 347 + * 348 + * Any initialization done here will be overwritten by memmap_init(). 349 + * 350 + * hugetlb_vmemmap_init() will take care of initialization after 351 + * memmap_init(). 352 + */ 353 + 354 + p = vmemmap_alloc_block_zero(PAGE_SIZE, node); 355 + if (!p) 356 + return NULL; 357 + 358 + tail = virt_to_page(p); 359 + zone->vmemmap_tails[idx] = tail; 360 + 361 + return tail; 362 + } 363 + 364 + int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end, 365 + unsigned int order, struct zone *zone, 366 + unsigned long headsize) 367 + { 337 368 unsigned long maddr; 369 + struct page *tail; 370 + pte_t *pte; 371 + int node = zone_to_nid(zone); 372 + 373 + tail = vmemmap_get_tail(order, zone); 374 + if (!tail) 375 + return -ENOMEM; 338 376 339 377 for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) { 340 378 pte = vmemmap_populate_address(maddr, node, NULL, -1, 0); ··· 384 346 * Reuse the last page struct page mapped above for the rest. 
385 347 */ 386 348 return vmemmap_populate_range(maddr, end, node, NULL, 387 - pte_pfn(ptep_get(pte)), 0); 349 + page_to_pfn(tail), 0); 388 350 } 351 + #endif 389 352 390 353 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, 391 354 unsigned long addr, unsigned long next)

Configure Feed

Configure Feed