mm/hugetlb: use separate nodemask for bootmem allocations

Hugetlb boot allocation has used online nodes for allocation since commit
de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation").
This was needed to be able to do the allocations earlier in boot, before
N_MEMORY was set.

This might lead to a different distribution of gigantic hugepages across
NUMA nodes if there are memoryless nodes in the system.

What happens is that the memoryless nodes are tried, but then the memblock
allocation fails and falls back, which usually means that the node that
has the highest physical address available will be used (top-down
allocation). While this will end up getting the same number of hugetlb
pages, they might not be distributed the same way. The fallback for
each memoryless node might not end up coming from the same node as the
successful round-robin allocation from N_MEMORY nodes.

While administrators that rely on having a specific number of hugepages
per node should use the hugepages=N:X syntax, it's better not to change
the old behavior for the plain hugepages=N case.

To do this, construct a nodemask for hugetlb bootmem purposes only,
containing nodes that have memory. Then use that for round-robin bootmem
allocations.

This saves some cycles, and the added advantage here is that hugetlb_cma
can use it too, avoiding the older issue of pointless attempts to create a
CMA area for memoryless nodes (which will also cause the per-node CMA area
size to be too small).

Link: https://lkml.kernel.org/r/20250402205613.3086864-1-fvdl@google.com
Fixes: de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation")
Signed-off-by: Frank van der Linden <fvdl@google.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Frank van der Linden and committed by

Andrew Morton 1 year ago 8d88b076 1b6a58e2

+38 -6

3 changed files

expand all

include

linux

hugetlb.h

hugetlb.c

hugetlb_cma.c

include/linux/hugetlb.h

··· 14 14 #include <linux/pgtable.h> 15 15 #include <linux/gfp.h> 16 16 #include <linux/userfaultfd_k.h> 17 + #include <linux/nodemask.h> 17 18 18 19 struct ctl_table; 19 20 struct user_struct; ··· 177 176 178 177 void hugetlb_bootmem_alloc(void); 179 178 bool hugetlb_bootmem_allocated(void); 179 + extern nodemask_t hugetlb_bootmem_nodes; 180 + void hugetlb_bootmem_set_nodes(void); 180 181 181 182 /* arch callbacks */ 182 183

+28 -2

mm/hugetlb.c

··· 58 58 unsigned int default_hstate_idx; 59 59 struct hstate hstates[HUGE_MAX_HSTATE]; 60 60 61 + __initdata nodemask_t hugetlb_bootmem_nodes; 61 62 __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; 62 63 static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; 63 64 ··· 3220 3219 } 3221 3220 3222 3221 /* allocate from next node when distributing huge pages */ 3223 - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { 3222 + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, 3223 + &hugetlb_bootmem_nodes) { 3224 3224 m = alloc_bootmem(h, node, false); 3225 3225 if (!m) 3226 3226 return 0; ··· 3685 3683 struct hstate *h, *h2; 3686 3684 3687 3685 for_each_hstate(h) { 3686 + /* 3687 + * Always reset to first_memory_node here, even if 3688 + * next_nid_to_alloc was set before - we can't 3689 + * reference hugetlb_bootmem_nodes after init, and 3690 + * first_memory_node is right for all further allocations. 3691 + */ 3692 + h->next_nid_to_alloc = first_memory_node; 3693 + h->next_nid_to_free = first_memory_node; 3694 + 3688 3695 /* oversize hugepages were init'ed in early boot */ 3689 3696 if (!hstate_is_gigantic(h)) 3690 3697 hugetlb_hstate_alloc_pages(h); ··· 5006 4995 } 5007 4996 hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); 5008 4997 4998 + void __init hugetlb_bootmem_set_nodes(void) 4999 + { 5000 + int i, nid; 5001 + unsigned long start_pfn, end_pfn; 5002 + 5003 + if (!nodes_empty(hugetlb_bootmem_nodes)) 5004 + return; 5005 + 5006 + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5007 + if (end_pfn > start_pfn) 5008 + node_set(nid, hugetlb_bootmem_nodes); 5009 + } 5010 + } 5011 + 5009 5012 static bool __hugetlb_bootmem_allocated __initdata; 5010 5013 5011 5014 bool __init hugetlb_bootmem_allocated(void) ··· 5035 5010 if (__hugetlb_bootmem_allocated) 5036 5011 return; 5037 5012 5013 + hugetlb_bootmem_set_nodes(); 5014 + 5038 5015 for (i = 
0; i < MAX_NUMNODES; i++) 5039 5016 INIT_LIST_HEAD(&huge_boot_pages[i]); 5040 5017 ··· 5044 5017 5045 5018 for_each_hstate(h) { 5046 5019 h->next_nid_to_alloc = first_online_node; 5047 - h->next_nid_to_free = first_online_node; 5048 5020 5049 5021 if (hstate_is_gigantic(h)) 5050 5022 hugetlb_hstate_alloc_pages(h);

+7 -4

mm/hugetlb_cma.c

··· 66 66 if (node_exact) 67 67 return NULL; 68 68 69 - for_each_online_node(node) { 69 + for_each_node_mask(node, hugetlb_bootmem_nodes) { 70 70 cma = hugetlb_cma[node]; 71 71 if (!cma || node == *nid) 72 72 continue; ··· 153 153 if (!hugetlb_cma_size) 154 154 return; 155 155 156 + hugetlb_bootmem_set_nodes(); 157 + 156 158 for (nid = 0; nid < MAX_NUMNODES; nid++) { 157 159 if (hugetlb_cma_size_in_node[nid] == 0) 158 160 continue; 159 161 160 - if (!node_online(nid)) { 162 + if (!node_isset(nid, hugetlb_bootmem_nodes)) { 161 163 pr_warn("hugetlb_cma: invalid node %d specified\n", nid); 162 164 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; 163 165 hugetlb_cma_size_in_node[nid] = 0; ··· 192 190 * If 3 GB area is requested on a machine with 4 numa nodes, 193 191 * let's allocate 1 GB on first three nodes and ignore the last one. 194 192 */ 195 - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); 193 + per_node = DIV_ROUND_UP(hugetlb_cma_size, 194 + nodes_weight(hugetlb_bootmem_nodes)); 196 195 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", 197 196 hugetlb_cma_size / SZ_1M, per_node / SZ_1M); 198 197 } 199 198 200 199 reserved = 0; 201 - for_each_online_node(nid) { 200 + for_each_node_mask(nid, hugetlb_bootmem_nodes) { 202 201 int res; 203 202 char name[CMA_MAX_NAME]; 204 203

Configure Feed

Configure Feed