Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/bootmem_info.h>
#include <linux/vmstat.h>
#include "internal.h"
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

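/*
 * Either way, a section number resolves to its struct mem_section via a
 * root/offset split; a sketch of the lookup done by __nr_to_section() in
 * include/linux/mmzone.h:
 *
 *	unsigned long root = SECTION_NR_TO_ROOT(nr);	// nr / SECTIONS_PER_ROOT
 *	unsigned long off  = nr & SECTION_ROOT_MASK;	// nr % SECTIONS_PER_ROOT
 *	struct mem_section *ms = &mem_section[root][off];
 *
 * With SPARSEMEM_EXTREME the root arrays are allocated on demand (see
 * sparse_index_init() below), so sparsely populated physical address
 * spaces do not pay for sections that never exist.
 */
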
34#ifdef NODE_NOT_IN_PAGE_FLAGS
35/*
36 * If we did not store the node number in the page then we have to
37 * do a lookup in the section_to_node_table in order to find which
38 * node the page belongs to.
39 */
40#if MAX_NUMNODES <= 256
41static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
42#else
43static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
44#endif
45
46int memdesc_nid(memdesc_flags_t mdf)
47{
48 return section_to_node_table[memdesc_section(mdf)];
49}
50EXPORT_SYMBOL(memdesc_nid);
51
52static void set_section_nid(unsigned long section_nr, int nid)
53{
54 section_to_node_table[section_nr] = nid;
55}
56#else /* !NODE_NOT_IN_PAGE_FLAGS */
57static inline void set_section_nid(unsigned long section_nr, int nid)
58{
59}
60#endif
61
#ifdef CONFIG_SPARSEMEM_EXTREME
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (mem_section[root])
		return 0;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

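/*
 * Rough sizing, to show why SPARSEMEM_EXTREME exists: on a configuration
 * with a 46-bit physical address space, 128 MiB sections (so 2^19 possible
 * sections) and a 16-byte struct mem_section, the static !EXTREME array
 * would occupy 8 MiB up front, present or not. The EXTREME layout keeps
 * only a table of root pointers and allocates each root lazily. (These
 * numbers are illustrative; they depend on the architecture's
 * MAX_PHYSMEM_BITS and SECTION_SIZE_BITS.)
 */
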
/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node. This keeps us from having to use another data structure. The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

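/*
 * A round-trip sketch: for nid 3, sparse_encode_early_nid() stores
 * 3 << SECTION_NID_SHIFT in section_mem_map; sparse_early_nid() shifts it
 * back down and recovers 3. The bits below SECTION_NID_SHIFT stay free
 * for the SECTION_* flag bits during this early window.
 */
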
/* Validate the physical addressing limitations of the model */
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						      unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			       "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			       *start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			       "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			       *start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

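/*
 * For instance, on a hypothetical configuration where the direct map
 * covers 2^46 bytes and pages are 4 KiB, max_sparsemem_pfn is 2^34: a
 * range ending above that pfn is clamped to it, and a range starting
 * above it is collapsed to empty, each with a one-time warning.
 */
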
/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each. But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large, which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section_nr, nid);
		set_section_nid(section_nr, nid);

		ms = __nr_to_section(section_nr);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			__section_mark_present(ms, section_nr);
		}
	}
}

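/*
 * The walk above advances one section per iteration. As a concrete
 * (illustrative) data point: with SECTION_SIZE_BITS == 27 and 4 KiB
 * pages, a section spans 128 MiB and PAGES_PER_SECTION is 32768, so
 * registering 1 GiB of memory marks eight sections present.
 */
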
/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function that is useful to mark all of the system's
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
	unsigned long start, end;
	int i, nid;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc_or_panic(size, align);
	}
#endif

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
		memory_present(nid, start, end);
}

static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}

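/*
 * The usemap is the per-section bitmap of pageblock flags (migratetype
 * and friends); mem_section_usage_size() is the struct header plus that
 * bitmap, rounded up to whole unsigned longs by BITS_TO_LONGS().
 */
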
#ifdef CONFIG_SPARSEMEM_VMEMMAP
unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}
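
/*
 * PMD alignment is what lets the vmemmap be backed by huge pages. An
 * illustrative example: with a 64-byte struct page and PAGES_PER_SECTION
 * of 32768, a section's memmap is exactly 2 MiB - one PMD on x86-64 - so
 * the ALIGN() above adds no padding at all.
 */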

#else
unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memmap_alloc(size, size, addr, nid, false);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, size, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free(sparsemap_buf, size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		sparse_buffer_free(size);
	sparsemap_buf = NULL;
}

void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else {
			/* Free redundant aligned space */
			if ((unsigned long)(ptr - sparsemap_buf) > 0)
				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
			sparsemap_buf = ptr + size;
		}
	}
	return ptr;
}

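/*
 * A worked example of the bump allocator above, with made-up addresses:
 * say sparsemap_buf is at 0x40001000, sparsemap_buf_end is well beyond,
 * and the caller asks for size = 0x200000 (a 2 MiB memmap). roundup()
 * yields 0x40200000; the 0x1ff000-byte alignment gap is handed back to
 * memblock via sparse_buffer_free(), and sparsemap_buf advances to
 * 0x40400000 for the next caller. Allocations thus stay naturally
 * aligned to their size without permanently wasting the gaps.
 */
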
void __weak __meminit vmemmap_populate_print_last(void)
{
}

static void *sparse_usagebuf __meminitdata;
static void *sparse_usagebuf_end __meminitdata;

/*
 * Helper function that is used for generic section initialization, and
 * can also be used by any hooks added above.
 */
void __init sparse_init_early_section(int nid, struct page *map,
				      unsigned long pnum, unsigned long flags)
{
	BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
	sparse_init_one_section(__nr_to_section(pnum), pnum, map,
				sparse_usagebuf, SECTION_IS_EARLY | flags);
	sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
}

static int __init sparse_usage_init(int nid, unsigned long map_count)
{
	unsigned long size;

	size = mem_section_usage_size() * map_count;
	sparse_usagebuf = memblock_alloc_node(size, SMP_CACHE_BYTES, nid);
	if (!sparse_usagebuf) {
		sparse_usagebuf_end = NULL;
		return -ENOMEM;
	}

	sparse_usagebuf_end = sparse_usagebuf + size;
	return 0;
}

static void __init sparse_usage_fini(void)
{
	sparse_usagebuf = sparse_usagebuf_end = NULL;
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
 * and the number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	unsigned long pnum;
	struct page *map;
	struct mem_section *ms;

	if (sparse_usage_init(nid, map_count)) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}

	sparse_buffer_init(map_count * section_map_size(), nid);

	sparse_vmemmap_init_nid_early(nid);

	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		ms = __nr_to_section(pnum);
		if (!preinited_vmemmap_section(ms)) {
			map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
							nid, NULL, NULL);
			if (!map) {
				pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
				       __func__, nid);
				pnum_begin = pnum;
				sparse_usage_fini();
				sparse_buffer_fini();
				goto failed;
			}
			memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
							   PAGE_SIZE));
			sparse_init_early_section(nid, map, pnum, 0);
		}
	}
	sparse_usage_fini();
	sparse_buffer_fini();
	return;
failed:
	/*
	 * We failed to allocate, so mark all the following pnums as not
	 * present, except the ones already initialized earlier.
	 */
	for_each_present_section_nr(pnum_begin, pnum) {
		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		if (!preinited_vmemmap_section(ms))
			ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_end, pnum_begin, map_count = 1;
	int nid_begin;

	/* see include/linux/mmzone.h 'struct mem_section' definition */
	BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
	memblocks_present();

	if (compound_info_has_mask()) {
		VM_WARN_ON_ONCE(!IS_ALIGNED((unsigned long)pfn_to_page(0),
					    MAX_FOLIO_VMEMMAP_ALIGN));
	}

	pnum_begin = first_present_section_nr();
	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}
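
/*
 * A sketch of the grouping done by the loop above, with hypothetical
 * numbers: if present sections 0-7 carry the early nid 0 and sections
 * 8-11 carry nid 1, the walk accumulates map_count = 8 before the nid
 * changes, calls sparse_init_nid(0, 0, 8, 8), then the trailing call
 * covers sections 8-11 for node 1 with map_count = 4. Batching per node
 * lets each node's memmap and usemap come from a few large, node-local
 * memblock allocations. sparse_init() runs once during early boot
 * (typically from the architecture's paging_init()), before the buddy
 * allocator is up.
 */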