Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

[PATCH] x86_64: Tell VM about holes in nodes

Some nodes can have large holes on x86-64.

This fixes problems with the VM allowing too many dirty pages because it
overestimates the amount of RAM available in a node. In extreme cases you
can end up with all RAM filled with dirty pages, which can lead to deadlocks
and other nasty behaviour.

This patch just tells the VM about the known holes from e820. Reserved
memory (like the kernel text or mem_map) is still not taken into account,
but that should be only a few percent error now.

A small detail is that the flat setup now also uses the NUMA
free_area_init_node(), because it offers more flexibility.

(akpm: lotsa thanks to Martin for working this problem out)

Cc: Martin Bligh <mbligh@mbligh.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by

Andi Kleen and committed by
Linus Torvalds
485761bd bebf4688

+55 -5
+34
arch/x86_64/kernel/e820.c
··· 185 185 } 186 186 187 187 /* 188 + * Compute how much memory is missing in a range. 189 + * Unlike the other functions in this file the arguments are in page numbers. 190 + */ 191 + unsigned long __init 192 + e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) 193 + { 194 + unsigned long ram = 0; 195 + unsigned long start = start_pfn << PAGE_SHIFT; 196 + unsigned long end = end_pfn << PAGE_SHIFT; 197 + int i; 198 + for (i = 0; i < e820.nr_map; i++) { 199 + struct e820entry *ei = &e820.map[i]; 200 + unsigned long last, addr; 201 + 202 + if (ei->type != E820_RAM || 203 + ei->addr+ei->size <= start || 204 + ei->addr >= end) 205 + continue; 206 + 207 + addr = round_up(ei->addr, PAGE_SIZE); 208 + if (addr < start) 209 + addr = start; 210 + 211 + last = round_down(ei->addr + ei->size, PAGE_SIZE); 212 + if (last >= end) 213 + last = end; 214 + 215 + if (last > addr) 216 + ram += last - addr; 217 + } 218 + return ((end - start) - ram) >> PAGE_SHIFT; 219 + } 220 + 221 + /* 188 222 * Mark e820 reserved areas as busy for the resource manager. 189 223 */ 190 224 void __init e820_reserve_resources(void)
+12 -4
arch/x86_64/mm/init.c
··· 322 322 void __init paging_init(void) 323 323 { 324 324 { 325 - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; 325 + unsigned long zones_size[MAX_NR_ZONES]; 326 + unsigned long holes[MAX_NR_ZONES]; 326 327 unsigned int max_dma; 328 + 329 + memset(zones_size, 0, sizeof(zones_size)); 330 + memset(holes, 0, sizeof(holes)); 327 331 328 332 max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 329 333 330 - if (end_pfn < max_dma) 334 + if (end_pfn < max_dma) { 331 335 zones_size[ZONE_DMA] = end_pfn; 332 - else { 336 + holes[ZONE_DMA] = e820_hole_size(0, end_pfn); 337 + } else { 333 338 zones_size[ZONE_DMA] = max_dma; 339 + holes[ZONE_DMA] = e820_hole_size(0, max_dma); 334 340 zones_size[ZONE_NORMAL] = end_pfn - max_dma; 341 + holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); 335 342 } 336 - free_area_init(zones_size); 343 + free_area_init_node(0, NODE_DATA(0), zones_size, 344 + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); 337 345 } 338 346 return; 339 347 }
+7 -1
arch/x86_64/mm/numa.c
··· 126 126 { 127 127 unsigned long start_pfn, end_pfn; 128 128 unsigned long zones[MAX_NR_ZONES]; 129 + unsigned long holes[MAX_NR_ZONES]; 129 130 unsigned long dma_end_pfn; 130 131 131 132 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 133 + memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); 132 134 133 135 start_pfn = node_start_pfn(nodeid); 134 136 end_pfn = node_end_pfn(nodeid); ··· 141 139 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 142 140 if (start_pfn < dma_end_pfn) { 143 141 zones[ZONE_DMA] = dma_end_pfn - start_pfn; 142 + holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn); 144 143 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 144 + holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn); 145 + 145 146 } else { 146 147 zones[ZONE_NORMAL] = end_pfn - start_pfn; 148 + holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn); 147 149 } 148 150 149 151 free_area_init_node(nodeid, NODE_DATA(nodeid), zones, 150 - start_pfn, NULL); 152 + start_pfn, holes); 151 153 } 152 154 153 155 void __init numa_init_array(void)
+2
include/asm-x86_64/e820.h
··· 51 51 52 52 extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end); 53 53 extern void e820_setup_gap(void); 54 + extern unsigned long e820_hole_size(unsigned long start_pfn, 55 + unsigned long end_pfn); 54 56 55 57 extern void __init parse_memopt(char *p, char **end); 56 58