Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"10 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
x86/mm: split vmalloc_sync_all()
mm, slub: prevent kmalloc_node crashes and memory leaks
mm/mmu_notifier: silence PROVE_RCU_LIST warnings
epoll: fix possible lost wakeup on epoll_ctl() path
mm: do not allow MADV_PAGEOUT for CoW pages
mm, memcg: throttle allocators based on ancestral memory.high
mm, memcg: fix corruption on 64-bit divisor in memory.high throttling
page-flags: fix a crash at SetPageError(THP_SWAP)
mm/hotplug: fix hot remove failure in SPARSEMEM|!VMEMMAP case
memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event

13 files changed, 164 insertions(+), 78 deletions(-)
+24 -2
arch/x86/mm/fault.c
···
     return pmd_k;
 }
 
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
 {
     unsigned long address;
 
···
         }
         spin_unlock(&pgd_lock);
     }
+}
+
+void vmalloc_sync_mappings(void)
+{
+    vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+    vmalloc_sync();
 }
 
 /*
···
 
 #else /* CONFIG_X86_64: */
 
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
 {
+    /*
+     * 64-bit mappings might allocate new p4d/pud pages
+     * that need to be propagated to all tasks' PGDs.
+     */
     sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+}
+
+void vmalloc_sync_unmappings(void)
+{
+    /*
+     * Unmappings never allocate or free p4d/pud pages.
+     * No work is required here.
+     */
 }
 
 /*
+1 -1
drivers/acpi/apei/ghes.c
···
      * New allocation must be visible in all pgd before it can be found by
      * an NMI allocating from the pool.
      */
-    vmalloc_sync_all();
+    vmalloc_sync_mappings();
 
     rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
     if (rc)
+4 -4
fs/eventpoll.c
···
         waiter = true;
         init_waitqueue_entry(&wait, current);
 
-        spin_lock_irq(&ep->wq.lock);
+        write_lock_irq(&ep->lock);
         __add_wait_queue_exclusive(&ep->wq, &wait);
-        spin_unlock_irq(&ep->wq.lock);
+        write_unlock_irq(&ep->lock);
     }
 
     for (;;) {
···
         goto fetch_events;
 
     if (waiter) {
-        spin_lock_irq(&ep->wq.lock);
+        write_lock_irq(&ep->lock);
         __remove_wait_queue(&ep->wq, &wait);
-        spin_unlock_irq(&ep->wq.lock);
+        write_unlock_irq(&ep->lock);
     }
 
     return res;
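The path being relocked above is the one a blocked epoll_wait() caller sits on (ep->wq) while another thread calls epoll_ctl(). A minimal userspace sketch of that interaction follows; it only illustrates the affected usage pattern, not the race itself, and names such as wake_pipe and adder are invented for the example.

/* Sketch: thread A blocks in epoll_wait() on an empty interest list while
 * thread B makes an already-readable fd known via epoll_ctl(); A must see
 * exactly one event. Build with: cc -pthread. */
#include <pthread.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

static int epfd;
static int wake_pipe[2];            /* illustrative name */

static void *adder(void *arg)
{
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = wake_pipe[0] };

    (void)arg;
    usleep(100 * 1000);                         /* let main block in epoll_wait() first */
    (void)write(wake_pipe[1], "x", 1);          /* make the pipe readable ... */
    epoll_ctl(epfd, EPOLL_CTL_ADD, wake_pipe[0], &ev); /* ... then register it */
    return NULL;
}

int main(void)
{
    struct epoll_event out;
    pthread_t t;

    epfd = epoll_create1(0);
    if (epfd < 0 || pipe(wake_pipe) < 0) {
        perror("setup");
        return 1;
    }
    pthread_create(&t, NULL, adder, NULL);

    /* A lost wakeup on the epoll_ctl() path would leave this call blocked
     * even though a ready fd is on the interest list. */
    int n = epoll_wait(epfd, &out, 1, -1);
    printf("epoll_wait returned %d\n", n);

    pthread_join(t, NULL);
    return 0;
}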
+1 -1
include/linux/page-flags.h
···
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
-PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
 PAGEFLAG(Referenced, referenced, PF_HEAD)
     TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
     __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+3 -2
include/linux/vmalloc.h
···
 
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                unsigned long pgoff);
-void vmalloc_sync_all(void);
-
+void vmalloc_sync_mappings(void);
+void vmalloc_sync_unmappings(void);
+
 /*
  *    Lowlevel-APIs (not for driver use!)
  */
+1 -1
kernel/notifier.c
···
 
 int register_die_notifier(struct notifier_block *nb)
 {
-    vmalloc_sync_all();
+    vmalloc_sync_mappings();
     return atomic_notifier_chain_register(&die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_die_notifier);
+9 -3
mm/madvise.c
···
     }
 
     page = pmd_page(orig_pmd);
+
+    /* Do not interfere with other mappings of this page */
+    if (page_mapcount(page) != 1)
+        goto huge_unlock;
+
     if (next - addr != HPAGE_PMD_SIZE) {
         int err;
-
-        if (page_mapcount(page) != 1)
-            goto huge_unlock;
 
         get_page(page);
         spin_unlock(ptl);
···
             addr -= PAGE_SIZE;
             continue;
         }
+
+        /* Do not interfere with other mappings of this page */
+        if (page_mapcount(page) != 1)
+            continue;
 
         VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
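The madvise.c hunks above enforce that MADV_PAGEOUT leaves a page alone while it is still mapped elsewhere (page_mapcount() != 1), which is exactly the copy-on-write sharing fork() creates. A hedged userspace sketch of that situation follows; the fallback MADV_PAGEOUT define is an assumption for older headers, and the program only illustrates the hint, it does not verify reclaim behaviour.

/* Sketch: MADV_PAGEOUT issued on an anonymous page still shared CoW with a
 * child after fork(). With the fix above the kernel skips such pages
 * instead of reclaiming them out from under the child. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21     /* assumed fallback for pre-5.4 userspace headers */
#endif

int main(void)
{
    size_t len = 4096;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(buf, 0xaa, len);             /* fault the page in */

    pid_t pid = fork();
    if (pid == 0) {                     /* child: shares the page CoW */
        sleep(1);
        _exit(buf[0] == (char)0xaa ? 0 : 1);
    }

    /* Parent: hint that the page may be paged out. Because the child still
     * maps it (mapcount != 1), a fixed kernel ignores the hint here. */
    if (madvise(buf, len, MADV_PAGEOUT))
        perror("madvise(MADV_PAGEOUT)");

    waitpid(pid, NULL, 0);
    return 0;
}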
+66 -37
mm/memcontrol.c
···
 #define MEMCG_DELAY_SCALING_SHIFT 14
 
 /*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
  */
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+                                          unsigned int nr_pages)
 {
-    unsigned long usage, high, clamped_high;
-    unsigned long pflags;
-    unsigned long penalty_jiffies, overage;
-    unsigned int nr_pages = current->memcg_nr_pages_over_high;
-    struct mem_cgroup *memcg;
+    unsigned long penalty_jiffies;
+    u64 max_overage = 0;
 
-    if (likely(!nr_pages))
-        return;
+    do {
+        unsigned long usage, high;
+        u64 overage;
 
-    memcg = get_mem_cgroup_from_mm(current->mm);
-    reclaim_high(memcg, nr_pages, GFP_KERNEL);
-    current->memcg_nr_pages_over_high = 0;
+        usage = page_counter_read(&memcg->memory);
+        high = READ_ONCE(memcg->high);
+
+        /*
+         * Prevent division by 0 in overage calculation by acting as if
+         * it was a threshold of 1 page
+         */
+        high = max(high, 1UL);
+
+        overage = usage - high;
+        overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+        overage = div64_u64(overage, high);
+
+        if (overage > max_overage)
+            max_overage = overage;
+    } while ((memcg = parent_mem_cgroup(memcg)) &&
+             !mem_cgroup_is_root(memcg));
+
+    if (!max_overage)
+        return 0;
 
     /*
-     * memory.high is breached and reclaim is unable to keep up. Throttle
-     * allocators proactively to slow down excessive growth.
-     *
      * We use overage compared to memory.high to calculate the number of
      * jiffies to sleep (penalty_jiffies). Ideally this value should be
      * fairly lenient on small overages, and increasingly harsh when the
···
      * its crazy behaviour, so we exponentially increase the delay based on
      * overage amount.
      */
-
-    usage = page_counter_read(&memcg->memory);
-    high = READ_ONCE(memcg->high);
-
-    if (usage <= high)
-        goto out;
-
-    /*
-     * Prevent division by 0 in overage calculation by acting as if it was a
-     * threshold of 1 page
-     */
-    clamped_high = max(high, 1UL);
-
-    overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
-                      clamped_high);
-
-    penalty_jiffies = ((u64)overage * overage * HZ)
-        >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+    penalty_jiffies = max_overage * max_overage * HZ;
+    penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+    penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
 
     /*
      * Factor in the task's own contribution to the overage, such that four
···
      * application moving forwards and also permit diagnostics, albeit
      * extremely slowly.
      */
-    penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+    return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+    unsigned long penalty_jiffies;
+    unsigned long pflags;
+    unsigned int nr_pages = current->memcg_nr_pages_over_high;
+    struct mem_cgroup *memcg;
+
+    if (likely(!nr_pages))
+        return;
+
+    memcg = get_mem_cgroup_from_mm(current->mm);
+    reclaim_high(memcg, nr_pages, GFP_KERNEL);
+    current->memcg_nr_pages_over_high = 0;
+
+    /*
+     * memory.high is breached and reclaim is unable to keep up. Throttle
+     * allocators proactively to slow down excessive growth.
+     */
+    penalty_jiffies = calculate_high_delay(memcg, nr_pages);
 
     /*
      * Don't sleep if the amount of jiffies this memcg owes us is so low
···
     struct mem_cgroup_thresholds *thresholds;
     struct mem_cgroup_threshold_ary *new;
     unsigned long usage;
-    int i, j, size;
+    int i, j, size, entries;
 
     mutex_lock(&memcg->thresholds_lock);
 
···
     __mem_cgroup_threshold(memcg, type == _MEMSWAP);
 
     /* Calculate new number of threshold */
-    size = 0;
+    size = entries = 0;
     for (i = 0; i < thresholds->primary->size; i++) {
         if (thresholds->primary->entries[i].eventfd != eventfd)
             size++;
+        else
+            entries++;
     }
 
     new = thresholds->spare;
+
+    /* If no items related to eventfd have been cleared, nothing to do */
+    if (!entries)
+        goto unlock;
 
     /* Set thresholds array to NULL if we don't have thresholds */
     if (!size) {
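To make the new throttling arithmetic above concrete, here is a small standalone sketch of the shift-based penalty calculation from calculate_high_delay(). It assumes MEMCG_DELAY_PRECISION_SHIFT is 20 and MEMCG_MAX_HIGH_DELAY_JIFFIES is 2*HZ (only MEMCG_DELAY_SCALING_SHIFT appears in the hunk), picks HZ=250 arbitrarily, and omits the per-task nr_pages scaling the real function applies afterwards.

/* Standalone sketch of the memory.high penalty arithmetic shown above.
 * Constant values below are assumptions mirroring mm/memcontrol.c. */
#include <stdio.h>
#include <stdint.h>

#define HZ                              250UL   /* assumed CONFIG_HZ */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES    (2UL * HZ)
#define MEMCG_DELAY_PRECISION_SHIFT     20
#define MEMCG_DELAY_SCALING_SHIFT       14

/* Fixed-point overage of one cgroup: (usage - high) / high, scaled by 2^20. */
static uint64_t overage_of(unsigned long usage, unsigned long high)
{
    if (high < 1)
        high = 1;                       /* act as if the threshold were 1 page */
    if (usage <= high)
        return 0;
    return ((uint64_t)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;
}

/* Square the worst ancestral overage, scale to jiffies, clamp to 2*HZ. */
static unsigned long penalty_jiffies(uint64_t max_overage)
{
    uint64_t penalty = max_overage * max_overage * HZ;

    penalty >>= MEMCG_DELAY_PRECISION_SHIFT;
    penalty >>= MEMCG_DELAY_SCALING_SHIFT;
    if (penalty > MEMCG_MAX_HIGH_DELAY_JIFFIES)
        penalty = MEMCG_MAX_HIGH_DELAY_JIFFIES;
    return (unsigned long)penalty;
}

int main(void)
{
    /* A leaf cgroup 1% over its high, nested in a parent 10% over its own
     * high: the parent's (larger) overage decides the throttle. */
    uint64_t leaf   = overage_of(10100, 10000);
    uint64_t parent = overage_of(11000, 10000);
    uint64_t worst  = parent > leaf ? parent : leaf;

    printf("leaf=%llu parent=%llu -> sleep %lu jiffies\n",
           (unsigned long long)leaf, (unsigned long long)parent,
           penalty_jiffies(worst));
    return 0;
}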
+18 -9
mm/mmu_notifier.c
···
      * ->release returns.
      */
     id = srcu_read_lock(&srcu);
-    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
+    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu))
         /*
          * If ->release runs before mmu_notifier_unregister it must be
          * handled, as it's the only way for the driver to flush all
···
 
     id = srcu_read_lock(&srcu);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         if (subscription->ops->clear_flush_young)
             young |= subscription->ops->clear_flush_young(
                 subscription, mm, start, end);
···
 
     id = srcu_read_lock(&srcu);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         if (subscription->ops->clear_young)
             young |= subscription->ops->clear_young(subscription,
                                                     mm, start, end);
···
 
     id = srcu_read_lock(&srcu);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         if (subscription->ops->test_young) {
             young = subscription->ops->test_young(subscription, mm,
                                                   address);
···
 
     id = srcu_read_lock(&srcu);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         if (subscription->ops->change_pte)
             subscription->ops->change_pte(subscription, mm, address,
                                           pte);
···
     int id;
 
     id = srcu_read_lock(&srcu);
-    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         const struct mmu_notifier_ops *ops = subscription->ops;
 
         if (ops->invalidate_range_start) {
···
     int id;
 
     id = srcu_read_lock(&srcu);
-    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+    hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         /*
          * Call invalidate_range here too to avoid the need for the
          * subsystem of having to register an invalidate_range_end
···
 
     id = srcu_read_lock(&srcu);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             srcu_read_lock_held(&srcu)) {
         if (subscription->ops->invalidate_range)
             subscription->ops->invalidate_range(subscription, mm,
                                                 start, end);
···
 
     spin_lock(&mm->notifier_subscriptions->lock);
     hlist_for_each_entry_rcu(subscription,
-                             &mm->notifier_subscriptions->list, hlist) {
+                             &mm->notifier_subscriptions->list, hlist,
+                             lockdep_is_held(&mm->notifier_subscriptions->lock)) {
         if (subscription->ops != ops)
             continue;
 
+7 -3
mm/nommu.c
···
 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 
 /*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
+ * chose not to have one.
  */
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
 {
 }
 
+17 -9
mm/slub.c
···
 
     if (node == NUMA_NO_NODE)
         searchnode = numa_mem_id();
-    else if (!node_present_pages(node))
-        searchnode = node_to_mem_node(node);
 
     object = get_partial_node(s, get_node(s, searchnode), c, flags);
     if (object || node != NUMA_NO_NODE)
···
     struct page *page;
 
     page = c->page;
-    if (!page)
+    if (!page) {
+        /*
+         * if the node is not online or has no normal memory, just
+         * ignore the node constraint
+         */
+        if (unlikely(node != NUMA_NO_NODE &&
+                     !node_state(node, N_NORMAL_MEMORY)))
+            node = NUMA_NO_NODE;
         goto new_slab;
+    }
 redo:
 
     if (unlikely(!node_match(page, node))) {
-        int searchnode = node;
-
-        if (node != NUMA_NO_NODE && !node_present_pages(node))
-            searchnode = node_to_mem_node(node);
-
-        if (unlikely(!node_match(page, searchnode))) {
+        /*
+         * same as above but node_match() being false already
+         * implies node != NUMA_NO_NODE
+         */
+        if (!node_state(node, N_NORMAL_MEMORY)) {
+            node = NUMA_NO_NODE;
+            goto redo;
+        } else {
             stat(s, ALLOC_NODE_MISMATCH);
             deactivate_slab(s, page, c->freelist, c);
             goto new_slab;
+6 -2
mm/sparse.c
···
     struct mem_section *ms = __pfn_to_section(pfn);
     bool section_is_early = early_section(ms);
     struct page *memmap = NULL;
+    bool empty;
     unsigned long *subsection_map = ms->usage
         ? &ms->usage->subsection_map[0] : NULL;
 
···
      * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
      */
     bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
-    if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+    empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+    if (empty) {
         unsigned long section_nr = pfn_to_section_nr(pfn);
 
         /*
···
             ms->usage = NULL;
         }
         memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-        ms->section_mem_map = (unsigned long)NULL;
     }
 
     if (section_is_early && memmap)
         free_map_bootmem(memmap);
     else
         depopulate_section_memmap(pfn, nr_pages, altmap);
+
+    if (empty)
+        ms->section_mem_map = (unsigned long)NULL;
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
+7 -4
mm/vmalloc.c
···
      * First make sure the mappings are removed from all page-tables
      * before they are freed.
      */
-    vmalloc_sync_all();
+    vmalloc_sync_unmappings();
 
     /*
      * TODO: to calculate a flush range without looping.
···
 EXPORT_SYMBOL(remap_vmalloc_range);
 
 /*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
+ * not to have one.
  *
  * The purpose of this function is to make sure the vmalloc area
  * mappings are identical in all page-tables in the system.
  */
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
 {
 }
 
+void __weak vmalloc_sync_unmappings(void)
+{
+}
 
 static int f(pte_t *pte, unsigned long addr, void *data)
 {