mm/page_alloc: prevent pcp corruption with SMP=n

The kernel test robot has reported:

BUG: spinlock trylock failure on UP on CPU#0, kcompactd0/28
lock: 0xffff888807e35ef0, .magic: dead4ead, .owner: kcompactd0/28, .owner_cpu: 0
CPU: 0 UID: 0 PID: 28 Comm: kcompactd0 Not tainted 6.18.0-rc5-00127-ga06157804399 #1 PREEMPT 8cc09ef94dcec767faa911515ce9e609c45db470
Call Trace:
<IRQ>
__dump_stack (lib/dump_stack.c:95)
dump_stack_lvl (lib/dump_stack.c:123)
dump_stack (lib/dump_stack.c:130)
spin_dump (kernel/locking/spinlock_debug.c:71)
do_raw_spin_trylock (kernel/locking/spinlock_debug.c:?)
_raw_spin_trylock (include/linux/spinlock_api_smp.h:89 kernel/locking/spinlock.c:138)
__free_frozen_pages (mm/page_alloc.c:2973)
___free_pages (mm/page_alloc.c:5295)
__free_pages (mm/page_alloc.c:5334)
tlb_remove_table_rcu (include/linux/mm.h:? include/linux/mm.h:3122 include/asm-generic/tlb.h:220 mm/mmu_gather.c:227 mm/mmu_gather.c:290)
? __cfi_tlb_remove_table_rcu (mm/mmu_gather.c:289)
? rcu_core (kernel/rcu/tree.c:?)
rcu_core (include/linux/rcupdate.h:341 kernel/rcu/tree.c:2607 kernel/rcu/tree.c:2861)
rcu_core_si (kernel/rcu/tree.c:2879)
handle_softirqs (arch/x86/include/asm/jump_label.h:36 include/trace/events/irq.h:142 kernel/softirq.c:623)
__irq_exit_rcu (arch/x86/include/asm/jump_label.h:36 kernel/softirq.c:725)
irq_exit_rcu (kernel/softirq.c:741)
sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1052)
</IRQ>
<TASK>
RIP: 0010:_raw_spin_unlock_irqrestore (arch/x86/include/asm/preempt.h:95 include/linux/spinlock_api_smp.h:152 kernel/locking/spinlock.c:194)
free_pcppages_bulk (mm/page_alloc.c:1494)
drain_pages_zone (include/linux/spinlock.h:391 mm/page_alloc.c:2632)
__drain_all_pages (mm/page_alloc.c:2731)
drain_all_pages (mm/page_alloc.c:2747)
kcompactd (mm/compaction.c:3115)
kthread (kernel/kthread.c:465)
? __cfi_kcompactd (mm/compaction.c:3166)
? __cfi_kthread (kernel/kthread.c:412)
ret_from_fork (arch/x86/kernel/process.c:164)
? __cfi_kthread (kernel/kthread.c:412)
ret_from_fork_asm (arch/x86/entry/entry_64.S:255)
</TASK>

Matthew has analyzed the report and identified that in drain_pages_zone()
we are in a section protected by spin_lock(&pcp->lock) and then get an
interrupt that attempts spin_trylock() on the same lock. The code is
designed to work this way without disabling IRQs and occasionally fail the
trylock with a fallback. However, the SMP=n spinlock implementation
assumes spin_trylock() will always succeed, and thus it's normally a
no-op. Here the enabled lock debugging catches the problem, but otherwise
it could cause a corruption of the pcp structure.

The problem has been introduced by commit 574907741599 ("mm/page_alloc:
leave IRQs enabled for per-cpu page allocations"). The pcp locking scheme
recognizes the need for disabling IRQs to prevent nesting spin_trylock()
sections on SMP=n, but the need to prevent the nesting in spin_lock() has
not been recognized. Fix it by introducing local wrappers that change the
spin_lock() to spin_lock_irqsave() with SMP=n and use them in all places
that do spin_lock(&pcp->lock).

[vbabka@suse.cz: add pcp_ prefix to the spin_lock_irqsave wrappers, per Steven]
Link: https://lkml.kernel.org/r/20260105-fix-pcp-up-v1-1-5579662d2071@suse.cz
Fixes: 574907741599 ("mm/page_alloc: leave IRQs enabled for per-cpu page allocations")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202512101320.e2f2dd6f-lkp@intel.com
Analyzed-by: Matthew Wilcox <willy@infradead.org>
Link: https://lore.kernel.org/all/aUW05pyc9nZkvY-1@casper.infradead.org/
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Vlastimil Babka and committed by

Andrew Morton 5 months ago 038a1025 4b5c493f

+39 -8

1 changed file

expand all

page_alloc.c

+39 -8

mm/page_alloc.c

··· 167 167 pcp_trylock_finish(UP_flags); \ 168 168 }) 169 169 170 + /* 171 + * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. 172 + * a potentially remote cpu drain) and get interrupted by an operation that 173 + * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP 174 + * spinlock assumptions making the trylock a no-op. So we have to turn that 175 + * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no 176 + * remote cpu's so we can only be locking the only existing local one. 177 + */ 178 + #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) 179 + static inline void __flags_noop(unsigned long *flags) { } 180 + #define pcp_spin_lock_maybe_irqsave(ptr, flags) \ 181 + ({ \ 182 + __flags_noop(&(flags)); \ 183 + spin_lock(&(ptr)->lock); \ 184 + }) 185 + #define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ 186 + ({ \ 187 + spin_unlock(&(ptr)->lock); \ 188 + __flags_noop(&(flags)); \ 189 + }) 190 + #else 191 + #define pcp_spin_lock_maybe_irqsave(ptr, flags) \ 192 + spin_lock_irqsave(&(ptr)->lock, flags) 193 + #define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ 194 + spin_unlock_irqrestore(&(ptr)->lock, flags) 195 + #endif 196 + 170 197 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 171 198 DEFINE_PER_CPU(int, numa_node); 172 199 EXPORT_PER_CPU_SYMBOL(numa_node); ··· 2583 2556 bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) 2584 2557 { 2585 2558 int high_min, to_drain, to_drain_batched, batch; 2559 + unsigned long UP_flags; 2586 2560 bool todo = false; 2587 2561 2588 2562 high_min = READ_ONCE(pcp->high_min); ··· 2603 2575 to_drain = pcp->count - pcp->high; 2604 2576 while (to_drain > 0) { 2605 2577 to_drain_batched = min(to_drain, batch); 2606 - spin_lock(&pcp->lock); 2578 + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); 2607 2579 free_pcppages_bulk(zone, to_drain_batched, pcp, 0); 2608 - spin_unlock(&pcp->lock); 2580 + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); 2609 2581 
todo = true; 2610 2582 2611 2583 to_drain -= to_drain_batched; ··· 2622 2594 */ 2623 2595 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2624 2596 { 2597 + unsigned long UP_flags; 2625 2598 int to_drain, batch; 2626 2599 2627 2600 batch = READ_ONCE(pcp->batch); 2628 2601 to_drain = min(pcp->count, batch); 2629 2602 if (to_drain > 0) { 2630 - spin_lock(&pcp->lock); 2603 + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); 2631 2604 free_pcppages_bulk(zone, to_drain, pcp, 0); 2632 - spin_unlock(&pcp->lock); 2605 + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); 2633 2606 } 2634 2607 } 2635 2608 #endif ··· 2641 2612 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2642 2613 { 2643 2614 struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 2615 + unsigned long UP_flags; 2644 2616 int count; 2645 2617 2646 2618 do { 2647 - spin_lock(&pcp->lock); 2619 + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); 2648 2620 count = pcp->count; 2649 2621 if (count) { 2650 2622 int to_drain = min(count, ··· 2654 2624 free_pcppages_bulk(zone, to_drain, pcp, 0); 2655 2625 count -= to_drain; 2656 2626 } 2657 - spin_unlock(&pcp->lock); 2627 + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); 2658 2628 } while (count); 2659 2629 } 2660 2630 ··· 6139 6109 { 6140 6110 struct per_cpu_pages *pcp; 6141 6111 struct cpu_cacheinfo *cci; 6112 + unsigned long UP_flags; 6142 6113 6143 6114 pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 6144 6115 cci = get_cpu_cacheinfo(cpu); ··· 6150 6119 * This can reduce zone lock contention without hurting 6151 6120 * cache-hot pages sharing. 
6152 6121 */ 6153 - spin_lock(&pcp->lock); 6122 + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); 6154 6123 if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) 6155 6124 pcp->flags |= PCPF_FREE_HIGH_BATCH; 6156 6125 else 6157 6126 pcp->flags &= ~PCPF_FREE_HIGH_BATCH; 6158 - spin_unlock(&pcp->lock); 6127 + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); 6159 6128 } 6160 6129 6161 6130 void setup_pcp_cacheinfo(unsigned int cpu)

Configure Feed

Configure Feed