Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

slab: simplify kmalloc_nolock()

The kmalloc_nolock() implementation has several complications and
restrictions due to SLUB's cpu slab locking, lockless fastpath and
PREEMPT_RT differences. With cpu slab usage removed, we can simplify
things:

- relax the PREEMPT_RT context checks back to what they were before
commit 99a3e3a1cfc9 ("slab: fix kmalloc_nolock() context check for
PREEMPT_RT") and reference the explanation comment in the page
allocator

- the local_lock_cpu_slab() macros became unused, remove them

- we no longer need to set up lockdep classes on PREEMPT_RT

- we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL
since there's no lockless cpu freelist manipulation anymore

- __slab_alloc_node() can be called from kmalloc_nolock_noprof()
unconditionally. It also no longer returns -EBUSY. But trylock failures
can still happen, so retry with a larger bucket if the allocation fails
for any reason (see the sketch after this list).
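
A condensed userspace sketch of the retry flow from the last point
above, for illustration only: try_alloc_from_bucket() and
bucket_object_size() are invented stand-ins for __slab_alloc_node() and
the kmalloc bucket lookup, not kernel APIs.

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdlib.h>

  /* Stand-in for a trylock-based bucket allocation that may fail. */
  static void *try_alloc_from_bucket(size_t bucket_size)
  {
          return malloc(bucket_size);
  }

  /* Stand-in for rounding a request up to a kmalloc bucket size. */
  static size_t bucket_object_size(size_t size)
  {
          size_t bucket = 8;

          while (bucket < size)
                  bucket *= 2;
          return bucket;
  }

  void *alloc_nolock_sketch(size_t size)
  {
          bool can_retry = true;
          void *ret;

  retry:
          ret = try_alloc_from_bucket(bucket_object_size(size));

          /*
           * A trylock failure on one bucket is unlikely to repeat on the
           * next larger bucket, so bump the size once and try again.
           */
          if (!ret && can_retry) {
                  size = bucket_object_size(size) + 1; /* next bucket */
                  can_retry = false;
                  goto retry;
          }
          return ret;
  }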

Note that we still need __CMPXCHG_DOUBLE: while we no longer use
cmpxchg16b on the cpu freelist, we still use it on the slab freelist,
and the alternative is slab_lock(), which can be interrupted by an NMI.
Clarify the comment to mention this specifically.
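
For readers less familiar with the wide freelist cmpxchg, here is a
minimal userspace sketch of why it matters. The struct and
freelist_pop() below are invented for illustration and are not the SLUB
data structures; the sketch assumes a toolchain where 16-byte atomics
are lock-free (e.g. x86-64 built with -mcx16), otherwise libatomic may
fall back to a lock and defeat the point. Because the (freelist,
counter) pair is replaced in a single atomic operation and no lock is
ever held, an NMI arriving mid-update simply observes either the old or
the new pair; with slab_lock() the NMI could interrupt the lock holder.

  #include <stdbool.h>
  #include <stdint.h>

  struct freelist_aba {
          void *freelist;         /* first free object in a slab */
          uint64_t counter;       /* transaction counter to avoid ABA */
  } __attribute__((aligned(16)));

  /* Pop one object from the freelist with a single 16-byte CAS. */
  static void *freelist_pop(struct freelist_aba *head,
                            void *(*next_of)(void *))
  {
          struct freelist_aba old, new;

          __atomic_load(head, &old, __ATOMIC_ACQUIRE);
          do {
                  if (!old.freelist)
                          return NULL;
                  new.freelist = next_of(old.freelist);
                  new.counter = old.counter + 1;
                  /* On failure, 'old' is refreshed with the current value. */
          } while (!__atomic_compare_exchange(head, &old, &new, false,
                                              __ATOMIC_ACQ_REL,
                                              __ATOMIC_ACQUIRE));
          return old.freelist;
  }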

Acked-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Hao Li <hao.li@linux.dev>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

 mm/slab.h |   1 -
 mm/slub.c | 144 +++++++++----------------
 2 files changed, 29 insertions(+), 116 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -190,7 +190,6 @@
  */
 struct kmem_cache {
 	struct kmem_cache_cpu __percpu *cpu_slab;
-	struct lock_class_key lock_key;
 	struct slub_percpu_sheaves __percpu *cpu_sheaves;
 	/* Used for retrieving partial slabs, etc. */
 	slab_flags_t flags;
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3690,29 +3690,12 @@

 static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
-#ifdef CONFIG_PREEMPT_RT
-	/*
-	 * Register lockdep key for non-boot kmem caches to avoid
-	 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
-	 */
-	bool finegrain_lockdep = !init_section_contains(s, 1);
-#else
-	/*
-	 * Don't bother with different lockdep classes for each
-	 * kmem_cache, since we only use local_trylock_irqsave().
-	 */
-	bool finegrain_lockdep = false;
-#endif
 	int cpu;
 	struct kmem_cache_cpu *c;

-	if (finegrain_lockdep)
-		lockdep_register_key(&s->lock_key);
 	for_each_possible_cpu(cpu) {
 		c = per_cpu_ptr(s->cpu_slab, cpu);
 		local_trylock_init(&c->lock);
-		if (finegrain_lockdep)
-			lockdep_set_class(&c->lock, &s->lock_key);
 		c->tid = init_tid(cpu);
 	}
 }
@@ -3781,47 +3798,6 @@
 		stat(s, DEACTIVATE_FULL);
 	}
 }
-
-/*
- * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
- * can be acquired without a deadlock before invoking the function.
- *
- * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
- * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
- * and kmalloc() is not used in an unsupported context.
- *
- * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
- * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
- * lockdep_assert() will catch a bug in case:
- * #1
- * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
- * or
- * #2
- * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
- *
- * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
- * disabled context. The lock will always be acquired and if needed it
- * block and sleep until the lock is available.
- * #1 is possible in !PREEMPT_RT only.
- * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
- * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
- * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
- *
- * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
- */
-#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
-#define local_lock_cpu_slab(s, flags) \
-	local_lock_irqsave(&(s)->cpu_slab->lock, flags)
-#else
-#define local_lock_cpu_slab(s, flags) \
-	do { \
-		bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
-		lockdep_assert(__l); \
-	} while (0)
-#endif
-
-#define local_unlock_cpu_slab(s, flags) \
-	local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)

 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
@@ -4346,20 +4404,6 @@

 	return object;
 }
-
-/*
- * We disallow kprobes in ___slab_alloc() to prevent reentrance
- *
- * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
- * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
- * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
- * manipulating c->freelist without lock.
- *
- * This does not prevent kprobe in functions called from ___slab_alloc() such as
- * local_lock_irqsave() itself, and that is fine, we only need to protect the
- * c->freelist manipulation in ___slab_alloc() itself.
- */
-NOKPROBE_SYMBOL(___slab_alloc);

 static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
 		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
@@ -5187,13 +5259,13 @@
 	if (unlikely(!size))
 		return ZERO_SIZE_PTR;

-	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
-		/*
-		 * kmalloc_nolock() in PREEMPT_RT is not supported from
-		 * non-preemptible context because local_lock becomes a
-		 * sleeping lock on RT.
-		 */
+	/*
+	 * See the comment for the same check in
+	 * alloc_frozen_pages_nolock_noprof()
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
 		return NULL;
+
 retry:
 	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
 		return NULL;
@@ -5202,10 +5274,11 @@
 	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
 		/*
 		 * kmalloc_nolock() is not supported on architectures that
-		 * don't implement cmpxchg16b, but debug caches don't use
-		 * per-cpu slab and per-cpu partial slabs. They rely on
-		 * kmem_cache_node->list_lock, so kmalloc_nolock() can
-		 * attempt to allocate from debug caches by
+		 * don't implement cmpxchg16b and thus need slab_lock()
+		 * which could be preempted by a nmi.
+		 * But debug caches don't use that and only rely on
+		 * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
+		 * to allocate from debug caches by
 		 * spin_trylock_irqsave(&n->list_lock, ...)
 		 */
 		return NULL;
@@ -5215,42 +5286,31 @@
 	if (ret)
 		goto success;

-	ret = ERR_PTR(-EBUSY);
-
 	/*
 	 * Do not call slab_alloc_node(), since trylock mode isn't
 	 * compatible with slab_pre_alloc_hook/should_failslab and
 	 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
 	 * and slab_post_alloc_hook() directly.
-	 *
-	 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
-	 * in irq saved region. It assumes that the same cpu will not
-	 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
-	 * Therefore use in_nmi() to check whether particular bucket is in
-	 * irq protected section.
-	 *
-	 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
-	 * this cpu was interrupted somewhere inside ___slab_alloc() after
-	 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
-	 * In this case fast path with __update_cpu_freelist_fast() is not safe.
 	 */
-	if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
-		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+	ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

-	if (PTR_ERR(ret) == -EBUSY) {
-		if (can_retry) {
-			/* pick the next kmalloc bucket */
-			size = s->object_size + 1;
-			/*
-			 * Another alternative is to
-			 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
-			 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
-			 * to retry from bucket of the same size.
-			 */
-			can_retry = false;
-			goto retry;
-		}
-		ret = NULL;
+	/*
+	 * It's possible we failed due to trylock as we preempted someone with
+	 * the sheaves locked, and the list_lock is also held by another cpu.
+	 * But it should be rare that multiple kmalloc buckets would have
+	 * sheaves locked, so try a larger one.
+	 */
+	if (!ret && can_retry) {
+		/* pick the next kmalloc bucket */
+		size = s->object_size + 1;
+		/*
+		 * Another alternative is to
+		 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+		 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+		 * to retry from bucket of the same size.
+		 */
+		can_retry = false;
+		goto retry;
 	}

 success:
@@ -7292,10 +7374,6 @@
 {
 	cache_random_seq_destroy(s);
 	pcs_destroy(s);
-#ifdef CONFIG_PREEMPT_RT
-	if (s->cpu_slab)
-		lockdep_unregister_key(&s->lock_key);
-#endif
 	free_percpu(s->cpu_slab);
 	free_kmem_cache_nodes(s);
 }