Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'slab/for-7.0/sheaves' into slab/for-next

Merge series "slab: replace cpu (partial) slabs with sheaves".

The percpu sheaves caching layer was introduced as opt-in but the goal
was to eventually move all caches to them. This is the next step,
enabling sheaves for all caches (except the two bootstrap ones) and then
removing the per cpu (partial) slabs and lots of associated code.

Besides the lower locking overhead and much more likely fastpath when
freeing, this removes the rather complicated code related to the cpu
slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all its
complications for PREEMPT_RT or kmalloc_nolock().

The lockless slab freelist+counters update operation using
try_cmpxchg128/64 remains and is crucial for freeing remote NUMA objects
and to allow flushing objects from sheaves to slabs mostly without the
node list_lock.

Link: https://lore.kernel.org/all/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/

+997 -1786
-6
include/linux/slab.h
··· 57 57 #endif 58 58 _SLAB_OBJECT_POISON, 59 59 _SLAB_CMPXCHG_DOUBLE, 60 - #ifdef CONFIG_SLAB_OBJ_EXT 61 60 _SLAB_NO_OBJ_EXT, 62 - #endif 63 61 #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 64 62 _SLAB_OBJ_EXT_IN_OBJ, 65 63 #endif ··· 239 241 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 240 242 241 243 /* Slab created using create_boot_cache */ 242 - #ifdef CONFIG_SLAB_OBJ_EXT 243 244 #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) 244 - #else 245 - #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED 246 - #endif 247 245 248 246 #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 249 247 #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
-11
mm/Kconfig
··· 247 247 out which slabs are relevant to a particular load. 248 248 Try running: slabinfo -DA 249 249 250 - config SLUB_CPU_PARTIAL 251 - default y 252 - depends on SMP && !SLUB_TINY 253 - bool "Enable per cpu partial caches" 254 - help 255 - Per cpu partial caches accelerate objects allocation and freeing 256 - that is local to a processor at the price of more indeterminism 257 - in the latency of the free. On overflow these caches will be cleared 258 - which requires the taking of locks that may cause latency spikes. 259 - Typically one would choose no for a realtime system. 260 - 261 250 config RANDOM_KMALLOC_CACHES 262 251 default n 263 252 depends on !SLUB_TINY
+1
mm/internal.h
··· 846 846 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); 847 847 #define alloc_frozen_pages_nolock(...) \ 848 848 alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) 849 + void free_frozen_pages_nolock(struct page *page, unsigned int order); 849 850 850 851 extern void zone_pcp_reset(struct zone *zone); 851 852 extern void zone_pcp_disable(struct zone *zone);
+5
mm/page_alloc.c
··· 2981 2981 __free_frozen_pages(page, order, FPI_NONE); 2982 2982 } 2983 2983 2984 + void free_frozen_pages_nolock(struct page *page, unsigned int order) 2985 + { 2986 + __free_frozen_pages(page, order, FPI_TRYLOCK); 2987 + } 2988 + 2984 2989 /* 2985 2990 * Free a batch of folios 2986 2991 */
+17 -40
mm/slab.h
··· 21 21 # define system_has_freelist_aba() system_has_cmpxchg128() 22 22 # define try_cmpxchg_freelist try_cmpxchg128 23 23 # endif 24 - #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 25 24 typedef u128 freelist_full_t; 26 25 #else /* CONFIG_64BIT */ 27 26 # ifdef system_has_cmpxchg64 28 27 # define system_has_freelist_aba() system_has_cmpxchg64() 29 28 # define try_cmpxchg_freelist try_cmpxchg64 30 29 # endif 31 - #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 32 30 typedef u64 freelist_full_t; 33 31 #endif /* CONFIG_64BIT */ 34 32 ··· 77 79 struct kmem_cache *slab_cache; 78 80 union { 79 81 struct { 80 - union { 81 - struct list_head slab_list; 82 - struct { /* For deferred deactivate_slab() */ 83 - struct llist_node llnode; 84 - void *flush_freelist; 85 - }; 86 - #ifdef CONFIG_SLUB_CPU_PARTIAL 87 - struct { 88 - struct slab *next; 89 - int slabs; /* Nr of slabs left */ 90 - }; 91 - #endif 92 - }; 82 + struct list_head slab_list; 93 83 /* Double-word boundary */ 94 84 struct freelist_counters; 95 85 }; ··· 182 196 return PAGE_SIZE << slab_order(slab); 183 197 } 184 198 185 - #ifdef CONFIG_SLUB_CPU_PARTIAL 186 - #define slub_percpu_partial(c) ((c)->partial) 187 - 188 - #define slub_set_percpu_partial(c, p) \ 189 - ({ \ 190 - slub_percpu_partial(c) = (p)->next; \ 191 - }) 192 - 193 - #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) 194 - #else 195 - #define slub_percpu_partial(c) NULL 196 - 197 - #define slub_set_percpu_partial(c, p) 198 - 199 - #define slub_percpu_partial_read_once(c) NULL 200 - #endif // CONFIG_SLUB_CPU_PARTIAL 201 - 202 199 /* 203 200 * Word size structure that can be atomically updated or read and that 204 201 * contains both the order and the number of objects that a slab of the ··· 195 226 * Slab cache management. 196 227 */ 197 228 struct kmem_cache { 198 - struct kmem_cache_cpu __percpu *cpu_slab; 199 - struct lock_class_key lock_key; 200 229 struct slub_percpu_sheaves __percpu *cpu_sheaves; 201 230 /* Used for retrieving partial slabs, etc. */ 202 231 slab_flags_t flags; ··· 203 236 unsigned int object_size; /* Object size without metadata */ 204 237 struct reciprocal_value reciprocal_size; 205 238 unsigned int offset; /* Free pointer offset */ 206 - #ifdef CONFIG_SLUB_CPU_PARTIAL 207 - /* Number of per cpu partial objects to keep around */ 208 - unsigned int cpu_partial; 209 - /* Number of per cpu partial slabs to keep around */ 210 - unsigned int cpu_partial_slabs; 211 - #endif 212 239 unsigned int sheaf_capacity; 213 240 struct kmem_cache_order_objects oo; 214 241 ··· 243 282 unsigned int usersize; /* Usercopy region size */ 244 283 #endif 245 284 285 + #ifdef CONFIG_SLUB_STATS 286 + struct kmem_cache_stats __percpu *cpu_stats; 287 + #endif 288 + 246 289 struct kmem_cache_node *node[MAX_NUMNODES]; 247 290 }; 291 + 292 + /* 293 + * Every cache has !NULL s->cpu_sheaves but they may point to the 294 + * bootstrap_sheaf temporarily during init, or permanently for the boot caches 295 + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This 296 + * helper distinguishes whether cache has real non-bootstrap sheaves. 297 + */ 298 + static inline bool cache_has_sheaves(struct kmem_cache *s) 299 + { 300 + /* Test CONFIG_SLUB_TINY for code elimination purposes */ 301 + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; 302 + } 248 303 249 304 #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) 250 305 #define SLAB_SUPPORTS_SYSFS 1
+3 -6
mm/slab_common.c
··· 1604 1604 return false; 1605 1605 1606 1606 s = slab->slab_cache; 1607 - if (s->cpu_sheaves) { 1608 - if (likely(!IS_ENABLED(CONFIG_NUMA) || 1609 - slab_nid(slab) == numa_mem_id())) 1610 - return __kfree_rcu_sheaf(s, obj); 1611 - } 1607 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) 1608 + return __kfree_rcu_sheaf(s, obj); 1612 1609 1613 1610 return false; 1614 1611 } ··· 2109 2112 */ 2110 2113 void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) 2111 2114 { 2112 - if (s->cpu_sheaves) { 2115 + if (cache_has_sheaves(s)) { 2113 2116 flush_rcu_sheaves_on_cache(s); 2114 2117 rcu_barrier(); 2115 2118 }
+971 -1723
mm/slub.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 - * SLUB: A slab allocator that limits cache line use instead of queuing 4 - * objects in per cpu and per node lists. 3 + * SLUB: A slab allocator with low overhead percpu array caches and mostly 4 + * lockless freeing of objects to slabs in the slowpath. 5 5 * 6 - * The allocator synchronizes using per slab locks or atomic operations 7 - * and only uses a centralized lock to manage a pool of partial slabs. 6 + * The allocator synchronizes using spin_trylock for percpu arrays in the 7 + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. 8 + * Uses a centralized lock to manage a pool of partial slabs. 8 9 * 9 10 * (C) 2007 SGI, Christoph Lameter 10 11 * (C) 2011 Linux Foundation, Christoph Lameter 12 + * (C) 2025 SUSE, Vlastimil Babka 11 13 */ 12 14 13 15 #include <linux/mm.h> ··· 55 53 56 54 /* 57 55 * Lock order: 58 - * 1. slab_mutex (Global Mutex) 59 - * 2. node->list_lock (Spinlock) 60 - * 3. kmem_cache->cpu_slab->lock (Local lock) 61 - * 4. slab_lock(slab) (Only on some arches) 62 - * 5. object_map_lock (Only for debugging) 56 + * 0. cpu_hotplug_lock 57 + * 1. slab_mutex (Global Mutex) 58 + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) 59 + * 2b. node->barn->lock (Spinlock) 60 + * 2c. node->list_lock (Spinlock) 61 + * 3. slab_lock(slab) (Only on some arches) 62 + * 4. object_map_lock (Only for debugging) 63 63 * 64 64 * slab_mutex 65 65 * ··· 82 78 * C. slab->objects -> Number of objects in slab 83 79 * D. slab->frozen -> frozen state 84 80 * 81 + * SL_partial slabs 82 + * 83 + * Slabs on node partial list have at least one free object. A limited number 84 + * of slabs on the list can be fully free (slab->inuse == 0), until we start 85 + * discarding them. These slabs are marked with SL_partial, and the flag is 86 + * cleared while removing them, usually to grab their freelist afterwards. 87 + * This clearing also exempts them from list management. Please see 88 + * __slab_free() for more details. 89 + * 90 + * Full slabs 91 + * 92 + * For caches without debugging enabled, full slabs (slab->inuse == 93 + * slab->objects and slab->freelist == NULL) are not placed on any list. 94 + * The __slab_free() freeing the first object from such a slab will place 95 + * it on the partial list. Caches with debugging enabled place such slab 96 + * on the full list and use different allocation and freeing paths. 97 + * 85 98 * Frozen slabs 86 99 * 87 - * If a slab is frozen then it is exempt from list management. It is 88 - * the cpu slab which is actively allocated from by the processor that 89 - * froze it and it is not on any list. The processor that froze the 90 - * slab is the one who can perform list operations on the slab. Other 91 - * processors may put objects onto the freelist but the processor that 92 - * froze the slab is the only one that can retrieve the objects from the 93 - * slab's freelist. 94 - * 95 - * CPU partial slabs 96 - * 97 - * The partially empty slabs cached on the CPU partial list are used 98 - * for performance reasons, which speeds up the allocation process. 99 - * These slabs are not frozen, but are also exempt from list management, 100 - * by clearing the SL_partial flag when moving out of the node 101 - * partial list. Please see __slab_free() for more details. 100 + * If a slab is frozen then it is exempt from list management. It is used to 101 + * indicate a slab that has failed consistency checks and thus cannot be 102 + * allocated from anymore - it is also marked as full. 
Any previously 103 + * allocated objects will be simply leaked upon freeing instead of attempting 104 + * to modify the potentially corrupted freelist and metadata. 102 105 * 103 106 * To sum up, the current scheme is: 104 - * - node partial slab: SL_partial && !frozen 105 - * - cpu partial slab: !SL_partial && !frozen 106 - * - cpu slab: !SL_partial && frozen 107 - * - full slab: !SL_partial && !frozen 107 + * - node partial slab: SL_partial && !full && !frozen 108 + * - taken off partial list: !SL_partial && !full && !frozen 109 + * - full slab, not on any list: !SL_partial && full && !frozen 110 + * - frozen due to inconsistency: !SL_partial && full && frozen 108 111 * 109 - * list_lock 112 + * node->list_lock (spinlock) 110 113 * 111 114 * The list_lock protects the partial and full list on each node and 112 115 * the partial slab counter. If taken then no new slabs may be added or ··· 123 112 * 124 113 * The list_lock is a centralized lock and thus we avoid taking it as 125 114 * much as possible. As long as SLUB does not have to handle partial 126 - * slabs, operations can continue without any centralized lock. F.e. 127 - * allocating a long series of objects that fill up slabs does not require 128 - * the list lock. 115 + * slabs, operations can continue without any centralized lock. 129 116 * 130 117 * For debug caches, all allocations are forced to go through a list_lock 131 118 * protected region to serialize against concurrent validation. 132 119 * 133 - * cpu_slab->lock local lock 120 + * cpu_sheaves->lock (local_trylock) 134 121 * 135 - * This locks protect slowpath manipulation of all kmem_cache_cpu fields 136 - * except the stat counters. This is a percpu structure manipulated only by 137 - * the local cpu, so the lock protects against being preempted or interrupted 138 - * by an irq. Fast path operations rely on lockless operations instead. 122 + * This lock protects fastpath operations on the percpu sheaves. On !RT it 123 + * only disables preemption and does no atomic operations. As long as the main 124 + * or spare sheaf can handle the allocation or free, there is no other 125 + * overhead. 139 126 * 140 - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption 141 - * which means the lockless fastpath cannot be used as it might interfere with 142 - * an in-progress slow path operations. In this case the local lock is always 143 - * taken but it still utilizes the freelist for the common operations. 127 + * node->barn->lock (spinlock) 144 128 * 145 - * lockless fastpaths 129 + * This lock protects the operations on per-NUMA-node barn. It can quickly 130 + * serve an empty or full sheaf if available, and avoid more expensive refill 131 + * or flush operation. 146 132 * 147 - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) 148 - * are fully lockless when satisfied from the percpu slab (and when 149 - * cmpxchg_double is possible to use, otherwise slab_lock is taken). 150 - * They also don't disable preemption or migration or irqs. They rely on 151 - * the transaction id (tid) field to detect being preempted or moved to 152 - * another cpu. 133 + * Lockless freeing 134 + * 135 + * Objects may have to be freed to their slabs when they are from a remote 136 + * node (where we want to avoid filling local sheaves with remote objects) 137 + * or when there are too many full sheaves. 
On architectures supporting 138 + * cmpxchg_double this is done by a lockless update of slab's freelist and 139 + * counters, otherwise slab_lock is taken. This only needs to take the 140 + * list_lock if it's a first free to a full slab, or when a slab becomes empty 141 + * after the free. 153 142 * 154 143 * irq, preemption, migration considerations 155 144 * 156 - * Interrupts are disabled as part of list_lock or local_lock operations, or 145 + * Interrupts are disabled as part of list_lock or barn lock operations, or 157 146 * around the slab_lock operation, in order to make the slab allocator safe 158 147 * to use in the context of an irq. 148 + * Preemption is disabled as part of local_trylock operations. 149 + * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see 150 + * their limitations. 159 151 * 160 - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the 161 - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the 162 - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer 163 - * doesn't have to be revalidated in each section protected by the local lock. 164 - * 165 - * SLUB assigns one slab for allocation to each processor. 166 - * Allocations only occur from these slabs called cpu slabs. 152 + * SLUB assigns two object arrays called sheaves for caching allocations and 153 + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. 154 + * Allocations and frees are primarily served from these sheaves. 167 155 * 168 156 * Slabs with free elements are kept on a partial list and during regular 169 157 * operations no list for full slabs is used. If an object in a full slab is ··· 170 160 * We track full slabs for debugging purposes though because otherwise we 171 161 * cannot scan all objects. 172 162 * 173 - * Slabs are freed when they become empty. Teardown and setup is 174 - * minimal so we rely on the page allocators per cpu caches for 175 - * fast frees and allocs. 176 - * 177 - * slab->frozen The slab is frozen and exempt from list processing. 178 - * This means that the slab is dedicated to a purpose 179 - * such as satisfying allocations for a specific 180 - * processor. Objects may be freed in the slab while 181 - * it is frozen but slab_free will then skip the usual 182 - * list operations. It is up to the processor holding 183 - * the slab to integrate the slab into the slab lists 184 - * when the slab is no longer needed. 185 - * 186 - * One use of this flag is to mark slabs that are 187 - * used for allocations. Then such a slab becomes a cpu 188 - * slab. The cpu slab may be equipped with an additional 189 - * freelist that allows lockless access to 190 - * free objects in addition to the regular freelist 191 - * that requires the slab lock. 163 + * Slabs are freed when they become empty. Teardown and setup is minimal so we 164 + * rely on the page allocators per cpu caches for fast frees and allocs. 192 165 * 193 166 * SLAB_DEBUG_FLAGS Slab requires special handling due to debug 194 167 * options set. This moves slab handling out of ··· 194 201 SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ 195 202 }; 196 203 197 - /* 198 - * We could simply use migrate_disable()/enable() but as long as it's a 199 - * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
200 - */ 201 - #ifndef CONFIG_PREEMPT_RT 202 - #define slub_get_cpu_ptr(var) get_cpu_ptr(var) 203 - #define slub_put_cpu_ptr(var) put_cpu_ptr(var) 204 - #define USE_LOCKLESS_FAST_PATH() (true) 205 - #else 206 - #define slub_get_cpu_ptr(var) \ 207 - ({ \ 208 - migrate_disable(); \ 209 - this_cpu_ptr(var); \ 210 - }) 211 - #define slub_put_cpu_ptr(var) \ 212 - do { \ 213 - (void)(var); \ 214 - migrate_enable(); \ 215 - } while (0) 216 - #define USE_LOCKLESS_FAST_PATH() (false) 217 - #endif 218 - 219 204 #ifndef CONFIG_SLUB_TINY 220 205 #define __fastpath_inline __always_inline 221 206 #else ··· 212 241 static DEFINE_STATIC_KEY_FALSE(strict_numa); 213 242 #endif 214 243 215 - /* Structure holding parameters for get_partial() call chain */ 244 + /* Structure holding parameters for get_from_partial() call chain */ 216 245 struct partial_context { 217 246 gfp_t flags; 218 247 unsigned int orig_size; 219 - void *object; 248 + }; 249 + 250 + /* Structure holding parameters for get_partial_node_bulk() */ 251 + struct partial_bulk_context { 252 + gfp_t flags; 253 + unsigned int min_objects; 254 + unsigned int max_objects; 255 + struct list_head slabs; 220 256 }; 221 257 222 258 static inline bool kmem_cache_debug(struct kmem_cache *s) ··· 237 259 p += s->red_left_pad; 238 260 239 261 return p; 240 - } 241 - 242 - static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 243 - { 244 - #ifdef CONFIG_SLUB_CPU_PARTIAL 245 - return !kmem_cache_debug(s); 246 - #else 247 - return false; 248 - #endif 249 262 } 250 263 251 264 /* ··· 329 360 static inline void debugfs_slab_add(struct kmem_cache *s) { } 330 361 #endif 331 362 363 + enum add_mode { 364 + ADD_TO_HEAD, 365 + ADD_TO_TAIL, 366 + }; 367 + 332 368 enum stat_item { 333 - ALLOC_PCS, /* Allocation from percpu sheaf */ 334 - ALLOC_FASTPATH, /* Allocation from cpu slab */ 335 - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 336 - FREE_PCS, /* Free to percpu sheaf */ 369 + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ 370 + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ 337 371 FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ 338 372 FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ 339 - FREE_FASTPATH, /* Free to cpu slab */ 340 - FREE_SLOWPATH, /* Freeing not to cpu slab */ 341 - FREE_FROZEN, /* Freeing to frozen slab */ 373 + FREE_FASTPATH, /* Free to percpu sheaves */ 374 + FREE_SLOWPATH, /* Free to a slab */ 342 375 FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ 343 376 FREE_REMOVE_PARTIAL, /* Freeing removes last object */ 344 - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ 345 - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ 346 - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ 347 - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ 377 + ALLOC_SLAB, /* New slab acquired from page allocator */ 378 + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ 348 379 FREE_SLAB, /* Slab freed to the page allocator */ 349 - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ 350 - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ 351 - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ 352 - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ 353 - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ 354 - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ 355 - DEACTIVATE_BYPASS, /* Implicit deactivation */ 356 380 ORDER_FALLBACK, /* Number of times fallback was necessary */ 357 - 
CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ 358 381 CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ 359 - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ 360 - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ 361 - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ 362 - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ 363 382 SHEAF_FLUSH, /* Objects flushed from a sheaf */ 364 383 SHEAF_REFILL, /* Objects refilled to a sheaf */ 365 384 SHEAF_ALLOC, /* Allocation of an empty sheaf */ ··· 364 407 NR_SLUB_STAT_ITEMS 365 408 }; 366 409 367 - struct freelist_tid { 368 - union { 369 - struct { 370 - void *freelist; /* Pointer to next available object */ 371 - unsigned long tid; /* Globally unique transaction id */ 372 - }; 373 - freelist_full_t freelist_tid; 374 - }; 375 - }; 376 - 377 - /* 378 - * When changing the layout, make sure freelist and tid are still compatible 379 - * with this_cpu_cmpxchg_double() alignment requirements. 380 - */ 381 - struct kmem_cache_cpu { 382 - struct freelist_tid; 383 - struct slab *slab; /* The slab from which we are allocating */ 384 - #ifdef CONFIG_SLUB_CPU_PARTIAL 385 - struct slab *partial; /* Partially allocated slabs */ 386 - #endif 387 - local_trylock_t lock; /* Protects the fields above */ 388 410 #ifdef CONFIG_SLUB_STATS 411 + struct kmem_cache_stats { 389 412 unsigned int stat[NR_SLUB_STAT_ITEMS]; 390 - #endif 391 413 }; 414 + #endif 392 415 393 416 static inline void stat(const struct kmem_cache *s, enum stat_item si) 394 417 { ··· 377 440 * The rmw is racy on a preemptible kernel but this is acceptable, so 378 441 * avoid this_cpu_add()'s irq-disable overhead. 379 442 */ 380 - raw_cpu_inc(s->cpu_slab->stat[si]); 443 + raw_cpu_inc(s->cpu_stats->stat[si]); 381 444 #endif 382 445 } 383 446 ··· 385 448 void stat_add(const struct kmem_cache *s, enum stat_item si, int v) 386 449 { 387 450 #ifdef CONFIG_SLUB_STATS 388 - raw_cpu_add(s->cpu_slab->stat[si], v); 451 + raw_cpu_add(s->cpu_stats->stat[si], v); 389 452 #endif 390 453 } 391 454 ··· 474 537 static nodemask_t slab_nodes; 475 538 476 539 /* 477 - * Workqueue used for flush_cpu_slab(). 540 + * Workqueue used for flushing cpu and kfree_rcu sheaves. 478 541 */ 479 542 static struct workqueue_struct *flushwq; 480 543 ··· 531 594 ptr_addr = (unsigned long)object + s->offset; 532 595 p = *(freeptr_t *)(ptr_addr); 533 596 return freelist_ptr_decode(s, p, ptr_addr); 534 - } 535 - 536 - static void prefetch_freepointer(const struct kmem_cache *s, void *object) 537 - { 538 - prefetchw(object + s->offset); 539 - } 540 - 541 - /* 542 - * When running under KMSAN, get_freepointer_safe() may return an uninitialized 543 - * pointer value in the case the current thread loses the race for the next 544 - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in 545 - * slab_alloc_node() will fail, so the uninitialized value won't be used, but 546 - * KMSAN will still check all arguments of cmpxchg because of imperfect 547 - * handling of inline assembly. 548 - * To work around this problem, we apply __no_kmsan_checks to ensure that 549 - * get_freepointer_safe() returns initialized memory. 
550 - */ 551 - __no_kmsan_checks 552 - static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 553 - { 554 - unsigned long freepointer_addr; 555 - freeptr_t p; 556 - 557 - if (!debug_pagealloc_enabled_static()) 558 - return get_freepointer(s, object); 559 - 560 - object = kasan_reset_tag(object); 561 - freepointer_addr = (unsigned long)object + s->offset; 562 - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); 563 - return freelist_ptr_decode(s, p, freepointer_addr); 564 597 } 565 598 566 599 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) ··· 596 689 return x.x & OO_MASK; 597 690 } 598 691 599 - #ifdef CONFIG_SLUB_CPU_PARTIAL 600 - static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 601 - { 602 - unsigned int nr_slabs; 603 - 604 - s->cpu_partial = nr_objects; 605 - 606 - /* 607 - * We take the number of objects but actually limit the number of 608 - * slabs on the per cpu partial list, in order to limit excessive 609 - * growth of the list. For simplicity we assume that the slabs will 610 - * be half-full. 611 - */ 612 - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); 613 - s->cpu_partial_slabs = nr_slabs; 614 - } 615 - 616 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 617 - { 618 - return s->cpu_partial_slabs; 619 - } 620 - #else 621 - #ifdef SLAB_SUPPORTS_SYSFS 622 - static inline void 623 - slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 624 - { 625 - } 626 - #endif 627 - 628 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 629 - { 630 - return 0; 631 - } 632 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 633 - 634 692 /* 635 693 * If network-based swap is enabled, slub must keep track of whether memory 636 694 * were allocated from pfmemalloc reserves. 
··· 651 779 if (slab->freelist == old->freelist && 652 780 slab->counters == old->counters) { 653 781 slab->freelist = new->freelist; 654 - slab->counters = new->counters; 782 + /* prevent tearing for the read in get_partial_node_bulk() */ 783 + WRITE_ONCE(slab->counters, new->counters); 655 784 ret = true; 656 785 } 657 786 slab_unlock(slab); ··· 672 799 { 673 800 bool ret; 674 801 675 - if (USE_LOCKLESS_FAST_PATH()) 802 + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) 676 803 lockdep_assert_irqs_disabled(); 677 804 678 805 if (s->flags & __CMPXCHG_DOUBLE) ··· 1051 1178 p->handle = handle; 1052 1179 #endif 1053 1180 p->addr = addr; 1054 - p->cpu = smp_processor_id(); 1181 + p->cpu = raw_smp_processor_id(); 1055 1182 p->pid = current->pid; 1056 1183 p->when = jiffies; 1057 1184 } ··· 1213 1340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 1214 1341 1215 1342 WARN_ON(1); 1216 - } 1217 - 1218 - static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 1219 - void **freelist, void *nextfree) 1220 - { 1221 - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && 1222 - !check_valid_pointer(s, slab, nextfree) && freelist) { 1223 - object_err(s, slab, *freelist, "Freechain corrupt"); 1224 - *freelist = NULL; 1225 - slab_fix(s, "Isolate corrupted freechain"); 1226 - return true; 1227 - } 1228 - 1229 - return false; 1230 1343 } 1231 1344 1232 1345 static void __slab_err(struct slab *slab) ··· 2026 2167 int objects) {} 2027 2168 static inline void dec_slabs_node(struct kmem_cache *s, int node, 2028 2169 int objects) {} 2029 - static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 2030 - void **freelist, void *nextfree) 2031 - { 2032 - return false; 2033 - } 2034 2170 #endif /* CONFIG_SLUB_DEBUG */ 2035 2171 2036 2172 /* ··· 2726 2872 return object; 2727 2873 } 2728 2874 2729 - static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2875 + static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, 2876 + unsigned int capacity) 2730 2877 { 2731 2878 struct slab_sheaf *sheaf; 2732 2879 size_t sheaf_size; ··· 2745 2890 if (s->flags & SLAB_KMALLOC) 2746 2891 gfp |= __GFP_NO_OBJ_EXT; 2747 2892 2748 - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2893 + sheaf_size = struct_size(sheaf, objects, capacity); 2749 2894 sheaf = kzalloc(sheaf_size, gfp); 2750 2895 2751 2896 if (unlikely(!sheaf)) ··· 2758 2903 return sheaf; 2759 2904 } 2760 2905 2906 + static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, 2907 + gfp_t gfp) 2908 + { 2909 + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); 2910 + } 2911 + 2761 2912 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) 2762 2913 { 2763 2914 kfree(sheaf); ··· 2771 2910 stat(s, SHEAF_FREE); 2772 2911 } 2773 2912 2774 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 2775 - size_t size, void **p); 2776 - 2913 + static unsigned int 2914 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 2915 + unsigned int max); 2777 2916 2778 2917 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, 2779 2918 gfp_t gfp) ··· 2784 2923 if (!to_fill) 2785 2924 return 0; 2786 2925 2787 - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, 2788 - &sheaf->objects[sheaf->size]); 2926 + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, 2927 + to_fill); 2789 2928 2790 2929 sheaf->size += filled; 2791 2930 ··· 2986 3125 { 2987 3126 int cpu; 2988 3127 3128 + /* 3129 + * We may be unwinding cache 
creation that failed before or during the 3130 + * allocation of this. 3131 + */ 3132 + if (!s->cpu_sheaves) 3133 + return; 3134 + 3135 + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ 3136 + if (!cache_has_sheaves(s)) 3137 + goto free_pcs; 3138 + 2989 3139 for_each_possible_cpu(cpu) { 2990 3140 struct slub_percpu_sheaves *pcs; 2991 3141 2992 3142 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2993 3143 2994 - /* can happen when unwinding failed create */ 3144 + /* This can happen when unwinding failed cache creation. */ 2995 3145 if (!pcs->main) 2996 3146 continue; 2997 3147 ··· 3024 3152 } 3025 3153 } 3026 3154 3155 + free_pcs: 3027 3156 free_percpu(s->cpu_sheaves); 3028 3157 s->cpu_sheaves = NULL; 3029 3158 } 3030 3159 3031 - static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) 3160 + static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, 3161 + bool allow_spin) 3032 3162 { 3033 3163 struct slab_sheaf *empty = NULL; 3034 3164 unsigned long flags; ··· 3038 3164 if (!data_race(barn->nr_empty)) 3039 3165 return NULL; 3040 3166 3041 - spin_lock_irqsave(&barn->lock, flags); 3167 + if (likely(allow_spin)) 3168 + spin_lock_irqsave(&barn->lock, flags); 3169 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3170 + return NULL; 3042 3171 3043 3172 if (likely(barn->nr_empty)) { 3044 3173 empty = list_first_entry(&barn->sheaves_empty, ··· 3118 3241 * change. 3119 3242 */ 3120 3243 static struct slab_sheaf * 3121 - barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) 3244 + barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, 3245 + bool allow_spin) 3122 3246 { 3123 3247 struct slab_sheaf *full = NULL; 3124 3248 unsigned long flags; ··· 3127 3249 if (!data_race(barn->nr_full)) 3128 3250 return NULL; 3129 3251 3130 - spin_lock_irqsave(&barn->lock, flags); 3252 + if (likely(allow_spin)) 3253 + spin_lock_irqsave(&barn->lock, flags); 3254 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3255 + return NULL; 3131 3256 3132 3257 if (likely(barn->nr_full)) { 3133 3258 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, ··· 3151 3270 * barn. But if there are too many full sheaves, reject this with -E2BIG. 
3152 3271 */ 3153 3272 static struct slab_sheaf * 3154 - barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) 3273 + barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, 3274 + bool allow_spin) 3155 3275 { 3156 3276 struct slab_sheaf *empty; 3157 3277 unsigned long flags; ··· 3163 3281 if (!data_race(barn->nr_empty)) 3164 3282 return ERR_PTR(-ENOMEM); 3165 3283 3166 - spin_lock_irqsave(&barn->lock, flags); 3284 + if (likely(allow_spin)) 3285 + spin_lock_irqsave(&barn->lock, flags); 3286 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3287 + return ERR_PTR(-EBUSY); 3167 3288 3168 3289 if (likely(barn->nr_empty)) { 3169 3290 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, ··· 3470 3585 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 3471 3586 } 3472 3587 3473 - static void __free_slab(struct kmem_cache *s, struct slab *slab) 3588 + static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) 3474 3589 { 3475 3590 struct page *page = slab_page(slab); 3476 3591 int order = compound_order(page); ··· 3481 3596 __ClearPageSlab(page); 3482 3597 mm_account_reclaimed_pages(pages); 3483 3598 unaccount_slab(slab, order, s); 3484 - free_frozen_pages(page, order); 3599 + if (allow_spin) 3600 + free_frozen_pages(page, order); 3601 + else 3602 + free_frozen_pages_nolock(page, order); 3603 + } 3604 + 3605 + static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) 3606 + { 3607 + /* 3608 + * Since it was just allocated, we can skip the actions in 3609 + * discard_slab() and free_slab(). 3610 + */ 3611 + __free_slab(s, slab, false); 3485 3612 } 3486 3613 3487 3614 static void rcu_free_slab(struct rcu_head *h) 3488 3615 { 3489 3616 struct slab *slab = container_of(h, struct slab, rcu_head); 3490 3617 3491 - __free_slab(slab->slab_cache, slab); 3618 + __free_slab(slab->slab_cache, slab, true); 3492 3619 } 3493 3620 3494 3621 static void free_slab(struct kmem_cache *s, struct slab *slab) ··· 3516 3619 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) 3517 3620 call_rcu(&slab->rcu_head, rcu_free_slab); 3518 3621 else 3519 - __free_slab(s, slab); 3622 + __free_slab(s, slab, true); 3520 3623 } 3521 3624 3522 3625 static void discard_slab(struct kmem_cache *s, struct slab *slab) ··· 3544 3647 * Management of partially allocated slabs. 3545 3648 */ 3546 3649 static inline void 3547 - __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) 3650 + __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) 3548 3651 { 3549 3652 n->nr_partial++; 3550 - if (tail == DEACTIVATE_TO_TAIL) 3653 + if (mode == ADD_TO_TAIL) 3551 3654 list_add_tail(&slab->slab_list, &n->partial); 3552 3655 else 3553 3656 list_add(&slab->slab_list, &n->partial); ··· 3555 3658 } 3556 3659 3557 3660 static inline void add_partial(struct kmem_cache_node *n, 3558 - struct slab *slab, int tail) 3661 + struct slab *slab, enum add_mode mode) 3559 3662 { 3560 3663 lockdep_assert_held(&n->list_lock); 3561 - __add_partial(n, slab, tail); 3664 + __add_partial(n, slab, mode); 3562 3665 } 3563 3666 3564 3667 static inline void remove_partial(struct kmem_cache_node *n, ··· 3609 3712 return object; 3610 3713 } 3611 3714 3612 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); 3613 - 3614 3715 /* 3615 3716 * Called only for kmem_cache_debug() caches to allocate from a freshly 3616 3717 * allocated slab. 
Allocate a single object instead of whole freelist ··· 3624 3729 void *object; 3625 3730 3626 3731 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { 3627 - /* Unlucky, discard newly allocated slab */ 3628 - defer_deactivate_slab(slab, NULL); 3732 + /* Unlucky, discard newly allocated slab. */ 3733 + free_new_slab_nolock(s, slab); 3629 3734 return NULL; 3630 3735 } 3631 3736 ··· 3651 3756 if (slab->inuse == slab->objects) 3652 3757 add_full(s, n, slab); 3653 3758 else 3654 - add_partial(n, slab, DEACTIVATE_TO_HEAD); 3759 + add_partial(n, slab, ADD_TO_HEAD); 3655 3760 3656 3761 inc_slabs_node(s, nid, slab->objects); 3657 3762 spin_unlock_irqrestore(&n->list_lock, flags); ··· 3659 3764 return object; 3660 3765 } 3661 3766 3662 - #ifdef CONFIG_SLUB_CPU_PARTIAL 3663 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); 3664 - #else 3665 - static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, 3666 - int drain) { } 3667 - #endif 3668 3767 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); 3669 3768 3670 - /* 3671 - * Try to allocate a partial slab from a specific node. 3672 - */ 3673 - static struct slab *get_partial_node(struct kmem_cache *s, 3674 - struct kmem_cache_node *n, 3675 - struct partial_context *pc) 3769 + static bool get_partial_node_bulk(struct kmem_cache *s, 3770 + struct kmem_cache_node *n, 3771 + struct partial_bulk_context *pc, 3772 + bool allow_spin) 3676 3773 { 3677 - struct slab *slab, *slab2, *partial = NULL; 3774 + struct slab *slab, *slab2; 3775 + unsigned int total_free = 0; 3678 3776 unsigned long flags; 3679 - unsigned int partial_slabs = 0; 3777 + 3778 + /* Racy check to avoid taking the lock unnecessarily. */ 3779 + if (!n || data_race(!n->nr_partial)) 3780 + return false; 3781 + 3782 + INIT_LIST_HEAD(&pc->slabs); 3783 + 3784 + if (allow_spin) 3785 + spin_lock_irqsave(&n->list_lock, flags); 3786 + else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3787 + return false; 3788 + 3789 + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3790 + struct freelist_counters flc; 3791 + unsigned int slab_free; 3792 + 3793 + if (!pfmemalloc_match(slab, pc->flags)) 3794 + continue; 3795 + 3796 + /* 3797 + * determine the number of free objects in the slab racily 3798 + * 3799 + * slab_free is a lower bound due to possible subsequent 3800 + * concurrent freeing, so the caller may get more objects than 3801 + * requested and must handle that 3802 + */ 3803 + flc.counters = data_race(READ_ONCE(slab->counters)); 3804 + slab_free = flc.objects - flc.inuse; 3805 + 3806 + /* we have already min and this would get us over the max */ 3807 + if (total_free >= pc->min_objects 3808 + && total_free + slab_free > pc->max_objects) 3809 + break; 3810 + 3811 + remove_partial(n, slab); 3812 + 3813 + list_add(&slab->slab_list, &pc->slabs); 3814 + 3815 + total_free += slab_free; 3816 + if (total_free >= pc->max_objects) 3817 + break; 3818 + } 3819 + 3820 + spin_unlock_irqrestore(&n->list_lock, flags); 3821 + return total_free > 0; 3822 + } 3823 + 3824 + /* 3825 + * Try to allocate object from a partial slab on a specific node. 3826 + */ 3827 + static void *get_from_partial_node(struct kmem_cache *s, 3828 + struct kmem_cache_node *n, 3829 + struct partial_context *pc) 3830 + { 3831 + struct slab *slab, *slab2; 3832 + unsigned long flags; 3833 + void *object = NULL; 3680 3834 3681 3835 /* 3682 3836 * Racy check. If we mistakenly see no partial slabs then we 3683 3837 * just allocate an empty slab. 
If we mistakenly try to get a 3684 - * partial slab and there is none available then get_partial() 3838 + * partial slab and there is none available then get_from_partial() 3685 3839 * will return NULL. 3686 3840 */ 3687 3841 if (!n || !n->nr_partial) ··· 3741 3797 else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3742 3798 return NULL; 3743 3799 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3800 + 3801 + struct freelist_counters old, new; 3802 + 3744 3803 if (!pfmemalloc_match(slab, pc->flags)) 3745 3804 continue; 3746 3805 3747 3806 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 3748 - void *object = alloc_single_from_partial(s, n, slab, 3807 + object = alloc_single_from_partial(s, n, slab, 3749 3808 pc->orig_size); 3750 - if (object) { 3751 - partial = slab; 3752 - pc->object = object; 3809 + if (object) 3753 3810 break; 3754 - } 3755 3811 continue; 3756 3812 } 3757 3813 3758 - remove_partial(n, slab); 3814 + /* 3815 + * get a single object from the slab. This might race against 3816 + * __slab_free(), which however has to take the list_lock if 3817 + * it's about to make the slab fully free. 3818 + */ 3819 + do { 3820 + old.freelist = slab->freelist; 3821 + old.counters = slab->counters; 3759 3822 3760 - if (!partial) { 3761 - partial = slab; 3762 - stat(s, ALLOC_FROM_PARTIAL); 3823 + new.freelist = get_freepointer(s, old.freelist); 3824 + new.counters = old.counters; 3825 + new.inuse++; 3763 3826 3764 - if ((slub_get_cpu_partial(s) == 0)) { 3765 - break; 3766 - } 3767 - } else { 3768 - put_cpu_partial(s, slab, 0); 3769 - stat(s, CPU_PARTIAL_NODE); 3827 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); 3770 3828 3771 - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { 3772 - break; 3773 - } 3774 - } 3829 + object = old.freelist; 3830 + if (!new.freelist) 3831 + remove_partial(n, slab); 3832 + 3833 + break; 3775 3834 } 3776 3835 spin_unlock_irqrestore(&n->list_lock, flags); 3777 - return partial; 3836 + return object; 3778 3837 } 3779 3838 3780 3839 /* 3781 - * Get a slab from somewhere. Search in increasing NUMA distances. 3840 + * Get an object from somewhere. Search in increasing NUMA distances. 3782 3841 */ 3783 - static struct slab *get_any_partial(struct kmem_cache *s, 3784 - struct partial_context *pc) 3842 + static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) 3785 3843 { 3786 3844 #ifdef CONFIG_NUMA 3787 3845 struct zonelist *zonelist; 3788 3846 struct zoneref *z; 3789 3847 struct zone *zone; 3790 3848 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 3791 - struct slab *slab; 3792 3849 unsigned int cpuset_mems_cookie; 3793 3850 3794 3851 /* ··· 3824 3879 3825 3880 if (n && cpuset_zone_allowed(zone, pc->flags) && 3826 3881 n->nr_partial > s->min_partial) { 3827 - slab = get_partial_node(s, n, pc); 3828 - if (slab) { 3882 + 3883 + void *object = get_from_partial_node(s, n, pc); 3884 + 3885 + if (object) { 3829 3886 /* 3830 3887 * Don't check read_mems_allowed_retry() 3831 3888 * here - if mems_allowed was updated in ··· 3835 3888 * between allocation and the cpuset 3836 3889 * update 3837 3890 */ 3838 - return slab; 3891 + return object; 3839 3892 } 3840 3893 } 3841 3894 } ··· 3845 3898 } 3846 3899 3847 3900 /* 3848 - * Get a partial slab, lock it and return it. 
3901 + * Get an object from a partial slab 3849 3902 */ 3850 - static struct slab *get_partial(struct kmem_cache *s, int node, 3851 - struct partial_context *pc) 3903 + static void *get_from_partial(struct kmem_cache *s, int node, 3904 + struct partial_context *pc) 3852 3905 { 3853 - struct slab *slab; 3854 3906 int searchnode = node; 3907 + void *object; 3855 3908 3856 3909 if (node == NUMA_NO_NODE) 3857 3910 searchnode = numa_mem_id(); 3858 3911 3859 - slab = get_partial_node(s, get_node(s, searchnode), pc); 3860 - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3861 - return slab; 3912 + object = get_from_partial_node(s, get_node(s, searchnode), pc); 3913 + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3914 + return object; 3862 3915 3863 - return get_any_partial(s, pc); 3864 - } 3865 - 3866 - #ifdef CONFIG_PREEMPTION 3867 - /* 3868 - * Calculate the next globally unique transaction for disambiguation 3869 - * during cmpxchg. The transactions start with the cpu number and are then 3870 - * incremented by CONFIG_NR_CPUS. 3871 - */ 3872 - #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 3873 - #else 3874 - /* 3875 - * No preemption supported therefore also no need to check for 3876 - * different cpus. 3877 - */ 3878 - #define TID_STEP 1 3879 - #endif /* CONFIG_PREEMPTION */ 3880 - 3881 - static inline unsigned long next_tid(unsigned long tid) 3882 - { 3883 - return tid + TID_STEP; 3884 - } 3885 - 3886 - #ifdef SLUB_DEBUG_CMPXCHG 3887 - static inline unsigned int tid_to_cpu(unsigned long tid) 3888 - { 3889 - return tid % TID_STEP; 3890 - } 3891 - 3892 - static inline unsigned long tid_to_event(unsigned long tid) 3893 - { 3894 - return tid / TID_STEP; 3895 - } 3896 - #endif 3897 - 3898 - static inline unsigned int init_tid(int cpu) 3899 - { 3900 - return cpu; 3901 - } 3902 - 3903 - static inline void note_cmpxchg_failure(const char *n, 3904 - const struct kmem_cache *s, unsigned long tid) 3905 - { 3906 - #ifdef SLUB_DEBUG_CMPXCHG 3907 - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 3908 - 3909 - pr_info("%s %s: cmpxchg redo ", n, s->name); 3910 - 3911 - if (IS_ENABLED(CONFIG_PREEMPTION) && 3912 - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { 3913 - pr_warn("due to cpu change %d -> %d\n", 3914 - tid_to_cpu(tid), tid_to_cpu(actual_tid)); 3915 - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { 3916 - pr_warn("due to cpu running other code. Event %ld->%ld\n", 3917 - tid_to_event(tid), tid_to_event(actual_tid)); 3918 - } else { 3919 - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 3920 - actual_tid, tid, next_tid(tid)); 3921 - } 3922 - #endif 3923 - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 3924 - } 3925 - 3926 - static void init_kmem_cache_cpus(struct kmem_cache *s) 3927 - { 3928 - #ifdef CONFIG_PREEMPT_RT 3929 - /* 3930 - * Register lockdep key for non-boot kmem caches to avoid 3931 - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() 3932 - */ 3933 - bool finegrain_lockdep = !init_section_contains(s, 1); 3934 - #else 3935 - /* 3936 - * Don't bother with different lockdep classes for each 3937 - * kmem_cache, since we only use local_trylock_irqsave(). 
3938 - */ 3939 - bool finegrain_lockdep = false; 3940 - #endif 3941 - int cpu; 3942 - struct kmem_cache_cpu *c; 3943 - 3944 - if (finegrain_lockdep) 3945 - lockdep_register_key(&s->lock_key); 3946 - for_each_possible_cpu(cpu) { 3947 - c = per_cpu_ptr(s->cpu_slab, cpu); 3948 - local_trylock_init(&c->lock); 3949 - if (finegrain_lockdep) 3950 - lockdep_set_class(&c->lock, &s->lock_key); 3951 - c->tid = init_tid(cpu); 3952 - } 3953 - } 3954 - 3955 - /* 3956 - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, 3957 - * unfreezes the slabs and puts it on the proper list. 3958 - * Assumes the slab has been already safely taken away from kmem_cache_cpu 3959 - * by the caller. 3960 - */ 3961 - static void deactivate_slab(struct kmem_cache *s, struct slab *slab, 3962 - void *freelist) 3963 - { 3964 - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 3965 - int free_delta = 0; 3966 - void *nextfree, *freelist_iter, *freelist_tail; 3967 - int tail = DEACTIVATE_TO_HEAD; 3968 - unsigned long flags = 0; 3969 - struct freelist_counters old, new; 3970 - 3971 - if (READ_ONCE(slab->freelist)) { 3972 - stat(s, DEACTIVATE_REMOTE_FREES); 3973 - tail = DEACTIVATE_TO_TAIL; 3974 - } 3975 - 3976 - /* 3977 - * Stage one: Count the objects on cpu's freelist as free_delta and 3978 - * remember the last object in freelist_tail for later splicing. 3979 - */ 3980 - freelist_tail = NULL; 3981 - freelist_iter = freelist; 3982 - while (freelist_iter) { 3983 - nextfree = get_freepointer(s, freelist_iter); 3984 - 3985 - /* 3986 - * If 'nextfree' is invalid, it is possible that the object at 3987 - * 'freelist_iter' is already corrupted. So isolate all objects 3988 - * starting at 'freelist_iter' by skipping them. 3989 - */ 3990 - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) 3991 - break; 3992 - 3993 - freelist_tail = freelist_iter; 3994 - free_delta++; 3995 - 3996 - freelist_iter = nextfree; 3997 - } 3998 - 3999 - /* 4000 - * Stage two: Unfreeze the slab while splicing the per-cpu 4001 - * freelist to the head of slab's freelist. 4002 - */ 4003 - do { 4004 - old.freelist = READ_ONCE(slab->freelist); 4005 - old.counters = READ_ONCE(slab->counters); 4006 - VM_BUG_ON(!old.frozen); 4007 - 4008 - /* Determine target state of the slab */ 4009 - new.counters = old.counters; 4010 - new.frozen = 0; 4011 - if (freelist_tail) { 4012 - new.inuse -= free_delta; 4013 - set_freepointer(s, freelist_tail, old.freelist); 4014 - new.freelist = freelist; 4015 - } else { 4016 - new.freelist = old.freelist; 4017 - } 4018 - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 4019 - 4020 - /* 4021 - * Stage three: Manipulate the slab list based on the updated state. 4022 - */ 4023 - if (!new.inuse && n->nr_partial >= s->min_partial) { 4024 - stat(s, DEACTIVATE_EMPTY); 4025 - discard_slab(s, slab); 4026 - stat(s, FREE_SLAB); 4027 - } else if (new.freelist) { 4028 - spin_lock_irqsave(&n->list_lock, flags); 4029 - add_partial(n, slab, tail); 4030 - spin_unlock_irqrestore(&n->list_lock, flags); 4031 - stat(s, tail); 4032 - } else { 4033 - stat(s, DEACTIVATE_FULL); 4034 - } 4035 - } 4036 - 4037 - /* 4038 - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock 4039 - * can be acquired without a deadlock before invoking the function. 4040 - * 4041 - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is 4042 - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), 4043 - * and kmalloc() is not used in an unsupported context. 4044 - * 4045 - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). 4046 - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but 4047 - * lockdep_assert() will catch a bug in case: 4048 - * #1 4049 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() 4050 - * or 4051 - * #2 4052 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() 4053 - * 4054 - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt 4055 - * disabled context. The lock will always be acquired and if needed it 4056 - * block and sleep until the lock is available. 4057 - * #1 is possible in !PREEMPT_RT only. 4058 - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: 4059 - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> 4060 - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) 4061 - * 4062 - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B 4063 - */ 4064 - #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) 4065 - #define local_lock_cpu_slab(s, flags) \ 4066 - local_lock_irqsave(&(s)->cpu_slab->lock, flags) 4067 - #else 4068 - #define local_lock_cpu_slab(s, flags) \ 4069 - do { \ 4070 - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ 4071 - lockdep_assert(__l); \ 4072 - } while (0) 4073 - #endif 4074 - 4075 - #define local_unlock_cpu_slab(s, flags) \ 4076 - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) 4077 - 4078 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4079 - static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 4080 - { 4081 - struct kmem_cache_node *n = NULL, *n2 = NULL; 4082 - struct slab *slab, *slab_to_discard = NULL; 4083 - unsigned long flags = 0; 4084 - 4085 - while (partial_slab) { 4086 - slab = partial_slab; 4087 - partial_slab = slab->next; 4088 - 4089 - n2 = get_node(s, slab_nid(slab)); 4090 - if (n != n2) { 4091 - if (n) 4092 - spin_unlock_irqrestore(&n->list_lock, flags); 4093 - 4094 - n = n2; 4095 - spin_lock_irqsave(&n->list_lock, flags); 4096 - } 4097 - 4098 - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { 4099 - slab->next = slab_to_discard; 4100 - slab_to_discard = slab; 4101 - } else { 4102 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 4103 - stat(s, FREE_ADD_PARTIAL); 4104 - } 4105 - } 4106 - 4107 - if (n) 4108 - spin_unlock_irqrestore(&n->list_lock, flags); 4109 - 4110 - while (slab_to_discard) { 4111 - slab = slab_to_discard; 4112 - slab_to_discard = slab_to_discard->next; 4113 - 4114 - stat(s, DEACTIVATE_EMPTY); 4115 - discard_slab(s, slab); 4116 - stat(s, FREE_SLAB); 4117 - } 4118 - } 4119 - 4120 - /* 4121 - * Put all the cpu partial slabs to the node partial list. 
4122 - */ 4123 - static void put_partials(struct kmem_cache *s) 4124 - { 4125 - struct slab *partial_slab; 4126 - unsigned long flags; 4127 - 4128 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4129 - partial_slab = this_cpu_read(s->cpu_slab->partial); 4130 - this_cpu_write(s->cpu_slab->partial, NULL); 4131 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4132 - 4133 - if (partial_slab) 4134 - __put_partials(s, partial_slab); 4135 - } 4136 - 4137 - static void put_partials_cpu(struct kmem_cache *s, 4138 - struct kmem_cache_cpu *c) 4139 - { 4140 - struct slab *partial_slab; 4141 - 4142 - partial_slab = slub_percpu_partial(c); 4143 - c->partial = NULL; 4144 - 4145 - if (partial_slab) 4146 - __put_partials(s, partial_slab); 4147 - } 4148 - 4149 - /* 4150 - * Put a slab into a partial slab slot if available. 4151 - * 4152 - * If we did not find a slot then simply move all the partials to the 4153 - * per node partial list. 4154 - */ 4155 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 4156 - { 4157 - struct slab *oldslab; 4158 - struct slab *slab_to_put = NULL; 4159 - unsigned long flags; 4160 - int slabs = 0; 4161 - 4162 - local_lock_cpu_slab(s, flags); 4163 - 4164 - oldslab = this_cpu_read(s->cpu_slab->partial); 4165 - 4166 - if (oldslab) { 4167 - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { 4168 - /* 4169 - * Partial array is full. Move the existing set to the 4170 - * per node partial list. Postpone the actual unfreezing 4171 - * outside of the critical section. 4172 - */ 4173 - slab_to_put = oldslab; 4174 - oldslab = NULL; 4175 - } else { 4176 - slabs = oldslab->slabs; 4177 - } 4178 - } 4179 - 4180 - slabs++; 4181 - 4182 - slab->slabs = slabs; 4183 - slab->next = oldslab; 4184 - 4185 - this_cpu_write(s->cpu_slab->partial, slab); 4186 - 4187 - local_unlock_cpu_slab(s, flags); 4188 - 4189 - if (slab_to_put) { 4190 - __put_partials(s, slab_to_put); 4191 - stat(s, CPU_PARTIAL_DRAIN); 4192 - } 4193 - } 4194 - 4195 - #else /* CONFIG_SLUB_CPU_PARTIAL */ 4196 - 4197 - static inline void put_partials(struct kmem_cache *s) { } 4198 - static inline void put_partials_cpu(struct kmem_cache *s, 4199 - struct kmem_cache_cpu *c) { } 4200 - 4201 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 4202 - 4203 - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 4204 - { 4205 - unsigned long flags; 4206 - struct slab *slab; 4207 - void *freelist; 4208 - 4209 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4210 - 4211 - slab = c->slab; 4212 - freelist = c->freelist; 4213 - 4214 - c->slab = NULL; 4215 - c->freelist = NULL; 4216 - c->tid = next_tid(c->tid); 4217 - 4218 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4219 - 4220 - if (slab) { 4221 - deactivate_slab(s, slab, freelist); 4222 - stat(s, CPUSLAB_FLUSH); 4223 - } 4224 - } 4225 - 4226 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 4227 - { 4228 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4229 - void *freelist = c->freelist; 4230 - struct slab *slab = c->slab; 4231 - 4232 - c->slab = NULL; 4233 - c->freelist = NULL; 4234 - c->tid = next_tid(c->tid); 4235 - 4236 - if (slab) { 4237 - deactivate_slab(s, slab, freelist); 4238 - stat(s, CPUSLAB_FLUSH); 4239 - } 4240 - 4241 - put_partials_cpu(s, c); 4242 - } 4243 - 4244 - static inline void flush_this_cpu_slab(struct kmem_cache *s) 4245 - { 4246 - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 4247 - 4248 - if (c->slab) 4249 - flush_slab(s, c); 4250 - 4251 - put_partials(s); 4252 - } 4253 - 
4254 - static bool has_cpu_slab(int cpu, struct kmem_cache *s) 4255 - { 4256 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4257 - 4258 - return c->slab || slub_percpu_partial(c); 3916 + return get_from_any_partial(s, pc); 4259 3917 } 4260 3918 4261 3919 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4262 3920 { 4263 3921 struct slub_percpu_sheaves *pcs; 4264 3922 4265 - if (!s->cpu_sheaves) 3923 + if (!cache_has_sheaves(s)) 4266 3924 return false; 4267 3925 4268 3926 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); ··· 3876 4324 } 3877 4325 3878 4326 /* 3879 - * Flush cpu slab. 4327 + * Flush percpu sheaves 3880 4328 * 3881 4329 * Called from CPU work handler with migration disabled. 3882 4330 */ 3883 - static void flush_cpu_slab(struct work_struct *w) 4331 + static void flush_cpu_sheaves(struct work_struct *w) 3884 4332 { 3885 4333 struct kmem_cache *s; 3886 4334 struct slub_flush_work *sfw; ··· 3889 4337 3890 4338 s = sfw->s; 3891 4339 3892 - if (s->cpu_sheaves) 4340 + if (cache_has_sheaves(s)) 3893 4341 pcs_flush_all(s); 3894 - 3895 - flush_this_cpu_slab(s); 3896 4342 } 3897 4343 3898 4344 static void flush_all_cpus_locked(struct kmem_cache *s) ··· 3903 4353 3904 4354 for_each_online_cpu(cpu) { 3905 4355 sfw = &per_cpu(slub_flush, cpu); 3906 - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { 4356 + if (!has_pcs_used(cpu, s)) { 3907 4357 sfw->skip = true; 3908 4358 continue; 3909 4359 } 3910 - INIT_WORK(&sfw->work, flush_cpu_slab); 4360 + INIT_WORK(&sfw->work, flush_cpu_sheaves); 3911 4361 sfw->skip = false; 3912 4362 sfw->s = s; 3913 4363 queue_work_on(cpu, flushwq, &sfw->work); ··· 3992 4442 mutex_lock(&slab_mutex); 3993 4443 3994 4444 list_for_each_entry(s, &slab_caches, list) { 3995 - if (!s->cpu_sheaves) 4445 + if (!cache_has_sheaves(s)) 3996 4446 continue; 3997 4447 flush_rcu_sheaves_on_cache(s); 3998 4448 } ··· 4013 4463 4014 4464 mutex_lock(&slab_mutex); 4015 4465 list_for_each_entry(s, &slab_caches, list) { 4016 - __flush_cpu_slab(s, cpu); 4017 - if (s->cpu_sheaves) 4466 + if (cache_has_sheaves(s)) 4018 4467 __pcs_flush_all_cpu(s, cpu); 4019 4468 } 4020 4469 mutex_unlock(&slab_mutex); 4021 4470 return 0; 4022 - } 4023 - 4024 - /* 4025 - * Check if the objects in a per cpu structure fit numa 4026 - * locality expectations. 4027 - */ 4028 - static inline int node_match(struct slab *slab, int node) 4029 - { 4030 - #ifdef CONFIG_NUMA 4031 - if (node != NUMA_NO_NODE && slab_nid(slab) != node) 4032 - return 0; 4033 - #endif 4034 - return 1; 4035 4471 } 4036 4472 4037 4473 #ifdef CONFIG_SLUB_DEBUG ··· 4192 4656 return true; 4193 4657 } 4194 4658 4195 - static inline bool 4196 - __update_cpu_freelist_fast(struct kmem_cache *s, 4197 - void *freelist_old, void *freelist_new, 4198 - unsigned long tid) 4199 - { 4200 - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4201 - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4202 - 4203 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4204 - &old.freelist_tid, new.freelist_tid); 4205 - } 4206 - 4207 4659 /* 4208 - * Check the slab->freelist and either transfer the freelist to the 4209 - * per cpu freelist or deactivate the slab. 4660 + * Get the slab's freelist and do not freeze it. 4210 4661 * 4211 - * The slab is still frozen if the return value is not NULL. 4662 + * Assumes the slab is isolated from node partial list and not frozen. 4212 4663 * 4213 - * If this function returns NULL then the slab has been unfrozen. 
4664 + * Assumes this is performed only for caches without debugging so we 4665 + * don't need to worry about adding the slab to the full list. 4214 4666 */ 4215 - static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4216 - { 4217 - struct freelist_counters old, new; 4218 - 4219 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4220 - 4221 - do { 4222 - old.freelist = slab->freelist; 4223 - old.counters = slab->counters; 4224 - 4225 - new.freelist = NULL; 4226 - new.counters = old.counters; 4227 - 4228 - new.inuse = old.objects; 4229 - new.frozen = old.freelist != NULL; 4230 - 4231 - 4232 - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4233 - 4234 - return old.freelist; 4235 - } 4236 - 4237 - /* 4238 - * Freeze the partial slab and return the pointer to the freelist. 4239 - */ 4240 - static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4667 + static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) 4241 4668 { 4242 4669 struct freelist_counters old, new; 4243 4670 ··· 4210 4711 4211 4712 new.freelist = NULL; 4212 4713 new.counters = old.counters; 4213 - VM_BUG_ON(new.frozen); 4714 + VM_WARN_ON_ONCE(new.frozen); 4214 4715 4215 4716 new.inuse = old.objects; 4216 - new.frozen = 1; 4217 4717 4218 - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4718 + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); 4219 4719 4220 4720 return old.freelist; 4221 - } 4222 - 4223 - /* 4224 - * Slow path. The lockless freelist is empty or we need to perform 4225 - * debugging duties. 4226 - * 4227 - * Processing is still very fast if new objects have been freed to the 4228 - * regular freelist. In that case we simply take over the regular freelist 4229 - * as the lockless freelist and zap the regular freelist. 4230 - * 4231 - * If that is not working then we fall back to the partial lists. We take the 4232 - * first element of the freelist as the object to allocate now and move the 4233 - * rest of the freelist to the lockless freelist. 4234 - * 4235 - * And if we were unable to get a new slab from the partial slab lists then 4236 - * we need to allocate a new slab. This is the slowest path since it involves 4237 - * a call to the page allocator and the setup of a new slab. 4238 - * 4239 - * Version of __slab_alloc to use when we know that preemption is 4240 - * already disabled (which is the case for bulk allocation). 4241 - */ 4242 - static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4243 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4244 - { 4245 - bool allow_spin = gfpflags_allow_spinning(gfpflags); 4246 - void *freelist; 4247 - struct slab *slab; 4248 - unsigned long flags; 4249 - struct partial_context pc; 4250 - bool try_thisnode = true; 4251 - 4252 - stat(s, ALLOC_SLOWPATH); 4253 - 4254 - reread_slab: 4255 - 4256 - slab = READ_ONCE(c->slab); 4257 - if (!slab) { 4258 - /* 4259 - * if the node is not online or has no normal memory, just 4260 - * ignore the node constraint 4261 - */ 4262 - if (unlikely(node != NUMA_NO_NODE && 4263 - !node_isset(node, slab_nodes))) 4264 - node = NUMA_NO_NODE; 4265 - goto new_slab; 4266 - } 4267 - 4268 - if (unlikely(!node_match(slab, node))) { 4269 - /* 4270 - * same as above but node_match() being false already 4271 - * implies node != NUMA_NO_NODE. 4272 - * 4273 - * We don't strictly honor pfmemalloc and NUMA preferences 4274 - * when !allow_spin because: 4275 - * 4276 - * 1. 
Most kmalloc() users allocate objects on the local node, 4277 - * so kmalloc_nolock() tries not to interfere with them by 4278 - * deactivating the cpu slab. 4279 - * 4280 - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 4281 - * unnecessary slab allocations even when n->partial list 4282 - * is not empty. 4283 - */ 4284 - if (!node_isset(node, slab_nodes) || 4285 - !allow_spin) { 4286 - node = NUMA_NO_NODE; 4287 - } else { 4288 - stat(s, ALLOC_NODE_MISMATCH); 4289 - goto deactivate_slab; 4290 - } 4291 - } 4292 - 4293 - /* 4294 - * By rights, we should be searching for a slab page that was 4295 - * PFMEMALLOC but right now, we are losing the pfmemalloc 4296 - * information when the page leaves the per-cpu allocator 4297 - */ 4298 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4299 - goto deactivate_slab; 4300 - 4301 - /* must check again c->slab in case we got preempted and it changed */ 4302 - local_lock_cpu_slab(s, flags); 4303 - 4304 - if (unlikely(slab != c->slab)) { 4305 - local_unlock_cpu_slab(s, flags); 4306 - goto reread_slab; 4307 - } 4308 - freelist = c->freelist; 4309 - if (freelist) 4310 - goto load_freelist; 4311 - 4312 - freelist = get_freelist(s, slab); 4313 - 4314 - if (!freelist) { 4315 - c->slab = NULL; 4316 - c->tid = next_tid(c->tid); 4317 - local_unlock_cpu_slab(s, flags); 4318 - stat(s, DEACTIVATE_BYPASS); 4319 - goto new_slab; 4320 - } 4321 - 4322 - stat(s, ALLOC_REFILL); 4323 - 4324 - load_freelist: 4325 - 4326 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4327 - 4328 - /* 4329 - * freelist is pointing to the list of objects to be used. 4330 - * slab is pointing to the slab from which the objects are obtained. 4331 - * That slab must be frozen for per cpu allocations to work. 4332 - */ 4333 - VM_BUG_ON(!c->slab->frozen); 4334 - c->freelist = get_freepointer(s, freelist); 4335 - c->tid = next_tid(c->tid); 4336 - local_unlock_cpu_slab(s, flags); 4337 - return freelist; 4338 - 4339 - deactivate_slab: 4340 - 4341 - local_lock_cpu_slab(s, flags); 4342 - if (slab != c->slab) { 4343 - local_unlock_cpu_slab(s, flags); 4344 - goto reread_slab; 4345 - } 4346 - freelist = c->freelist; 4347 - c->slab = NULL; 4348 - c->freelist = NULL; 4349 - c->tid = next_tid(c->tid); 4350 - local_unlock_cpu_slab(s, flags); 4351 - deactivate_slab(s, slab, freelist); 4352 - 4353 - new_slab: 4354 - 4355 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4356 - while (slub_percpu_partial(c)) { 4357 - local_lock_cpu_slab(s, flags); 4358 - if (unlikely(c->slab)) { 4359 - local_unlock_cpu_slab(s, flags); 4360 - goto reread_slab; 4361 - } 4362 - if (unlikely(!slub_percpu_partial(c))) { 4363 - local_unlock_cpu_slab(s, flags); 4364 - /* we were preempted and partial list got empty */ 4365 - goto new_objects; 4366 - } 4367 - 4368 - slab = slub_percpu_partial(c); 4369 - slub_set_percpu_partial(c, slab); 4370 - 4371 - if (likely(node_match(slab, node) && 4372 - pfmemalloc_match(slab, gfpflags)) || 4373 - !allow_spin) { 4374 - c->slab = slab; 4375 - freelist = get_freelist(s, slab); 4376 - VM_BUG_ON(!freelist); 4377 - stat(s, CPU_PARTIAL_ALLOC); 4378 - goto load_freelist; 4379 - } 4380 - 4381 - local_unlock_cpu_slab(s, flags); 4382 - 4383 - slab->next = NULL; 4384 - __put_partials(s, slab); 4385 - } 4386 - #endif 4387 - 4388 - new_objects: 4389 - 4390 - pc.flags = gfpflags; 4391 - /* 4392 - * When a preferred node is indicated but no __GFP_THISNODE 4393 - * 4394 - * 1) try to get a partial slab from target node only by having 4395 - * __GFP_THISNODE in pc.flags for get_partial() 
4396 - * 2) if 1) failed, try to allocate a new slab from target node with 4397 - * GPF_NOWAIT | __GFP_THISNODE opportunistically 4398 - * 3) if 2) failed, retry with original gfpflags which will allow 4399 - * get_partial() try partial lists of other nodes before potentially 4400 - * allocating new page from other nodes 4401 - */ 4402 - if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4403 - && try_thisnode)) { 4404 - if (unlikely(!allow_spin)) 4405 - /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4406 - pc.flags = gfpflags | __GFP_THISNODE; 4407 - else 4408 - pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4409 - } 4410 - 4411 - pc.orig_size = orig_size; 4412 - slab = get_partial(s, node, &pc); 4413 - if (slab) { 4414 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4415 - freelist = pc.object; 4416 - /* 4417 - * For debug caches here we had to go through 4418 - * alloc_single_from_partial() so just store the 4419 - * tracking info and return the object. 4420 - * 4421 - * Due to disabled preemption we need to disallow 4422 - * blocking. The flags are further adjusted by 4423 - * gfp_nested_mask() in stack_depot itself. 4424 - */ 4425 - if (s->flags & SLAB_STORE_USER) 4426 - set_track(s, freelist, TRACK_ALLOC, addr, 4427 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4428 - 4429 - return freelist; 4430 - } 4431 - 4432 - freelist = freeze_slab(s, slab); 4433 - goto retry_load_slab; 4434 - } 4435 - 4436 - slub_put_cpu_ptr(s->cpu_slab); 4437 - slab = new_slab(s, pc.flags, node); 4438 - c = slub_get_cpu_ptr(s->cpu_slab); 4439 - 4440 - if (unlikely(!slab)) { 4441 - if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4442 - && try_thisnode) { 4443 - try_thisnode = false; 4444 - goto new_objects; 4445 - } 4446 - slab_out_of_memory(s, gfpflags, node); 4447 - return NULL; 4448 - } 4449 - 4450 - stat(s, ALLOC_SLAB); 4451 - 4452 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4453 - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4454 - 4455 - if (unlikely(!freelist)) { 4456 - /* This could cause an endless loop. Fail instead. */ 4457 - if (!allow_spin) 4458 - return NULL; 4459 - goto new_objects; 4460 - } 4461 - 4462 - if (s->flags & SLAB_STORE_USER) 4463 - set_track(s, freelist, TRACK_ALLOC, addr, 4464 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4465 - 4466 - return freelist; 4467 - } 4468 - 4469 - /* 4470 - * No other reference to the slab yet so we can 4471 - * muck around with it freely without cmpxchg 4472 - */ 4473 - freelist = slab->freelist; 4474 - slab->freelist = NULL; 4475 - slab->inuse = slab->objects; 4476 - slab->frozen = 1; 4477 - 4478 - inc_slabs_node(s, slab_nid(slab), slab->objects); 4479 - 4480 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4481 - /* 4482 - * For !pfmemalloc_match() case we don't load freelist so that 4483 - * we don't make further mismatched allocations easier. 
4484 - */ 4485 - deactivate_slab(s, slab, get_freepointer(s, freelist)); 4486 - return freelist; 4487 - } 4488 - 4489 - retry_load_slab: 4490 - 4491 - local_lock_cpu_slab(s, flags); 4492 - if (unlikely(c->slab)) { 4493 - void *flush_freelist = c->freelist; 4494 - struct slab *flush_slab = c->slab; 4495 - 4496 - c->slab = NULL; 4497 - c->freelist = NULL; 4498 - c->tid = next_tid(c->tid); 4499 - 4500 - local_unlock_cpu_slab(s, flags); 4501 - 4502 - if (unlikely(!allow_spin)) { 4503 - /* Reentrant slub cannot take locks, defer */ 4504 - defer_deactivate_slab(flush_slab, flush_freelist); 4505 - } else { 4506 - deactivate_slab(s, flush_slab, flush_freelist); 4507 - } 4508 - 4509 - stat(s, CPUSLAB_FLUSH); 4510 - 4511 - goto retry_load_slab; 4512 - } 4513 - c->slab = slab; 4514 - 4515 - goto load_freelist; 4516 - } 4517 - /* 4518 - * We disallow kprobes in ___slab_alloc() to prevent reentrance 4519 - * 4520 - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of 4521 - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> 4522 - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() 4523 - * manipulating c->freelist without lock. 4524 - * 4525 - * This does not prevent kprobe in functions called from ___slab_alloc() such as 4526 - * local_lock_irqsave() itself, and that is fine, we only need to protect the 4527 - * c->freelist manipulation in ___slab_alloc() itself. 4528 - */ 4529 - NOKPROBE_SYMBOL(___slab_alloc); 4530 - 4531 - /* 4532 - * A wrapper for ___slab_alloc() for contexts where preemption is not yet 4533 - * disabled. Compensates for possible cpu changes by refetching the per cpu area 4534 - * pointer. 4535 - */ 4536 - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4537 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4538 - { 4539 - void *p; 4540 - 4541 - #ifdef CONFIG_PREEMPT_COUNT 4542 - /* 4543 - * We may have been preempted and rescheduled on a different 4544 - * cpu before disabling preemption. Need to reload cpu area 4545 - * pointer. 4546 - */ 4547 - c = slub_get_cpu_ptr(s->cpu_slab); 4548 - #endif 4549 - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 4550 - if (local_lock_is_locked(&s->cpu_slab->lock)) { 4551 - /* 4552 - * EBUSY is an internal signal to kmalloc_nolock() to 4553 - * retry a different bucket. It's not propagated 4554 - * to the caller. 4555 - */ 4556 - p = ERR_PTR(-EBUSY); 4557 - goto out; 4558 - } 4559 - } 4560 - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4561 - out: 4562 - #ifdef CONFIG_PREEMPT_COUNT 4563 - slub_put_cpu_ptr(s->cpu_slab); 4564 - #endif 4565 - return p; 4566 - } 4567 - 4568 - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 4569 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4570 - { 4571 - struct kmem_cache_cpu *c; 4572 - struct slab *slab; 4573 - unsigned long tid; 4574 - void *object; 4575 - 4576 - redo: 4577 - /* 4578 - * Must read kmem_cache cpu data via this cpu ptr. Preemption is 4579 - * enabled. We may switch back and forth between cpus while 4580 - * reading from one cpu area. That does not matter as long 4581 - * as we end up on the original cpu again when doing the cmpxchg. 4582 - * 4583 - * We must guarantee that tid and kmem_cache_cpu are retrieved on the 4584 - * same cpu. We read first the kmem_cache_cpu pointer and use it to read 4585 - * the tid. 
If we are preempted and switched to another cpu between the 4586 - * two reads, it's OK as the two are still associated with the same cpu 4587 - * and cmpxchg later will validate the cpu. 4588 - */ 4589 - c = raw_cpu_ptr(s->cpu_slab); 4590 - tid = READ_ONCE(c->tid); 4591 - 4592 - /* 4593 - * Irqless object alloc/free algorithm used here depends on sequence 4594 - * of fetching cpu_slab's data. tid should be fetched before anything 4595 - * on c to guarantee that object and slab associated with previous tid 4596 - * won't be used with current tid. If we fetch tid first, object and 4597 - * slab could be one associated with next tid and our alloc/free 4598 - * request will be failed. In this case, we will retry. So, no problem. 4599 - */ 4600 - barrier(); 4601 - 4602 - /* 4603 - * The transaction ids are globally unique per cpu and per operation on 4604 - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 4605 - * occurs on the right processor and that there was no operation on the 4606 - * linked list in between. 4607 - */ 4608 - 4609 - object = c->freelist; 4610 - slab = c->slab; 4611 - 4612 - #ifdef CONFIG_NUMA 4613 - if (static_branch_unlikely(&strict_numa) && 4614 - node == NUMA_NO_NODE) { 4615 - 4616 - struct mempolicy *mpol = current->mempolicy; 4617 - 4618 - if (mpol) { 4619 - /* 4620 - * Special BIND rule support. If existing slab 4621 - * is in permitted set then do not redirect 4622 - * to a particular node. 4623 - * Otherwise we apply the memory policy to get 4624 - * the node we need to allocate on. 4625 - */ 4626 - if (mpol->mode != MPOL_BIND || !slab || 4627 - !node_isset(slab_nid(slab), mpol->nodes)) 4628 - 4629 - node = mempolicy_slab_node(); 4630 - } 4631 - } 4632 - #endif 4633 - 4634 - if (!USE_LOCKLESS_FAST_PATH() || 4635 - unlikely(!object || !slab || !node_match(slab, node))) { 4636 - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); 4637 - } else { 4638 - void *next_object = get_freepointer_safe(s, object); 4639 - 4640 - /* 4641 - * The cmpxchg will only match if there was no additional 4642 - * operation and if we are on the right processor. 4643 - * 4644 - * The cmpxchg does the following atomically (without lock 4645 - * semantics!) 4646 - * 1. Relocate first pointer to the current per cpu area. 4647 - * 2. Verify that tid and freelist have not been changed 4648 - * 3. If they were not changed replace tid and freelist 4649 - * 4650 - * Since this is without lock semantics the protection is only 4651 - * against code executing on this cpu *not* from access by 4652 - * other cpus. 4653 - */ 4654 - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { 4655 - note_cmpxchg_failure("slab_alloc", s, tid); 4656 - goto redo; 4657 - } 4658 - prefetch_freepointer(s, next_object); 4659 - stat(s, ALLOC_FASTPATH); 4660 - } 4661 - 4662 - return object; 4663 4721 } 4664 4722 4665 4723 /* ··· 4232 5176 !freeptr_outside_object(s)) 4233 5177 memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 4234 5178 0, sizeof(void *)); 5179 + } 5180 + 5181 + static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, 5182 + void **p, unsigned int count, bool allow_spin) 5183 + { 5184 + unsigned int allocated = 0; 5185 + struct kmem_cache_node *n; 5186 + bool needs_add_partial; 5187 + unsigned long flags; 5188 + void *object; 5189 + 5190 + /* 5191 + * Are we going to put the slab on the partial list? 5192 + * Note slab->inuse is 0 on a new slab. 
5193 + */ 5194 + needs_add_partial = (slab->objects > count); 5195 + 5196 + if (!allow_spin && needs_add_partial) { 5197 + 5198 + n = get_node(s, slab_nid(slab)); 5199 + 5200 + if (!spin_trylock_irqsave(&n->list_lock, flags)) { 5201 + /* Unlucky, discard newly allocated slab */ 5202 + free_new_slab_nolock(s, slab); 5203 + return 0; 5204 + } 5205 + } 5206 + 5207 + object = slab->freelist; 5208 + while (object && allocated < count) { 5209 + p[allocated] = object; 5210 + object = get_freepointer(s, object); 5211 + maybe_wipe_obj_freeptr(s, p[allocated]); 5212 + 5213 + slab->inuse++; 5214 + allocated++; 5215 + } 5216 + slab->freelist = object; 5217 + 5218 + if (needs_add_partial) { 5219 + 5220 + if (allow_spin) { 5221 + n = get_node(s, slab_nid(slab)); 5222 + spin_lock_irqsave(&n->list_lock, flags); 5223 + } 5224 + add_partial(n, slab, ADD_TO_HEAD); 5225 + spin_unlock_irqrestore(&n->list_lock, flags); 5226 + } 5227 + 5228 + inc_slabs_node(s, slab_nid(slab), slab->objects); 5229 + return allocated; 5230 + } 5231 + 5232 + /* 5233 + * Slow path. We failed to allocate via percpu sheaves or they are not available 5234 + * due to bootstrap or debugging enabled or SLUB_TINY. 5235 + * 5236 + * We try to allocate from partial slab lists and fall back to allocating a new 5237 + * slab. 5238 + */ 5239 + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 5240 + unsigned long addr, unsigned int orig_size) 5241 + { 5242 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 5243 + void *object; 5244 + struct slab *slab; 5245 + struct partial_context pc; 5246 + bool try_thisnode = true; 5247 + 5248 + stat(s, ALLOC_SLOWPATH); 5249 + 5250 + new_objects: 5251 + 5252 + pc.flags = gfpflags; 5253 + /* 5254 + * When a preferred node is indicated but no __GFP_THISNODE 5255 + * 5256 + * 1) try to get a partial slab from target node only by having 5257 + * __GFP_THISNODE in pc.flags for get_from_partial() 5258 + * 2) if 1) failed, try to allocate a new slab from target node with 5259 + * GPF_NOWAIT | __GFP_THISNODE opportunistically 5260 + * 3) if 2) failed, retry with original gfpflags which will allow 5261 + * get_from_partial() try partial lists of other nodes before 5262 + * potentially allocating new page from other nodes 5263 + */ 5264 + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5265 + && try_thisnode)) { 5266 + if (unlikely(!allow_spin)) 5267 + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 5268 + pc.flags = gfpflags | __GFP_THISNODE; 5269 + else 5270 + pc.flags = GFP_NOWAIT | __GFP_THISNODE; 5271 + } 5272 + 5273 + pc.orig_size = orig_size; 5274 + object = get_from_partial(s, node, &pc); 5275 + if (object) 5276 + goto success; 5277 + 5278 + slab = new_slab(s, pc.flags, node); 5279 + 5280 + if (unlikely(!slab)) { 5281 + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5282 + && try_thisnode) { 5283 + try_thisnode = false; 5284 + goto new_objects; 5285 + } 5286 + slab_out_of_memory(s, gfpflags, node); 5287 + return NULL; 5288 + } 5289 + 5290 + stat(s, ALLOC_SLAB); 5291 + 5292 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5293 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 5294 + 5295 + if (likely(object)) 5296 + goto success; 5297 + } else { 5298 + alloc_from_new_slab(s, slab, &object, 1, allow_spin); 5299 + 5300 + /* we don't need to check SLAB_STORE_USER here */ 5301 + if (likely(object)) 5302 + return object; 5303 + } 5304 + 5305 + if (allow_spin) 5306 + goto new_objects; 5307 + 5308 + /* This could 
cause an endless loop. Fail instead. */ 5309 + return NULL; 5310 + 5311 + success: 5312 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) 5313 + set_track(s, object, TRACK_ALLOC, addr, gfpflags); 5314 + 5315 + return object; 5316 + } 5317 + 5318 + static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 5319 + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 5320 + { 5321 + void *object; 5322 + 5323 + #ifdef CONFIG_NUMA 5324 + if (static_branch_unlikely(&strict_numa) && 5325 + node == NUMA_NO_NODE) { 5326 + 5327 + struct mempolicy *mpol = current->mempolicy; 5328 + 5329 + if (mpol) { 5330 + /* 5331 + * Special BIND rule support. If the local node 5332 + * is in permitted set then do not redirect 5333 + * to a particular node. 5334 + * Otherwise we apply the memory policy to get 5335 + * the node we need to allocate on. 5336 + */ 5337 + if (mpol->mode != MPOL_BIND || 5338 + !node_isset(numa_mem_id(), mpol->nodes)) 5339 + node = mempolicy_slab_node(); 5340 + } 5341 + } 5342 + #endif 5343 + 5344 + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); 5345 + 5346 + return object; 4235 5347 } 4236 5348 4237 5349 static __fastpath_inline ··· 4488 5264 4489 5265 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4490 5266 5267 + /* Bootstrap or debug cache, back off */ 5268 + if (unlikely(!cache_has_sheaves(s))) { 5269 + local_unlock(&s->cpu_sheaves->lock); 5270 + return NULL; 5271 + } 5272 + 4491 5273 if (pcs->spare && pcs->spare->size > 0) { 4492 5274 swap(pcs->main, pcs->spare); 4493 5275 return pcs; ··· 4505 5275 return NULL; 4506 5276 } 4507 5277 4508 - full = barn_replace_empty_sheaf(barn, pcs->main); 5278 + full = barn_replace_empty_sheaf(barn, pcs->main, 5279 + gfpflags_allow_spinning(gfp)); 4509 5280 4510 5281 if (full) { 4511 5282 stat(s, BARN_GET); ··· 4523 5292 empty = pcs->spare; 4524 5293 pcs->spare = NULL; 4525 5294 } else { 4526 - empty = barn_get_empty_sheaf(barn); 5295 + empty = barn_get_empty_sheaf(barn, true); 4527 5296 } 4528 5297 } 4529 5298 ··· 4565 5334 */ 4566 5335 4567 5336 if (pcs->main->size == 0) { 4568 - barn_put_empty_sheaf(barn, pcs->main); 5337 + if (!pcs->spare) 5338 + pcs->spare = pcs->main; 5339 + else 5340 + barn_put_empty_sheaf(barn, pcs->main); 4569 5341 pcs->main = full; 4570 5342 return pcs; 4571 5343 } ··· 4625 5391 * We assume the percpu sheaves contain only local objects although it's 4626 5392 * not completely guaranteed, so we verify later. 
4627 5393 */ 4628 - if (unlikely(node_requested && node != numa_mem_id())) 5394 + if (unlikely(node_requested && node != numa_mem_id())) { 5395 + stat(s, ALLOC_NODE_MISMATCH); 4629 5396 return NULL; 5397 + } 4630 5398 4631 5399 if (!local_trylock(&s->cpu_sheaves->lock)) 4632 5400 return NULL; ··· 4651 5415 */ 4652 5416 if (page_to_nid(virt_to_page(object)) != node) { 4653 5417 local_unlock(&s->cpu_sheaves->lock); 5418 + stat(s, ALLOC_NODE_MISMATCH); 4654 5419 return NULL; 4655 5420 } 4656 5421 } ··· 4660 5423 4661 5424 local_unlock(&s->cpu_sheaves->lock); 4662 5425 4663 - stat(s, ALLOC_PCS); 5426 + stat(s, ALLOC_FASTPATH); 4664 5427 4665 5428 return object; 4666 5429 } 4667 5430 4668 5431 static __fastpath_inline 4669 - unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 5432 + unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, 5433 + void **p) 4670 5434 { 4671 5435 struct slub_percpu_sheaves *pcs; 4672 5436 struct slab_sheaf *main; ··· 4685 5447 struct slab_sheaf *full; 4686 5448 struct node_barn *barn; 4687 5449 5450 + if (unlikely(!cache_has_sheaves(s))) { 5451 + local_unlock(&s->cpu_sheaves->lock); 5452 + return allocated; 5453 + } 5454 + 4688 5455 if (pcs->spare && pcs->spare->size > 0) { 4689 5456 swap(pcs->main, pcs->spare); 4690 5457 goto do_alloc; ··· 4701 5458 return allocated; 4702 5459 } 4703 5460 4704 - full = barn_replace_empty_sheaf(barn, pcs->main); 5461 + full = barn_replace_empty_sheaf(barn, pcs->main, 5462 + gfpflags_allow_spinning(gfp)); 4705 5463 4706 5464 if (full) { 4707 5465 stat(s, BARN_GET); ··· 4732 5488 4733 5489 local_unlock(&s->cpu_sheaves->lock); 4734 5490 4735 - stat_add(s, ALLOC_PCS, batch); 5491 + stat_add(s, ALLOC_FASTPATH, batch); 4736 5492 4737 5493 allocated += batch; 4738 5494 ··· 4770 5526 if (unlikely(object)) 4771 5527 goto out; 4772 5528 4773 - if (s->cpu_sheaves) 4774 - object = alloc_from_pcs(s, gfpflags, node); 5529 + object = alloc_from_pcs(s, gfpflags, node); 4775 5530 4776 5531 if (!object) 4777 5532 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); ··· 4865 5622 return ret; 4866 5623 } 4867 5624 5625 + static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 5626 + size_t size, void **p); 5627 + 4868 5628 /* 4869 5629 * returns a sheaf that has at least the requested size 4870 5630 * when prefilling is needed, do so with given gfp flags ··· 4881 5635 struct slab_sheaf *sheaf = NULL; 4882 5636 struct node_barn *barn; 4883 5637 4884 - if (unlikely(size > s->sheaf_capacity)) { 5638 + if (unlikely(!size)) 5639 + return NULL; 4885 5640 4886 - /* 4887 - * slab_debug disables cpu sheaves intentionally so all 4888 - * prefilled sheaves become "oversize" and we give up on 4889 - * performance for the debugging. Same with SLUB_TINY. 4890 - * Creating a cache without sheaves and then requesting a 4891 - * prefilled sheaf is however not expected, so warn. 
4892 - */ 4893 - WARN_ON_ONCE(s->sheaf_capacity == 0 && 4894 - !IS_ENABLED(CONFIG_SLUB_TINY) && 4895 - !(s->flags & SLAB_DEBUG_FLAGS)); 5641 + if (unlikely(size > s->sheaf_capacity)) { 4896 5642 4897 5643 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); 4898 5644 if (!sheaf) ··· 5206 5968 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; 5207 5969 struct kmem_cache *s; 5208 5970 bool can_retry = true; 5209 - void *ret = ERR_PTR(-EBUSY); 5971 + void *ret; 5210 5972 5211 5973 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | 5212 5974 __GFP_NO_OBJ_EXT)); ··· 5214 5976 if (unlikely(!size)) 5215 5977 return ZERO_SIZE_PTR; 5216 5978 5217 - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) 5218 - /* 5219 - * kmalloc_nolock() in PREEMPT_RT is not supported from 5220 - * non-preemptible context because local_lock becomes a 5221 - * sleeping lock on RT. 5222 - */ 5979 + /* 5980 + * See the comment for the same check in 5981 + * alloc_frozen_pages_nolock_noprof() 5982 + */ 5983 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 5223 5984 return NULL; 5985 + 5224 5986 retry: 5225 5987 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 5226 5988 return NULL; ··· 5229 5991 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) 5230 5992 /* 5231 5993 * kmalloc_nolock() is not supported on architectures that 5232 - * don't implement cmpxchg16b, but debug caches don't use 5233 - * per-cpu slab and per-cpu partial slabs. They rely on 5234 - * kmem_cache_node->list_lock, so kmalloc_nolock() can 5235 - * attempt to allocate from debug caches by 5994 + * don't implement cmpxchg16b and thus need slab_lock() 5995 + * which could be preempted by a nmi. 5996 + * But debug caches don't use that and only rely on 5997 + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt 5998 + * to allocate from debug caches by 5236 5999 * spin_trylock_irqsave(&n->list_lock, ...) 5237 6000 */ 5238 6001 return NULL; 6002 + 6003 + ret = alloc_from_pcs(s, alloc_gfp, node); 6004 + if (ret) 6005 + goto success; 5239 6006 5240 6007 /* 5241 6008 * Do not call slab_alloc_node(), since trylock mode isn't 5242 6009 * compatible with slab_pre_alloc_hook/should_failslab and 5243 6010 * kfence_alloc. Hence call __slab_alloc_node() (at most twice) 5244 6011 * and slab_post_alloc_hook() directly. 5245 - * 5246 - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair 5247 - * in irq saved region. It assumes that the same cpu will not 5248 - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. 5249 - * Therefore use in_nmi() to check whether particular bucket is in 5250 - * irq protected section. 5251 - * 5252 - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that 5253 - * this cpu was interrupted somewhere inside ___slab_alloc() after 5254 - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5255 - * In this case fast path with __update_cpu_freelist_fast() is not safe. 5256 6012 */ 5257 - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5258 - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 6013 + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5259 6014 5260 - if (PTR_ERR(ret) == -EBUSY) { 5261 - if (can_retry) { 5262 - /* pick the next kmalloc bucket */ 5263 - size = s->object_size + 1; 5264 - /* 5265 - * Another alternative is to 5266 - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5267 - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5268 - * to retry from bucket of the same size. 
5269 - */ 5270 - can_retry = false; 5271 - goto retry; 5272 - } 5273 - ret = NULL; 6015 + /* 6016 + * It's possible we failed due to trylock as we preempted someone with 6017 + * the sheaves locked, and the list_lock is also held by another cpu. 6018 + * But it should be rare that multiple kmalloc buckets would have 6019 + * sheaves locked, so try a larger one. 6020 + */ 6021 + if (!ret && can_retry) { 6022 + /* pick the next kmalloc bucket */ 6023 + size = s->object_size + 1; 6024 + /* 6025 + * Another alternative is to 6026 + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 6027 + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 6028 + * to retry from bucket of the same size. 6029 + */ 6030 + can_retry = false; 6031 + goto retry; 5274 6032 } 5275 6033 6034 + success: 5276 6035 maybe_wipe_obj_freeptr(s, ret); 5277 6036 slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, 5278 6037 slab_want_init_on_alloc(alloc_gfp, s), size); ··· 5351 6116 /* was on full list */ 5352 6117 remove_full(s, n, slab); 5353 6118 if (!slab_free) { 5354 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 6119 + add_partial(n, slab, ADD_TO_TAIL); 5355 6120 stat(s, FREE_ADD_PARTIAL); 5356 6121 } 5357 6122 } else if (slab_free) { ··· 5389 6154 unsigned long addr) 5390 6155 5391 6156 { 5392 - bool was_frozen, was_full; 6157 + bool was_full; 5393 6158 struct freelist_counters old, new; 5394 6159 struct kmem_cache_node *n = NULL; 5395 6160 unsigned long flags; 5396 6161 bool on_node_partial; 5397 6162 5398 - stat(s, FREE_SLOWPATH); 5399 - 5400 6163 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5401 6164 free_to_partial_list(s, slab, head, tail, cnt, addr); 5402 6165 return; 5403 6166 } 5404 - 5405 - /* 5406 - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5407 - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5408 - * is the only other reason it can be false, and it is already handled 5409 - * above. 5410 - */ 5411 6167 5412 6168 do { 5413 6169 if (unlikely(n)) { ··· 5410 6184 old.counters = slab->counters; 5411 6185 5412 6186 was_full = (old.freelist == NULL); 5413 - was_frozen = old.frozen; 5414 6187 5415 6188 set_freepointer(s, tail, old.freelist); 5416 6189 ··· 5422 6197 * to (due to not being full anymore) the partial list. 5423 6198 * Unless it's frozen. 5424 6199 */ 5425 - if ((!new.inuse || was_full) && !was_frozen) { 6200 + if (!new.inuse || was_full) { 6201 + 6202 + n = get_node(s, slab_nid(slab)); 5426 6203 /* 5427 - * If slab becomes non-full and we have cpu partial 5428 - * lists, we put it there unconditionally to avoid 5429 - * taking the list_lock. Otherwise we need it. 6204 + * Speculatively acquire the list_lock. 6205 + * If the cmpxchg does not succeed then we may 6206 + * drop the list_lock without any processing. 6207 + * 6208 + * Otherwise the list_lock will synchronize with 6209 + * other processors updating the list of slabs. 5430 6210 */ 5431 - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 6211 + spin_lock_irqsave(&n->list_lock, flags); 5432 6212 5433 - n = get_node(s, slab_nid(slab)); 5434 - /* 5435 - * Speculatively acquire the list_lock. 5436 - * If the cmpxchg does not succeed then we may 5437 - * drop the list_lock without any processing. 5438 - * 5439 - * Otherwise the list_lock will synchronize with 5440 - * other processors updating the list of slabs. 
5441 - */ 5442 - spin_lock_irqsave(&n->list_lock, flags); 5443 - 5444 - on_node_partial = slab_test_node_partial(slab); 5445 - } 6213 + on_node_partial = slab_test_node_partial(slab); 5446 6214 } 5447 6215 5448 6216 } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5449 6217 5450 6218 if (likely(!n)) { 5451 - 5452 - if (likely(was_frozen)) { 5453 - /* 5454 - * The list lock was not taken therefore no list 5455 - * activity can be necessary. 5456 - */ 5457 - stat(s, FREE_FROZEN); 5458 - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5459 - /* 5460 - * If we started with a full slab then put it onto the 5461 - * per cpu partial list. 5462 - */ 5463 - put_cpu_partial(s, slab, 1); 5464 - stat(s, CPU_PARTIAL_FREE); 5465 - } 5466 - 5467 6219 /* 5468 - * In other cases we didn't take the list_lock because the slab 5469 - * was already on the partial list and will remain there. 6220 + * We didn't take the list_lock because the slab was already on 6221 + * the partial list and will remain there. 5470 6222 */ 5471 - 5472 6223 return; 5473 6224 } 5474 6225 ··· 5466 6265 5467 6266 /* 5468 6267 * Objects left in the slab. If it was not on the partial list before 5469 - * then add it. This can only happen when cache has no per cpu partial 5470 - * list otherwise we would have put it there. 6268 + * then add it. 5471 6269 */ 5472 - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5473 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 6270 + if (unlikely(was_full)) { 6271 + add_partial(n, slab, ADD_TO_TAIL); 5474 6272 stat(s, FREE_ADD_PARTIAL); 5475 6273 } 5476 6274 spin_unlock_irqrestore(&n->list_lock, flags); ··· 5555 6355 * unlocked. 5556 6356 */ 5557 6357 static struct slub_percpu_sheaves * 5558 - __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) 6358 + __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, 6359 + bool allow_spin) 5559 6360 { 5560 6361 struct slab_sheaf *empty; 5561 6362 struct node_barn *barn; ··· 5564 6363 5565 6364 restart: 5566 6365 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 6366 + 6367 + /* Bootstrap or debug cache, back off */ 6368 + if (unlikely(!cache_has_sheaves(s))) { 6369 + local_unlock(&s->cpu_sheaves->lock); 6370 + return NULL; 6371 + } 5567 6372 5568 6373 barn = get_barn(s); 5569 6374 if (!barn) { ··· 5580 6373 put_fail = false; 5581 6374 5582 6375 if (!pcs->spare) { 5583 - empty = barn_get_empty_sheaf(barn); 6376 + empty = barn_get_empty_sheaf(barn, allow_spin); 5584 6377 if (empty) { 5585 6378 pcs->spare = pcs->main; 5586 6379 pcs->main = empty; ··· 5594 6387 return pcs; 5595 6388 } 5596 6389 5597 - empty = barn_replace_full_sheaf(barn, pcs->main); 6390 + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); 5598 6391 5599 6392 if (!IS_ERR(empty)) { 5600 6393 stat(s, BARN_PUT); ··· 5602 6395 return pcs; 5603 6396 } 5604 6397 5605 - if (PTR_ERR(empty) == -E2BIG) { 6398 + /* sheaf_flush_unused() doesn't support !allow_spin */ 6399 + if (PTR_ERR(empty) == -E2BIG && allow_spin) { 5606 6400 /* Since we got here, spare exists and is full */ 5607 6401 struct slab_sheaf *to_flush = pcs->spare; 5608 6402 ··· 5627 6419 5628 6420 alloc_empty: 5629 6421 local_unlock(&s->cpu_sheaves->lock); 6422 + 6423 + /* 6424 + * alloc_empty_sheaf() doesn't support !allow_spin and it's 6425 + * easier to fall back to freeing directly without sheaves 6426 + * than add the support (and to sheaf_flush_unused() above) 6427 + */ 6428 + if (!allow_spin) 6429 + return NULL; 5630 6430 
5631 6431 empty = alloc_empty_sheaf(s, GFP_NOWAIT); 5632 6432 if (empty) ··· 5678 6462 * The object is expected to have passed slab_free_hook() already. 5679 6463 */ 5680 6464 static __fastpath_inline 5681 - bool free_to_pcs(struct kmem_cache *s, void *object) 6465 + bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) 5682 6466 { 5683 6467 struct slub_percpu_sheaves *pcs; 5684 6468 ··· 5689 6473 5690 6474 if (unlikely(pcs->main->size == s->sheaf_capacity)) { 5691 6475 5692 - pcs = __pcs_replace_full_main(s, pcs); 6476 + pcs = __pcs_replace_full_main(s, pcs, allow_spin); 5693 6477 if (unlikely(!pcs)) 5694 6478 return false; 5695 6479 } ··· 5698 6482 5699 6483 local_unlock(&s->cpu_sheaves->lock); 5700 6484 5701 - stat(s, FREE_PCS); 6485 + stat(s, FREE_FASTPATH); 5702 6486 5703 6487 return true; 5704 6488 } ··· 5796 6580 struct slab_sheaf *empty; 5797 6581 struct node_barn *barn; 5798 6582 6583 + /* Bootstrap or debug cache, fall back */ 6584 + if (unlikely(!cache_has_sheaves(s))) { 6585 + local_unlock(&s->cpu_sheaves->lock); 6586 + goto fail; 6587 + } 6588 + 5799 6589 if (pcs->spare && pcs->spare->size == 0) { 5800 6590 pcs->rcu_free = pcs->spare; 5801 6591 pcs->spare = NULL; ··· 5814 6592 goto fail; 5815 6593 } 5816 6594 5817 - empty = barn_get_empty_sheaf(barn); 6595 + empty = barn_get_empty_sheaf(barn, true); 5818 6596 5819 6597 if (empty) { 5820 6598 pcs->rcu_free = empty; ··· 5934 6712 goto no_empty; 5935 6713 5936 6714 if (!pcs->spare) { 5937 - empty = barn_get_empty_sheaf(barn); 6715 + empty = barn_get_empty_sheaf(barn, true); 5938 6716 if (!empty) 5939 6717 goto no_empty; 5940 6718 ··· 5948 6726 goto do_free; 5949 6727 } 5950 6728 5951 - empty = barn_replace_full_sheaf(barn, pcs->main); 6729 + empty = barn_replace_full_sheaf(barn, pcs->main, true); 5952 6730 if (IS_ERR(empty)) { 5953 6731 stat(s, BARN_PUT_FAIL); 5954 6732 goto no_empty; ··· 5966 6744 5967 6745 local_unlock(&s->cpu_sheaves->lock); 5968 6746 5969 - stat_add(s, FREE_PCS, batch); 6747 + stat_add(s, FREE_FASTPATH, batch); 5970 6748 5971 6749 if (batch < size) { 5972 6750 p += batch; ··· 5988 6766 */ 5989 6767 fallback: 5990 6768 __kmem_cache_free_bulk(s, size, p); 6769 + stat_add(s, FREE_SLOWPATH, size); 5991 6770 5992 6771 flush_remote: 5993 6772 if (remote_nr) { 5994 6773 __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 6774 + stat_add(s, FREE_SLOWPATH, remote_nr); 5995 6775 if (i < size) { 5996 6776 remote_nr = 0; 5997 6777 goto next_remote_batch; ··· 6003 6779 6004 6780 struct defer_free { 6005 6781 struct llist_head objects; 6006 - struct llist_head slabs; 6007 6782 struct irq_work work; 6008 6783 }; 6009 6784 ··· 6010 6787 6011 6788 static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { 6012 6789 .objects = LLIST_HEAD_INIT(objects), 6013 - .slabs = LLIST_HEAD_INIT(slabs), 6014 6790 .work = IRQ_WORK_INIT(free_deferred_objects), 6015 6791 }; 6016 6792 6017 6793 /* 6018 6794 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe 6019 - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). 6795 + * to take sleeping spin_locks from __slab_free(). 6020 6796 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
6021 6797 */ 6022 6798 static void free_deferred_objects(struct irq_work *work) 6023 6799 { 6024 6800 struct defer_free *df = container_of(work, struct defer_free, work); 6025 6801 struct llist_head *objs = &df->objects; 6026 - struct llist_head *slabs = &df->slabs; 6027 6802 struct llist_node *llnode, *pos, *t; 6028 6803 6029 - if (llist_empty(objs) && llist_empty(slabs)) 6804 + if (llist_empty(objs)) 6030 6805 return; 6031 6806 6032 6807 llnode = llist_del_all(objs); ··· 6047 6826 set_freepointer(s, x, NULL); 6048 6827 6049 6828 __slab_free(s, slab, x, x, 1, _THIS_IP_); 6050 - } 6051 - 6052 - llnode = llist_del_all(slabs); 6053 - llist_for_each_safe(pos, t, llnode) { 6054 - struct slab *slab = container_of(pos, struct slab, llnode); 6055 - 6056 - if (slab->frozen) 6057 - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6058 - else 6059 - free_slab(slab->slab_cache, slab); 6829 + stat(s, FREE_SLOWPATH); 6060 6830 } 6061 6831 } 6062 6832 ··· 6064 6852 irq_work_queue(&df->work); 6065 6853 } 6066 6854 6067 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) 6068 - { 6069 - struct defer_free *df; 6070 - 6071 - slab->flush_freelist = flush_freelist; 6072 - 6073 - guard(preempt)(); 6074 - 6075 - df = this_cpu_ptr(&defer_free_objects); 6076 - if (llist_add(&slab->llnode, &df->slabs)) 6077 - irq_work_queue(&df->work); 6078 - } 6079 - 6080 6855 void defer_free_barrier(void) 6081 6856 { 6082 6857 int cpu; 6083 6858 6084 6859 for_each_possible_cpu(cpu) 6085 6860 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6086 - } 6087 - 6088 - /* 6089 - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6090 - * can perform fastpath freeing without additional function calls. 6091 - * 6092 - * The fastpath is only possible if we are freeing to the current cpu slab 6093 - * of this processor. This typically the case if we have just allocated 6094 - * the item before. 6095 - * 6096 - * If fastpath is not possible then fall back to __slab_free where we deal 6097 - * with all sorts of special processing. 6098 - * 6099 - * Bulk free of a freelist with several objects (all pointing to the 6100 - * same slab) possible by specifying head and tail ptr, plus objects 6101 - * count (cnt). Bulk free indicated by tail pointer being set. 6102 - */ 6103 - static __always_inline void do_slab_free(struct kmem_cache *s, 6104 - struct slab *slab, void *head, void *tail, 6105 - int cnt, unsigned long addr) 6106 - { 6107 - /* cnt == 0 signals that it's called from kfree_nolock() */ 6108 - bool allow_spin = cnt; 6109 - struct kmem_cache_cpu *c; 6110 - unsigned long tid; 6111 - void **freelist; 6112 - 6113 - redo: 6114 - /* 6115 - * Determine the currently cpus per cpu slab. 6116 - * The cpu may change afterward. However that does not matter since 6117 - * data is retrieved via this pointer. If we are on the same cpu 6118 - * during the cmpxchg then the free will succeed. 6119 - */ 6120 - c = raw_cpu_ptr(s->cpu_slab); 6121 - tid = READ_ONCE(c->tid); 6122 - 6123 - /* Same with comment on barrier() in __slab_alloc_node() */ 6124 - barrier(); 6125 - 6126 - if (unlikely(slab != c->slab)) { 6127 - if (unlikely(!allow_spin)) { 6128 - /* 6129 - * __slab_free() can locklessly cmpxchg16 into a slab, 6130 - * but then it might need to take spin_lock or local_lock 6131 - * in put_cpu_partial() for further processing. 6132 - * Avoid the complexity and simply add to a deferred list. 
6133 - */ 6134 - defer_free(s, head); 6135 - } else { 6136 - __slab_free(s, slab, head, tail, cnt, addr); 6137 - } 6138 - return; 6139 - } 6140 - 6141 - if (unlikely(!allow_spin)) { 6142 - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && 6143 - local_lock_is_locked(&s->cpu_slab->lock)) { 6144 - defer_free(s, head); 6145 - return; 6146 - } 6147 - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ 6148 - } 6149 - 6150 - if (USE_LOCKLESS_FAST_PATH()) { 6151 - freelist = READ_ONCE(c->freelist); 6152 - 6153 - set_freepointer(s, tail, freelist); 6154 - 6155 - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { 6156 - note_cmpxchg_failure("slab_free", s, tid); 6157 - goto redo; 6158 - } 6159 - } else { 6160 - __maybe_unused unsigned long flags = 0; 6161 - 6162 - /* Update the free list under the local lock */ 6163 - local_lock_cpu_slab(s, flags); 6164 - c = this_cpu_ptr(s->cpu_slab); 6165 - if (unlikely(slab != c->slab)) { 6166 - local_unlock_cpu_slab(s, flags); 6167 - goto redo; 6168 - } 6169 - tid = c->tid; 6170 - freelist = c->freelist; 6171 - 6172 - set_freepointer(s, tail, freelist); 6173 - c->freelist = head; 6174 - c->tid = next_tid(tid); 6175 - 6176 - local_unlock_cpu_slab(s, flags); 6177 - } 6178 - stat_add(s, FREE_FASTPATH, cnt); 6179 6861 } 6180 6862 6181 6863 static __fastpath_inline ··· 6082 6976 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6083 6977 return; 6084 6978 6085 - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6086 - slab_nid(slab) == numa_mem_id()) 6087 - && likely(!slab_test_pfmemalloc(slab))) { 6088 - if (likely(free_to_pcs(s, object))) 6979 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) 6980 + && likely(!slab_test_pfmemalloc(slab))) { 6981 + if (likely(free_to_pcs(s, object, true))) 6089 6982 return; 6090 6983 } 6091 6984 6092 - do_slab_free(s, slab, object, object, 1, addr); 6985 + __slab_free(s, slab, object, object, 1, addr); 6986 + stat(s, FREE_SLOWPATH); 6093 6987 } 6094 6988 6095 6989 #ifdef CONFIG_MEMCG ··· 6098 6992 void memcg_alloc_abort_single(struct kmem_cache *s, void *object) 6099 6993 { 6100 6994 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6101 - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); 6995 + __slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); 6102 6996 } 6103 6997 #endif 6104 6998 ··· 6112 7006 * With KASAN enabled slab_free_freelist_hook modifies the freelist 6113 7007 * to remove objects, whose reuse must be delayed. 
6114 7008 */ 6115 - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 6116 - do_slab_free(s, slab, head, tail, cnt, addr); 7009 + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { 7010 + __slab_free(s, slab, head, tail, cnt, addr); 7011 + stat_add(s, FREE_SLOWPATH, cnt); 7012 + } 6117 7013 } 6118 7014 6119 7015 #ifdef CONFIG_SLUB_RCU_DEBUG ··· 6140 7032 return; 6141 7033 6142 7034 /* resume freeing */ 6143 - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) 6144 - do_slab_free(s, slab, object, object, 1, _THIS_IP_); 7035 + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { 7036 + __slab_free(s, slab, object, object, 1, _THIS_IP_); 7037 + stat(s, FREE_SLOWPATH); 7038 + } 6145 7039 } 6146 7040 #endif /* CONFIG_SLUB_RCU_DEBUG */ 6147 7041 6148 7042 #ifdef CONFIG_KASAN_GENERIC 6149 7043 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 6150 7044 { 6151 - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); 7045 + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); 7046 + stat(cache, FREE_SLOWPATH); 6152 7047 } 6153 7048 #endif 6154 7049 ··· 6451 7340 * since kasan quarantine takes locks and not supported from NMI. 6452 7341 */ 6453 7342 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6454 - do_slab_free(s, slab, x, x, 0, _RET_IP_); 7343 + 7344 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { 7345 + if (likely(free_to_pcs(s, x, false))) 7346 + return; 7347 + } 7348 + 7349 + /* 7350 + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might 7351 + * need to take spin_lock for further processing. 7352 + * Avoid the complexity and simply add to a deferred list. 7353 + */ 7354 + defer_free(s, x); 6455 7355 } 6456 7356 EXPORT_SYMBOL_GPL(kfree_nolock); 6457 7357 ··· 6888 7766 if (kfence_free(df.freelist)) 6889 7767 continue; 6890 7768 6891 - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 7769 + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 6892 7770 _RET_IP_); 6893 7771 } while (likely(size)); 6894 7772 } ··· 6903 7781 * freeing to sheaves is so incompatible with the detached freelist so 6904 7782 * once we go that way, we have to do everything differently 6905 7783 */ 6906 - if (s && s->cpu_sheaves) { 7784 + if (s && cache_has_sheaves(s)) { 6907 7785 free_to_pcs_bulk(s, size, p); 6908 7786 return; 6909 7787 } ··· 6921 7799 } 6922 7800 EXPORT_SYMBOL(kmem_cache_free_bulk); 6923 7801 7802 + static unsigned int 7803 + __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7804 + unsigned int max, struct kmem_cache_node *n, 7805 + bool allow_spin) 7806 + { 7807 + struct partial_bulk_context pc; 7808 + struct slab *slab, *slab2; 7809 + unsigned int refilled = 0; 7810 + unsigned long flags; 7811 + void *object; 7812 + 7813 + pc.flags = gfp; 7814 + pc.min_objects = min; 7815 + pc.max_objects = max; 7816 + 7817 + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) 7818 + return 0; 7819 + 7820 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7821 + 7822 + list_del(&slab->slab_list); 7823 + 7824 + object = get_freelist_nofreeze(s, slab); 7825 + 7826 + while (object && refilled < max) { 7827 + p[refilled] = object; 7828 + object = get_freepointer(s, object); 7829 + maybe_wipe_obj_freeptr(s, p[refilled]); 7830 + 7831 + refilled++; 7832 + } 7833 + 7834 + /* 7835 + * Freelist had more objects than we can accommodate, we need to 7836 + * free them back. 
We can treat it like a detached freelist, just 7837 + * need to find the tail object. 7838 + */ 7839 + if (unlikely(object)) { 7840 + void *head = object; 7841 + void *tail; 7842 + int cnt = 0; 7843 + 7844 + do { 7845 + tail = object; 7846 + cnt++; 7847 + object = get_freepointer(s, object); 7848 + } while (object); 7849 + __slab_free(s, slab, head, tail, cnt, _RET_IP_); 7850 + } 7851 + 7852 + if (refilled >= max) 7853 + break; 7854 + } 7855 + 7856 + if (unlikely(!list_empty(&pc.slabs))) { 7857 + spin_lock_irqsave(&n->list_lock, flags); 7858 + 7859 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7860 + 7861 + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) 7862 + continue; 7863 + 7864 + list_del(&slab->slab_list); 7865 + add_partial(n, slab, ADD_TO_HEAD); 7866 + } 7867 + 7868 + spin_unlock_irqrestore(&n->list_lock, flags); 7869 + 7870 + /* any slabs left are completely free and for discard */ 7871 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7872 + 7873 + list_del(&slab->slab_list); 7874 + discard_slab(s, slab); 7875 + } 7876 + } 7877 + 7878 + return refilled; 7879 + } 7880 + 7881 + #ifdef CONFIG_NUMA 7882 + static unsigned int 7883 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7884 + unsigned int max) 7885 + { 7886 + struct zonelist *zonelist; 7887 + struct zoneref *z; 7888 + struct zone *zone; 7889 + enum zone_type highest_zoneidx = gfp_zone(gfp); 7890 + unsigned int cpuset_mems_cookie; 7891 + unsigned int refilled = 0; 7892 + 7893 + /* see get_from_any_partial() for the defrag ratio description */ 7894 + if (!s->remote_node_defrag_ratio || 7895 + get_cycles() % 1024 > s->remote_node_defrag_ratio) 7896 + return 0; 7897 + 7898 + do { 7899 + cpuset_mems_cookie = read_mems_allowed_begin(); 7900 + zonelist = node_zonelist(mempolicy_slab_node(), gfp); 7901 + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { 7902 + struct kmem_cache_node *n; 7903 + unsigned int r; 7904 + 7905 + n = get_node(s, zone_to_nid(zone)); 7906 + 7907 + if (!n || !cpuset_zone_allowed(zone, gfp) || 7908 + n->nr_partial <= s->min_partial) 7909 + continue; 7910 + 7911 + r = __refill_objects_node(s, p, gfp, min, max, n, 7912 + /* allow_spin = */ false); 7913 + refilled += r; 7914 + 7915 + if (r >= min) { 7916 + /* 7917 + * Don't check read_mems_allowed_retry() here - 7918 + * if mems_allowed was updated in parallel, that 7919 + * was a harmless race between allocation and 7920 + * the cpuset update 7921 + */ 7922 + return refilled; 7923 + } 7924 + p += r; 7925 + min -= r; 7926 + max -= r; 7927 + } 7928 + } while (read_mems_allowed_retry(cpuset_mems_cookie)); 7929 + 7930 + return refilled; 7931 + } 7932 + #else 7933 + static inline unsigned int 7934 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7935 + unsigned int max) 7936 + { 7937 + return 0; 7938 + } 7939 + #endif 7940 + 7941 + static unsigned int 7942 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7943 + unsigned int max) 7944 + { 7945 + int local_node = numa_mem_id(); 7946 + unsigned int refilled; 7947 + struct slab *slab; 7948 + 7949 + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) 7950 + return 0; 7951 + 7952 + refilled = __refill_objects_node(s, p, gfp, min, max, 7953 + get_node(s, local_node), 7954 + /* allow_spin = */ true); 7955 + if (refilled >= min) 7956 + return refilled; 7957 + 7958 + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, 7959 + max - refilled); 7960 + if 
(refilled >= min) 7961 + return refilled; 7962 + 7963 + new_slab: 7964 + 7965 + slab = new_slab(s, gfp, local_node); 7966 + if (!slab) 7967 + goto out; 7968 + 7969 + stat(s, ALLOC_SLAB); 7970 + 7971 + /* 7972 + * TODO: possible optimization - if we know we will consume the whole 7973 + * slab we might skip creating the freelist? 7974 + */ 7975 + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, 7976 + /* allow_spin = */ true); 7977 + 7978 + if (refilled < min) 7979 + goto new_slab; 7980 + 7981 + out: 7982 + return refilled; 7983 + } 7984 + 6924 7985 static inline 6925 7986 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 6926 7987 void **p) 6927 7988 { 6928 - struct kmem_cache_cpu *c; 6929 - unsigned long irqflags; 6930 7989 int i; 6931 7990 6932 - /* 6933 - * Drain objects in the per cpu slab, while disabling local 6934 - * IRQs, which protects against PREEMPT and interrupts 6935 - * handlers invoking normal fastpath. 6936 - */ 6937 - c = slub_get_cpu_ptr(s->cpu_slab); 6938 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7991 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 7992 + for (i = 0; i < size; i++) { 6939 7993 6940 - for (i = 0; i < size; i++) { 6941 - void *object = c->freelist; 6942 - 6943 - if (unlikely(!object)) { 6944 - /* 6945 - * We may have removed an object from c->freelist using 6946 - * the fastpath in the previous iteration; in that case, 6947 - * c->tid has not been bumped yet. 6948 - * Since ___slab_alloc() may reenable interrupts while 6949 - * allocating memory, we should bump c->tid now. 6950 - */ 6951 - c->tid = next_tid(c->tid); 6952 - 6953 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6954 - 6955 - /* 6956 - * Invoking slow path likely have side-effect 6957 - * of re-populating per CPU c->freelist 6958 - */ 6959 - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 6960 - _RET_IP_, c, s->object_size); 7994 + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, 7995 + s->object_size); 6961 7996 if (unlikely(!p[i])) 6962 7997 goto error; 6963 7998 6964 - c = this_cpu_ptr(s->cpu_slab); 6965 7999 maybe_wipe_obj_freeptr(s, p[i]); 6966 - 6967 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 6968 - 6969 - continue; /* goto for-loop */ 6970 8000 } 6971 - c->freelist = get_freepointer(s, object); 6972 - p[i] = object; 6973 - maybe_wipe_obj_freeptr(s, p[i]); 6974 - stat(s, ALLOC_FASTPATH); 8001 + } else { 8002 + i = refill_objects(s, p, flags, size, size); 8003 + if (i < size) 8004 + goto error; 8005 + stat_add(s, ALLOC_SLOWPATH, i); 6975 8006 } 6976 - c->tid = next_tid(c->tid); 6977 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6978 - slub_put_cpu_ptr(s->cpu_slab); 6979 8007 6980 8008 return i; 6981 8009 6982 8010 error: 6983 - slub_put_cpu_ptr(s->cpu_slab); 6984 8011 __kmem_cache_free_bulk(s, i, p); 6985 8012 return 0; 6986 8013 6987 8014 } 6988 8015 6989 - /* Note that interrupts must be enabled when calling this function. */ 8016 + /* 8017 + * Note that interrupts must be enabled when calling this function and gfp 8018 + * flags must allow spinning. 
8019 + */ 6990 8020 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 6991 8021 void **p) 6992 8022 { ··· 7166 7892 size--; 7167 7893 } 7168 7894 7169 - if (s->cpu_sheaves) 7170 - i = alloc_from_pcs_bulk(s, size, p); 7895 + i = alloc_from_pcs_bulk(s, flags, size, p); 7171 7896 7172 7897 if (i < size) { 7173 7898 /* ··· 7354 8081 barn_init(barn); 7355 8082 } 7356 8083 7357 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 8084 + #ifdef CONFIG_SLUB_STATS 8085 + static inline int alloc_kmem_cache_stats(struct kmem_cache *s) 7358 8086 { 7359 8087 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 7360 8088 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * 7361 - sizeof(struct kmem_cache_cpu)); 8089 + sizeof(struct kmem_cache_stats)); 7362 8090 7363 - /* 7364 - * Must align to double word boundary for the double cmpxchg 7365 - * instructions to work; see __pcpu_double_call_return_bool(). 7366 - */ 7367 - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 7368 - 2 * sizeof(void *)); 8091 + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); 7369 8092 7370 - if (!s->cpu_slab) 8093 + if (!s->cpu_stats) 7371 8094 return 0; 7372 - 7373 - init_kmem_cache_cpus(s); 7374 8095 7375 8096 return 1; 7376 8097 } 8098 + #endif 7377 8099 7378 8100 static int init_percpu_sheaves(struct kmem_cache *s) 7379 8101 { 8102 + static struct slab_sheaf bootstrap_sheaf = {}; 7380 8103 int cpu; 7381 8104 7382 8105 for_each_possible_cpu(cpu) { ··· 7382 8113 7383 8114 local_trylock_init(&pcs->lock); 7384 8115 7385 - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 8116 + /* 8117 + * Bootstrap sheaf has zero size so fast-path allocation fails. 8118 + * It has also size == s->sheaf_capacity, so fast-path free 8119 + * fails. In the slow paths we recognize the situation by 8120 + * checking s->sheaf_capacity. This allows fast paths to assume 8121 + * s->cpu_sheaves and pcs->main always exists and are valid. 8122 + * It's also safe to share the single static bootstrap_sheaf 8123 + * with zero-sized objects array as it's never modified. 8124 + * 8125 + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we 8126 + * recognize it and not attempt to free it when destroying the 8127 + * cache. 8128 + * 8129 + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, 8130 + * caches with debug enabled, and all caches with SLUB_TINY. 8131 + * For kmalloc caches it's used temporarily during the initial 8132 + * bootstrap. 8133 + */ 8134 + if (!s->sheaf_capacity) 8135 + pcs->main = &bootstrap_sheaf; 8136 + else 8137 + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7386 8138 7387 8139 if (!pcs->main) 7388 8140 return -ENOMEM; ··· 7454 8164 * No locks need to be taken here as it has just been 7455 8165 * initialized and there is no concurrent access. 
···
7454 8164 * No locks need to be taken here as it has just been
7455 8165 * initialized and there is no concurrent access.
7456 8166 */
7457 - __add_partial(n, slab, DEACTIVATE_TO_HEAD);
8167 + __add_partial(n, slab, ADD_TO_HEAD);
7458 8168 }
7459 8169
7460 8170 static void free_kmem_cache_nodes(struct kmem_cache *s)
···
7478 8188 void __kmem_cache_release(struct kmem_cache *s)
7479 8189 {
7480 8190 cache_random_seq_destroy(s);
7481 - if (s->cpu_sheaves)
7482 - pcs_destroy(s);
7483 - #ifdef CONFIG_PREEMPT_RT
7484 - if (s->cpu_slab)
7485 - lockdep_unregister_key(&s->lock_key);
8191 + pcs_destroy(s);
8192 + #ifdef CONFIG_SLUB_STATS
8193 + free_percpu(s->cpu_stats);
7486 8194 #endif
7487 - free_percpu(s->cpu_slab);
7488 8195 free_kmem_cache_nodes(s);
7489 8196 }
···
7498 8211 continue;
7499 8212 }
7500 8213
7501 - if (s->cpu_sheaves) {
8214 + if (cache_has_sheaves(s)) {
7502 8215 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
7503 8216
7504 8217 if (!barn)
···
7519 8232 return 1;
7520 8233 }
7521 8234
7522 - static void set_cpu_partial(struct kmem_cache *s)
8235 + static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
8236 + struct kmem_cache_args *args)
8237 +
7523 8238 {
7524 - #ifdef CONFIG_SLUB_CPU_PARTIAL
7525 - unsigned int nr_objects;
8239 + unsigned int capacity;
8240 + size_t size;
8241 +
8242 +
8243 + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS)
8244 + return 0;
7526 8245
7527 8246 /*
7528 - * cpu_partial determined the maximum number of objects kept in the
7529 - * per cpu partial lists of a processor.
7530 - *
7531 - * Per cpu partial lists mainly contain slabs that just have one
7532 - * object freed. If they are used for allocation then they can be
7533 - * filled up again with minimal effort. The slab will never hit the
7534 - * per node partial lists and therefore no locking will be required.
7535 - *
7536 - * For backwards compatibility reasons, this is determined as number
7537 - * of objects, even though we now limit maximum number of pages, see
7538 - * slub_set_cpu_partial()
8247 + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
8248 + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
8249 + * have sheaves to avoid recursion when sheaf allocation triggers
8250 + * kmemleak tracking.
7539 8251 */
7540 - if (!kmem_cache_has_cpu_partial(s))
7541 - nr_objects = 0;
7542 - else if (s->size >= PAGE_SIZE)
7543 - nr_objects = 6;
7544 - else if (s->size >= 1024)
7545 - nr_objects = 24;
7546 - else if (s->size >= 256)
7547 - nr_objects = 52;
7548 - else
7549 - nr_objects = 120;
8252 + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
8253 + return 0;
7550 8254
7551 - slub_set_cpu_partial(s, nr_objects);
7552 - #endif
8255 + /*
8256 + * For now we use a roughly similar formula (divided by two as there
8257 + * are two percpu sheaves) to what was used for percpu partial slabs,
8258 + * which should result in similar lock contention (barn or list_lock)
8259 + */
8260 + if (s->size >= PAGE_SIZE)
8261 + capacity = 4;
8262 + else if (s->size >= 1024)
8263 + capacity = 12;
8264 + else if (s->size >= 256)
8265 + capacity = 26;
8266 + else
8267 + capacity = 60;
8268 +
8269 + /* Increment capacity to make sheaf exactly a kmalloc size bucket */
8270 + size = struct_size_t(struct slab_sheaf, objects, capacity);
8271 + size = kmalloc_size_roundup(size);
8272 + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *);
8273 +
8274 + /*
8275 + * Respect an explicit request for capacity, typically motivated by the
8276 + * expected maximum size of kmem_cache_prefill_sheaf() requests, so we
8277 + * don't end up using low-performance oversize sheaves
8278 + */
8279 + return max(capacity, args->sheaf_capacity);
7553 8280 }
7554 8281
7555 8282 /*
···
7711 8410 s->allocflags |= __GFP_RECLAIMABLE;
7712 8411
7713 8412 /*
8413 + * For KMALLOC_NORMAL caches we enable sheaves later by
8414 + * bootstrap_kmalloc_sheaves() to avoid recursion
8415 + */
8416 + if (!is_kmalloc_normal(s))
8417 + s->sheaf_capacity = calculate_sheaf_capacity(s, args);
8418 +
8419 + /*
7714 8420 * Determine the number of objects per slab
7715 8421 */
7716 8422 s->oo = oo_make(order, size);
···
7801 8493 flush_all_cpus_locked(s);
7802 8494
7803 8495 /* we might have rcu sheaves in flight */
7804 - if (s->cpu_sheaves)
8496 + if (cache_has_sheaves(s))
7805 8497 rcu_barrier();
7806 8498
7807 8499 /* Attempt to free all objects */
···
8113 8805 if (get_node(s, nid))
8114 8806 continue;
8115 8807
8116 - if (s->cpu_sheaves) {
8808 + if (cache_has_sheaves(s)) {
8117 8809 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
8118 8810
8119 8811 if (!barn) {
···
8188 8880
8189 8881 memcpy(s, static_cache, kmem_cache->object_size);
8190 8882
8191 - /*
8192 - * This runs very early, and only the boot processor is supposed to be
8193 - * up. Even if it weren't true, IRQs are not up so we couldn't fire
8194 - * IPIs around.
8195 - */
8196 - __flush_cpu_slab(s, smp_processor_id());
8197 8883 for_each_kmem_cache_node(s, node, n) {
8198 8884 struct slab *p;
8199 8885
···
8201 8899 }
8202 8900 list_add(&s->list, &slab_caches);
8203 8901 return s;
8902 + }
8903 +
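To make the kmalloc-bucket rounding in calculate_sheaf_capacity() above concrete, here is a worked example; the struct slab_sheaf header size is an assumption for illustration (8-byte pointers, 64-bit), only the mechanism is taken from the code:

	/*
	 * Worked example (assumed 48-byte struct slab_sheaf header):
	 *
	 *   s->size < 256            -> capacity = 60
	 *   struct_size(..., 60)     -> 48 + 60 * 8 = 528 bytes
	 *   kmalloc_size_roundup()   -> 1024 (next kmalloc bucket above 512)
	 *   final capacity           -> (1024 - 48) / 8 = 122 objects
	 */

Rounding the capacity up this way means the sheaf fills its kmalloc bucket completely instead of wasting almost half of it.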
8904 + /*
8905 + * Finish the sheaves initialization done normally by init_percpu_sheaves() and
8906 + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it
8907 + * since sheaves and barns are allocated by kmalloc.
8908 + */
8909 + static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
8910 + {
8911 + struct kmem_cache_args empty_args = {};
8912 + unsigned int capacity;
8913 + bool failed = false;
8914 + int node, cpu;
8915 +
8916 + capacity = calculate_sheaf_capacity(s, &empty_args);
8917 +
8918 + /* capacity can be 0 due to debugging or SLUB_TINY */
8919 + if (!capacity)
8920 + return;
8921 +
8922 + for_each_node_mask(node, slab_nodes) {
8923 + struct node_barn *barn;
8924 +
8925 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
8926 +
8927 + if (!barn) {
8928 + failed = true;
8929 + goto out;
8930 + }
8931 +
8932 + barn_init(barn);
8933 + get_node(s, node)->barn = barn;
8934 + }
8935 +
8936 + for_each_possible_cpu(cpu) {
8937 + struct slub_percpu_sheaves *pcs;
8938 +
8939 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
8940 +
8941 + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
8942 +
8943 + if (!pcs->main) {
8944 + failed = true;
8945 + break;
8946 + }
8947 + }
8948 +
8949 + out:
8950 + /*
8951 + * It's still early in boot, so treat this the same as a failure to
8952 + * create the kmalloc cache in the first place
8953 + */
8954 + if (failed)
8955 + panic("Out of memory when creating kmem_cache %s\n", s->name);
8956 +
8957 + s->sheaf_capacity = capacity;
8958 + }
8959 +
8960 + static void __init bootstrap_kmalloc_sheaves(void)
8961 + {
8962 + enum kmalloc_cache_type type;
8963 +
8964 + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
8965 + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
8966 + if (kmalloc_caches[type][idx])
8967 + bootstrap_cache_sheaves(kmalloc_caches[type][idx]);
8968 + }
8969 + }
8204 8970 }
8205 8971
8206 8972 void __init kmem_cache_init(void)
···
8313 8943 /* Now we can use the kmem_cache to allocate kmalloc slabs */
8314 8944 setup_kmalloc_cache_index_table();
8315 8945 create_kmalloc_caches();
8946 +
8947 + bootstrap_kmalloc_sheaves();
8316 8948
8317 8949 /* Setup random freelists for each cache */
8318 8950 init_freelist_randomization();
···
8383 9011 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
8384 9012 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
8385 9013
8386 - set_cpu_partial(s);
8387 -
8388 - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
8389 - && !(s->flags & SLAB_DEBUG_FLAGS)) {
8390 - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
8391 - if (!s->cpu_sheaves) {
8392 - err = -ENOMEM;
8393 - goto out;
8394 - }
8395 - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
8396 - s->sheaf_capacity = args->sheaf_capacity;
9014 + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
9015 + if (!s->cpu_sheaves) {
9016 + err = -ENOMEM;
9017 + goto out;
8397 9018 }
8398 9019
8399 9020 #ifdef CONFIG_NUMA
···
8402 9037 if (!init_kmem_cache_nodes(s))
8403 9038 goto out;
8404 9039
8405 - if (!alloc_kmem_cache_cpus(s))
9040 + #ifdef CONFIG_SLUB_STATS
9041 + if (!alloc_kmem_cache_stats(s))
8406 9042 goto out;
9043 + #endif
8407 9044
8408 - if (s->cpu_sheaves) {
8409 - err = init_percpu_sheaves(s);
8410 - if (err)
8411 - goto out;
8412 - }
9045 + err = init_percpu_sheaves(s);
9046 + if (err)
9047 + goto out;
8413 9048
8414 9049 err = 0;
8415 9050
···
8724 9359 if (!nodes)
8725 9360 return -ENOMEM;
8726 9361
8727 - if (flags & SO_CPU) {
8728 - int cpu;
8729 -
8730 - for_each_possible_cpu(cpu) {
8731 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
8732 - cpu);
8733 - int node;
8734 - struct slab *slab;
8735 -
8736 - slab = READ_ONCE(c->slab);
8737 - if (!slab)
8738 - continue;
8739 -
8740 - node = slab_nid(slab);
8741 - if (flags & SO_TOTAL)
8742 - x = slab->objects;
8743 - else if (flags & SO_OBJECTS)
8744 - x = slab->inuse;
8745 - else
8746 - x = 1;
8747 -
8748 - total += x;
8749 - nodes[node] += x;
8750 -
8751 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8752 - slab = slub_percpu_partial_read_once(c);
8753 - if (slab) {
8754 - node = slab_nid(slab);
8755 - if (flags & SO_TOTAL)
8756 - WARN_ON_ONCE(1);
8757 - else if (flags & SO_OBJECTS)
8758 - WARN_ON_ONCE(1);
8759 - else
8760 - x = data_race(slab->slabs);
8761 - total += x;
8762 - nodes[node] += x;
8763 - }
8764 - #endif
8765 - }
8766 - }
8767 -
8768 9362 /*
8769 9363 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
8770 9364 * already held which will conflict with an existing lock order:
···
8855 9531
8856 9532 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
8857 9533 {
8858 - unsigned int nr_partial = 0;
8859 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8860 - nr_partial = s->cpu_partial;
8861 - #endif
8862 -
8863 - return sysfs_emit(buf, "%u\n", nr_partial);
9534 + return sysfs_emit(buf, "0\n");
8864 9535 }
8865 9536
8866 9537 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
···
8867 9548 err = kstrtouint(buf, 10, &objects);
8868 9549 if (err)
8869 9550 return err;
8870 - if (objects && !kmem_cache_has_cpu_partial(s))
9551 + if (objects)
8871 9552 return -EINVAL;
8872 9553
8873 - slub_set_cpu_partial(s, objects);
8874 - flush_all(s);
8875 9554 return length;
8876 9555 }
8877 9556 SLAB_ATTR(cpu_partial);
···
8908 9591
8909 9592 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
8910 9593 {
8911 - int objects = 0;
8912 - int slabs = 0;
8913 - int cpu __maybe_unused;
8914 - int len = 0;
8915 -
8916 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8917 - for_each_online_cpu(cpu) {
8918 - struct slab *slab;
8919 -
8920 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
8921 -
8922 - if (slab)
8923 - slabs += data_race(slab->slabs);
8924 - }
8925 - #endif
8926 -
8927 - /* Approximate half-full slabs, see slub_set_cpu_partial() */
8928 - objects = (slabs * oo_objects(s->oo)) / 2;
8929 - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
8930 -
8931 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8932 - for_each_online_cpu(cpu) {
8933 - struct slab *slab;
8934 -
8935 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
8936 - if (slab) {
8937 - slabs = data_race(slab->slabs);
8938 - objects = (slabs * oo_objects(s->oo)) / 2;
8939 - len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
len, " C%d=%d(%d)", 8940 - cpu, objects, slabs); 8941 - } 8942 - } 8943 - #endif 8944 - len += sysfs_emit_at(buf, len, "\n"); 8945 - 8946 - return len; 9594 + return sysfs_emit(buf, "0(0)\n"); 8947 9595 } 8948 9596 SLAB_ATTR_RO(slabs_cpu_partial); 8949 9597 ··· 9094 9812 return -ENOMEM; 9095 9813 9096 9814 for_each_online_cpu(cpu) { 9097 - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 9815 + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; 9098 9816 9099 9817 data[cpu] = x; 9100 9818 sum += x; ··· 9120 9838 int cpu; 9121 9839 9122 9840 for_each_online_cpu(cpu) 9123 - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 9841 + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; 9124 9842 } 9125 9843 9126 9844 #define STAT_ATTR(si, text) \ ··· 9138 9856 } \ 9139 9857 SLAB_ATTR(text); \ 9140 9858 9141 - STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); 9142 9859 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 9143 9860 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 9144 - STAT_ATTR(FREE_PCS, free_cpu_sheaf); 9145 9861 STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 9146 9862 STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 9147 9863 STAT_ATTR(FREE_FASTPATH, free_fastpath); 9148 9864 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 9149 - STAT_ATTR(FREE_FROZEN, free_frozen); 9150 9865 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 9151 9866 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 9152 - STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 9153 9867 STAT_ATTR(ALLOC_SLAB, alloc_slab); 9154 - STAT_ATTR(ALLOC_REFILL, alloc_refill); 9155 9868 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 9156 9869 STAT_ATTR(FREE_SLAB, free_slab); 9157 - STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 9158 - STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 9159 - STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 9160 - STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 9161 - STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 9162 - STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 9163 - STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 9164 9870 STAT_ATTR(ORDER_FALLBACK, order_fallback); 9165 - STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 9166 9871 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 9167 - STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 9168 - STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 9169 - STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 9170 - STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 9171 9872 STAT_ATTR(SHEAF_FLUSH, sheaf_flush); 9172 9873 STAT_ATTR(SHEAF_REFILL, sheaf_refill); 9173 9874 STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); ··· 9226 9961 &remote_node_defrag_ratio_attr.attr, 9227 9962 #endif 9228 9963 #ifdef CONFIG_SLUB_STATS 9229 - &alloc_cpu_sheaf_attr.attr, 9230 9964 &alloc_fastpath_attr.attr, 9231 9965 &alloc_slowpath_attr.attr, 9232 - &free_cpu_sheaf_attr.attr, 9233 9966 &free_rcu_sheaf_attr.attr, 9234 9967 &free_rcu_sheaf_fail_attr.attr, 9235 9968 &free_fastpath_attr.attr, 9236 9969 &free_slowpath_attr.attr, 9237 - &free_frozen_attr.attr, 9238 9970 &free_add_partial_attr.attr, 9239 9971 &free_remove_partial_attr.attr, 9240 - &alloc_from_partial_attr.attr, 9241 9972 &alloc_slab_attr.attr, 9242 - &alloc_refill_attr.attr, 9243 9973 &alloc_node_mismatch_attr.attr, 9244 9974 &free_slab_attr.attr, 9245 - &cpuslab_flush_attr.attr, 9246 - &deactivate_full_attr.attr, 9247 - &deactivate_empty_attr.attr, 9248 - &deactivate_to_head_attr.attr, 9249 - &deactivate_to_tail_attr.attr, 9250 - &deactivate_remote_frees_attr.attr, 9251 - &deactivate_bypass_attr.attr, 9252 9975 
9253 9976 &cmpxchg_double_fail_attr.attr,
9254 - &cmpxchg_double_cpu_fail_attr.attr,
9255 - &cpu_partial_alloc_attr.attr,
9256 - &cpu_partial_free_attr.attr,
9257 - &cpu_partial_node_attr.attr,
9258 - &cpu_partial_drain_attr.attr,
9259 9977 &sheaf_flush_attr.attr,
9260 9978 &sheaf_refill_attr.attr,
9261 9979 &sheaf_alloc_attr.attr,